/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License, Version 1.0 only
 * (the "License"). You may not use this file except in compliance
 * with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * IP interface to squeues.
 *
 * IP creates an squeue instance for each CPU. The squeue pointer is saved in
 * the cpu_squeue field of the cpu structure. Each squeue is associated with a
 * connection instance (conn_t).
 *
 * For CPUs available at system startup time, squeue creation and association
 * with the CPU happens at MP initialization time. For CPUs added during
 * dynamic reconfiguration, the initialization happens when the new CPU is
 * configured in the system. The squeue is chosen using the IP_SQUEUE_GET
 * macro, which returns either the per-CPU squeue or a random squeue based on
 * the ip_squeue_fanout variable.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * at open time or at accept time). The second mode associates each connection
 * with a random CPU, effectively distributing load over all CPUs and all
 * squeues in the system. The mode is controlled by the ip_squeue_fanout
 * variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue, and between each squeue and CPU, does not mean that each connection
 * is always processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * The list of all created squeues is kept in the squeue_set structure. This
 * list is used when ip_squeue_fanout is set and the load is distributed
 * across all squeues.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(hint)
 *
 *	Find an squeue based on the 'hint' value. The hint is used as an index
 *	in the array of IP squeues available. The way hint is computed may
 *	affect the effectiveness of the squeue distribution. Currently squeues
 *	are assigned in round-robin fashion using lbolt as a hint.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a callback function with the CPU DR
 * subsystem using register_cpu_setup_func(). The callback function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_bind: if set to 1, each squeue worker thread is bound to the CPU
 *	associated with an squeue instance.
 *
 * ip_squeue_profile: if set to 1, squeue profiling is enabled. NOTE: squeue.c
 *	should be compiled with SQUEUE_PROFILE enabled for this variable to
 *	have an impact.
 *
 * ip_squeue_fanout: if set to 1, use ip_squeue_get() to find an squeue,
 *	otherwise get it from CPU->cpu_squeue.
 *
 * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
 * changed using ndd on /dev/tcp or /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */
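
/*
 * Editorial illustration (not part of the original source, and not the actual
 * ip.h definition of IP_SQUEUE_GET): a sketch of how a caller is expected to
 * pick an squeue under the two modes described above. The helper name
 * pick_squeue_for_conn() is hypothetical; it assumes the per-CPU default
 * squeue is the first entry of the CPU's squeue set, which is how
 * ip_squeue_set_create() below arranges things.
 *
 *	squeue_t *
 *	pick_squeue_for_conn(void)
 *	{
 *		if (ip_squeue_fanout)
 *			return (ip_squeue_random(lbolt));
 *		return (CPU->cpu_squeue_set->sqs_list[0]);
 *	}
 *
 * With ip_squeue_fanout clear, a connection stays with the default squeue of
 * the CPU that created it; with it set, connections are spread over all
 * squeues in sqset_global_list.
 */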

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>

#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <inet/ip_if.h>
#include <inet/mi.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/types.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/ddi.h>
#include <sys/squeue_impl.h>


/*
 * We allow multiple NICs to bind to the same CPU, but want to preserve the
 * 1 <-> 1 mapping between squeue and NIC (or Rx ring) for performance
 * reasons, so that each squeue can uniquely own a NIC or a Rx ring and do
 * polling (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues
 * per CPU. We start by creating MIN_SQUEUES_PER_CPU squeues per CPU, but
 * more squeues can be created dynamically as needed.
 */
#define	MAX_SQUEUES_PER_CPU	32
#define	MIN_SQUEUES_PER_CPU	1
uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;

#define	IP_NUM_SOFT_RINGS	2
uint_t	ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;

/*
 * List of all created squeue sets. The size is protected by cpu_lock.
 */
squeue_set_t	**sqset_global_list;
uint_t		sqset_global_size;

int ip_squeue_bind = B_TRUE;
int ip_squeue_profile = B_TRUE;
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);

static void ip_squeue_set_bind(squeue_set_t *);
static void ip_squeue_set_unbind(squeue_set_t *);
static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
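
/*
 * Editorial example (values assumed for illustration): on a 4-CPU system with
 * the default ip_squeues_per_cpu of 1, sqset_global_list ends up holding four
 * squeue_set_t pointers, one per CPU, each starting out with a single DEFAULT
 * squeue bound to that CPU. Rx ring binding below can grow a set up to
 * MAX_SQUEUES_PER_CPU squeues. The default can also be raised at boot time,
 * e.g. via /etc/system, assuming the usual syntax for ip module globals:
 *
 *	set ip:ip_squeues_per_cpu = 2
 */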

/*
 * Create squeue set containing ip_squeues_per_cpu number of squeues
 * for this CPU and bind them all to the CPU.
 */
static squeue_set_t *
ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
{
	int i;
	squeue_set_t	*sqs;
	squeue_t	*sqp;
	char		sqname[64];
	processorid_t	id = cp->cpu_id;

	if (reuse) {
		int i;

		/*
		 * We may already have an squeue created for this CPU. Try to
		 * find one and reuse it if possible.
		 */
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			if (id == sqs->sqs_bind)
				return (sqs);
		}
	}

	sqs = kmem_zalloc(sizeof (squeue_set_t) +
	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
	sqs->sqs_list = (squeue_t **)&sqs[1];
	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
	sqs->sqs_bind = id;

	for (i = 0; i < ip_squeues_per_cpu; i++) {
		bzero(sqname, sizeof (sqname));

		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
		    cp->cpu_id, i);

		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
		    minclsyspri);

		/*
		 * The first squeue in each squeue_set is the DEFAULT
		 * squeue.
		 */
		sqp->sq_state |= SQS_DEFAULT;

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);
	}

	if (ip_squeue_bind && cpu_is_online(cp))
		ip_squeue_set_bind(sqs);

	sqset_global_list[sqset_global_size++] = sqs;
	ASSERT(sqset_global_size <= NCPU);
	return (sqs);
}

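/*
 * Editorial note on the name format generated above: a CPU with cpu_seqid 2
 * and cpu_id 6 gets its default squeue named "ip_squeue_cpu_2/6/0"; the
 * trailing index counts additional squeues created for the same CPU by
 * ip_find_unused_squeue() below. ip_squeue_init() calls this function for
 * every CPU present at boot; ip_squeue_cpu_setup() calls it with reuse set
 * to B_TRUE when a CPU is configured or brought online later.
 */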

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}

/*
 * Get squeue_t structure based on index.
 * Since the squeue list can only grow, no need to grab any lock.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs;

	sqs = sqset_global_list[index % sqset_global_size];
	return (sqs->sqs_list[index % sqs->sqs_size]);
}

/* ARGSUSED */
void
ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
{
	squeue_t	*sqp = arg2;
	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
	ill_t		*ill;

	ASSERT(sqp != NULL);

	if (ring == NULL) {
		return;
	}

	/*
	 * Clean up squeue
	 */
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
	sqp->sq_rx_ring = NULL;
	mutex_exit(&sqp->sq_lock);

	ill = ring->rr_ill;
	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
		ASSERT(ring->rr_handle != NULL);
		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
	}

	/*
	 * Cleanup the ring
	 */

	ring->rr_blank = NULL;
	ring->rr_handle = NULL;
	ring->rr_sqp = NULL;

	/*
	 * Signal ill that cleanup is done
	 */
	mutex_enter(&ill->ill_lock);
	ring->rr_ring_state = ILL_RING_FREE;
	cv_signal(&ill->ill_cv);
	mutex_exit(&ill->ill_lock);
}

typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
	cpu_t		*ip_taskq_cpu;
} ip_taskq_arg_t;

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t	*sqs;
	squeue_t	*sqp = NULL;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu))
		intr_cpu = CPU;

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
	if (sqp == NULL) {
		/*
		 * We hit the max limit of squeues allowed per CPU.
		 * Assign this rx_ring to the DEFAULT squeue of the
		 * interrupted CPU but the squeue will not manage
		 * the ring. Also print a warning.
		 */
		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
		    "has max number of squeues. System performance might "
		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);

		/* the first squeue in the list is the default squeue */
		sqp = sqs->sqs_list[0];
		ASSERT(sqp != NULL);
		ill_rx_ring->rr_sqp = sqp;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

		mutex_exit(&ill->ill_lock);
		ill_waiter_dcr(ill);
		return;
	}

	ASSERT(MUTEX_HELD(&sqp->sq_lock));
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

/*
 * Do a Rx ring to squeue binding. Find a unique squeue that is not
 * managing a receive ring. If no such squeue exists, dynamically
 * create a new one in the squeue set.
 *
 * The function runs via the system taskq. The ill passed as an
 * argument can't go away since we hold a ref. The lock order is
 * ill_lock -> sqs_lock -> sq_lock.
 *
 * If we are binding a Rx ring to a squeue attached to an offline CPU,
 * there is no need to check for that, because squeues are never
 * destroyed once created.
 */
/* ARGSUSED */
static void
ip_squeue_soft_ring_affinity(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	cpu_t		*bind_cpu;
	int		cpu_id = intr_cpu->cpu_id;
	int		min_cpu_id, max_cpu_id;
	boolean_t	enough_uniq_cpus = B_FALSE;
	boolean_t	enough_cpus = B_FALSE;
	squeue_set_t	*sqs, *last_sqs;
	squeue_t	*sqp = NULL;
	int		i, j;

	ASSERT(ill != NULL);
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	/*
	 * Make sure the CPU that originally took the interrupt still
	 * exists.
	 */
	if (!CPU_ISON(intr_cpu)) {
		intr_cpu = CPU;
		cpu_id = intr_cpu->cpu_id;
	}

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);

	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	/*
	 * We need to fanout the interrupts from the NIC. We do that by
	 * telling the driver underneath to create soft rings and use
	 * worker threads (if the driver advertised the SOFT_RING
	 * capability). It is still a big performance win if we can fan
	 * out to the threads on the same core that is taking interrupts.
	 *
	 * Since we don't know the interrupt to CPU binding, we don't
	 * assign any squeues or affinity to worker threads in the NIC.
	 * At the time of the first interrupt, we know which CPU is
	 * taking interrupts and try to find other threads on the same
	 * core. Assuming ip_threads_per_cpu is correct and CPUs are
	 * numbered sequentially for each core (XXX need something better
	 * than this in future), find the lowest numbered and highest
	 * numbered thread for that core. For example, with
	 * ip_threads_per_cpu at 4 and an interrupt on CPU 6, the core
	 * spans CPU ids 4 (inclusive) through 8 (exclusive).
	 *
	 * If we have one more thread per core than the number of soft
	 * rings, then don't assign any worker threads to the H/W thread
	 * (cpu) taking interrupts (capability negotiation tries to
	 * ensure this).
	 *
	 * If the number of threads per core is the same as the number
	 * of soft rings, then assign the worker affinity and squeue to
	 * the same cpu.
	 *
	 * Otherwise, just fanout to higher numbered CPUs starting from
	 * the interrupted CPU.
	 */

	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
	max_cpu_id = min_cpu_id + ip_threads_per_cpu;

	cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
	    min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);

	/*
	 * Quickly check if there are enough CPUs present for fanout
	 * and also that max_cpu_id does not exceed the highest CPU id
	 * for which a squeue set exists. We use the cpu_id stored in
	 * the last squeue_set to get an idea. The scheme is by no means
	 * perfect since it doesn't take into account CPU DR operations
	 * and the fact that interrupts themselves might change. An ideal
	 * scenario would be to ensure that interrupts run on CPUs by
	 * themselves and worker threads never have affinity to those
	 * CPUs. If the interrupts move to a CPU which had a worker
	 * thread, it should be changed. Probably callbacks similar to
	 * CPU offline are needed to make it work perfectly.
	 */
	last_sqs = sqset_global_list[sqset_global_size - 1];
	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
		if ((max_cpu_id - min_cpu_id) >
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_uniq_cpus = B_TRUE;
		else if ((max_cpu_id - min_cpu_id) >=
		    ill_soft_ring->ill_dls_soft_ring_cnt)
			enough_cpus = B_TRUE;
	}

	j = 0;
	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
		if (enough_uniq_cpus) {
			if ((min_cpu_id + i) == cpu_id) {
				j++;
				continue;
			}
			bind_cpu = cpu[min_cpu_id + i];
		} else if (enough_cpus) {
			bind_cpu = cpu[min_cpu_id + i];
		} else {
			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
			bind_cpu = cpu[(cpu_id + i) % ncpus];
		}

		/*
		 * Check that the CPU actually exists and is active. If
		 * not, use the interrupted CPU. ip_find_unused_squeue()
		 * will find the right CPU to fanout anyway.
		 */
		if (!CPU_ISON(bind_cpu))
			bind_cpu = intr_cpu;

		sqs = bind_cpu->cpu_squeue_set;
		ASSERT(sqs != NULL);
		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];

		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
		if (sqp == NULL) {
			/*
			 * We hit the max limit of squeues allowed per CPU.
			 * Assign this rx_ring to the DEFAULT squeue of the
			 * interrupted CPU but the squeue will not manage
			 * the ring. Also print a warning.
			 */
			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
			    "%d/%p already has max number of squeues. "
			    "System performance might become suboptimal\n",
			    sqs->sqs_bind, (void *)sqs);

			/* the first squeue in the list is the default squeue */
			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
			ASSERT(sqp != NULL);

			ill_rx_ring->rr_sqp = sqp;
			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
			continue;
		}
		ASSERT(MUTEX_HELD(&sqp->sq_lock));
		ill_rx_ring->rr_sqp = sqp;
		sqp->sq_rx_ring = ill_rx_ring;
		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
		sqp->sq_state |= SQS_ILL_BOUND;

		/* assign affinity to soft ring */
		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
			    sqp->sq_bind);
		}
		mutex_exit(&sqp->sq_lock);

		cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
		    i - j, sqp->sq_bind);
	}
	mutex_exit(&ill->ill_lock);

	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
	    SOFT_RING_SRC_HASH);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}

void
ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
    mblk_t *mp_chain, size_t hdrlen)
{
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	ASSERT(servicing_interrupt());
	ASSERT(ip_ring == NULL);

	mutex_enter(&ill->ill_lock);
	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
		taskq_arg = (ip_taskq_arg_t *)
		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);

		if (taskq_arg == NULL) {
			/* Drop ill_lock before bailing out to ip_input() */
			mutex_exit(&ill->ill_lock);
			goto out;
		}

		taskq_arg->ip_taskq_ill = ill;
		taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
		taskq_arg->ip_taskq_cpu = CPU;

		/*
		 * Set the ILL_SOFT_RING_ASSIGN flag. We don't want
		 * the next interrupt to schedule a task for calling
		 * ip_squeue_soft_ring_affinity();
		 */
		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
	} else {
		mutex_exit(&ill->ill_lock);
		goto out;
	}
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq,
		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
			goto out;

		/* release ref on ill if taskq dispatch fails */
		ill_waiter_dcr(ill);
	}
	/*
	 * Clear the ILL_SOFT_RING_ASSIGN flag so that affinity
	 * assignment can be tried again later.
	 */
	mutex_enter(&ill->ill_lock);
	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));

out:
	ip_input(ill, ip_ring, mp_chain, hdrlen);
}

static squeue_t *
ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
{
	int		i;
	squeue_set_t	*best_sqs = NULL;
	squeue_set_t	*curr_sqs = NULL;
	int		min_sq = 0;
	squeue_t	*sqp = NULL;
	char		sqname[64];

	/*
	 * If fanout is set and the passed squeue_set already has some
	 * squeues which are managing the NICs, try to find squeues on
	 * an unused CPU.
	 */
	if (sqs->sqs_size > 1 && fanout) {
		/*
		 * First check to see if any squeue on the CPU passed
		 * is managing a NIC.
		 */
		for (i = 0; i < sqs->sqs_size; i++) {
			mutex_enter(&sqs->sqs_list[i]->sq_lock);
			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
				mutex_exit(&sqs->sqs_list[i]->sq_lock);
				break;
			}
			mutex_exit(&sqs->sqs_list[i]->sq_lock);
		}
		if (i != sqs->sqs_size) {
			best_sqs = sqset_global_list[sqset_global_size - 1];
			min_sq = best_sqs->sqs_size;

			for (i = sqset_global_size - 2; i >= 0; i--) {
				curr_sqs = sqset_global_list[i];
				if (curr_sqs->sqs_size < min_sq) {
					best_sqs = curr_sqs;
					min_sq = curr_sqs->sqs_size;
				}
			}

			ASSERT(best_sqs != NULL);
			sqs = best_sqs;
			bind_cpu = cpu[sqs->sqs_bind];
		}
	}

	mutex_enter(&sqs->sqs_lock);

	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state &
		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
			sqp = sqs->sqs_list[i];
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit for squeues
			 * we can allocate on this CPU.
			 */
			mutex_exit(&sqs->sqs_lock);
			return (NULL);
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
		    bind_cpu->cpu_id, sqs->sqs_size);

		sqp = squeue_create(sqname, bind_cpu->cpu_id,
		    ip_squeue_worker_wait, minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		mutex_enter(&cpu_lock);
		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
			squeue_bind(sqp, -1);
		}
		mutex_exit(&cpu_lock);

		mutex_enter(&sqp->sq_lock);
	}

	mutex_exit(&sqs->sqs_lock);
	ASSERT(sqp != NULL);
	return (sqp);
}

/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;
	ill_t		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed, so worst case we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
	    taskq_arg == NULL) {
		/*
		 * Do the ring to squeue binding only if we are in interrupt
		 * context and there is no one else trying the bind already.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. Can't really do that in interrupt
	 * context. Assign the default sqp to this connection and
	 * trigger creation of a new sqp and binding it to this ring
	 * via taskq. Need to make sure the ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill OR
	 * taskq_dispatch failed, probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}

/*
 * NDD hooks for setting ip_squeue_xxx tuneables.
 */

/* ARGSUSED */
int
ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
    caddr_t addr, cred_t *cr)
{
	int *bind_enabled = (int *)addr;
	long new_value;
	int i;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (ip_squeue_bind == new_value)
		return (0);

	*bind_enabled = new_value;
	mutex_enter(&cpu_lock);
	if (new_value == 0) {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_unbind(sqset_global_list[i]);
	} else {
		for (i = 0; i < sqset_global_size; i++)
			ip_squeue_set_bind(sqset_global_list[i]);
	}

	mutex_exit(&cpu_lock);
	return (0);
}
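
/*
 * Editorial example of exercising these NDD hooks from userland, assuming
 * the standard ndd(1M) syntax and that the tuneables are exported through
 * /dev/ip as the header comment above states:
 *
 *	# ndd -set /dev/ip ip_squeue_bind 0
 *	# ndd -set /dev/ip ip_squeue_profile 2
 *
 * The second command corresponds to the "enable and reset" case handled by
 * ip_squeue_profile_set() below.
 */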

/*
 * Set squeue profiling.
 * 0 means "disable"
 * 1 means "enable"
 * 2 means "enable and reset"
 */
/* ARGSUSED */
int
ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
    cred_t *cr)
{
	int *profile_enabled = (int *)cp;
	long new_value;
	squeue_set_t *sqs;

	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
		return (EINVAL);

	if (new_value == 0)
		squeue_profile_stop();
	else if (new_value == 1)
		squeue_profile_start();
	else if (new_value == 2) {
		int i, j;

		squeue_profile_stop();
		mutex_enter(&cpu_lock);
		for (i = 0; i < sqset_global_size; i++) {
			sqs = sqset_global_list[i];
			for (j = 0; j < sqs->sqs_size; j++) {
				squeue_profile_reset(sqs->sqs_list[j]);
			}
		}
		mutex_exit(&cpu_lock);

		new_value = 1;
		squeue_profile_start();
	}
	*profile_enabled = new_value;

	return (0);
}

/*
 * Reconfiguration callback
 */

/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not bind
		 * it yet.
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}

/* ARGSUSED */
static void
ip_squeue_set_bind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	if (!ip_squeue_bind)
		return;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];
		if (sqp->sq_state & SQS_BOUND)
			continue;
		squeue_bind(sqp, -1);
	}
	mutex_exit(&sqs->sqs_lock);
}

static void
ip_squeue_set_unbind(squeue_set_t *sqs)
{
	int i;
	squeue_t *sqp;

	mutex_enter(&sqs->sqs_lock);
	for (i = 0; i < sqs->sqs_size; i++) {
		sqp = sqs->sqs_list[i];

		/*
		 * CPU is going offline. Remove the thread affinity
		 * for any soft ring threads the squeue is managing.
		 */
		if (sqp->sq_state & SQS_ILL_BOUND) {
			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
			ill_t		*ill = ring->rr_ill;

			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
				ASSERT(ring->rr_handle != NULL);
				ill->ill_dls_capab->ill_dls_unbind(
				    ring->rr_handle);
			}
		}
		if (!(sqp->sq_state & SQS_BOUND))
			continue;
		squeue_unbind(sqp);
	}
	mutex_exit(&sqs->sqs_lock);
}