/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * IP interface to squeues.
 *
 * IP uses squeues to force serialization of packets, both incoming and
 * outgoing. Each squeue is associated with a connection instance (conn_t)
 * above, and a soft ring (if enabled) below. Each CPU will have a default
 * squeue for outbound connections, and each soft ring of an interface will
 * have an squeue to which it sends incoming packets. squeues are never
 * destroyed, and if they become unused they are kept around against future
 * needs.
 *
 * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
 * in the system there will be one squeue set, all of whose squeues will be
 * bound to that CPU, plus one additional set known as the unbound set. Sets
 * associated with CPUs will have one default squeue, for outbound
 * connections, and a linked list of squeues used by various NICs for inbound
 * packets. The unbound set also has a linked list of squeues, but no default
 * squeue.
 *
 * When a CPU goes offline its squeue set is destroyed, and all its squeues
 * are moved to the unbound set. When a CPU comes online, a new squeue set is
 * created and the unbound set is searched for a default squeue formerly bound
 * to this CPU. If no default squeue is found, a new one is created.
 *
 * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
 * and not the squeue code. squeue.c will not touch them, and we can modify
 * them without holding the squeue lock because of the guarantee that squeues
 * are never destroyed. ip_squeue locks must be held, however.
 *
 * All the squeue sets are protected by a single lock, the sqset_lock. This
 * is also used to protect the sq_next and sq_set fields of an squeue_t.
 *
 * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock.
 *
 * There are two modes of associating connections with squeues. The first mode
 * associates each connection with the CPU that creates the connection (either
 * at open time or at accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that there is an association between each connection and
 * squeue, and between each squeue and CPU, does not mean that each connection
 * is always processed on this CPU and on this CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue-to-CPU binding is only relevant for the worker thread.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(ill_rx_ring_t)
 *
 * Returns the squeue associated with an ill receive ring. If the ring is
 * not bound to a CPU, and we're currently servicing the interrupt which
 * generated the packet, then bind the squeue to the CPU.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *    unbound from the CPU. This allows the CPU unconfig code to move it to
 *    another CPU.
 *
 * o When the CPU is coming online, it creates a new squeue for this CPU if
 *    necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
 * pick the default squeue from a random CPU, otherwise use our CPU's default
 * squeue.
 *
 * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
 * /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
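
/*
 * The following is an illustrative userland sketch (not part of this file's
 * build; all names in it are hypothetical) of the squeue-set topology
 * described above: one set per CPU, each with a default squeue plus a list
 * of ill-bound squeues, and a single unbound set that absorbs the squeues
 * of a CPU when it goes offline.
 */
#if 0
#include <stddef.h>

typedef struct sq {
        struct sq *next;
        int bound_cpu;                  /* -1 when unbound */
} sq_t;

typedef struct sq_set {
        int cpuid;                      /* -1 for the unbound set */
        sq_t *head;                     /* ill-bound squeues */
        sq_t *dflt;                     /* default squeue; NULL in unbound set */
} sq_set_t;

/* Move every squeue of an offlined CPU's set onto the unbound set. */
static void
set_offline(sq_set_t *set, sq_set_t *unbound)
{
        sq_t *sq;

        while ((sq = set->head) != NULL) {
                set->head = sq->next;
                sq->bound_cpu = -1;
                sq->next = unbound->head;
                unbound->head = sq;
        }
        /* A CPU set always has a default squeue; it moves too. */
        set->dflt->bound_cpu = -1;
        set->dflt->next = unbound->head;
        unbound->head = set->dflt;
        set->dflt = NULL;
}
#endif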

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <netinet/ip6.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>
#include <sys/tihdr.h>
#include <inet/udp_impl.h>
#include <sys/strsubr.h>
#include <sys/zone.h>
#include <sys/dld.h>
#include <sys/atomic.h>

/*
 * List of all created squeue sets. The list and its size are protected by
 * sqset_lock.
 */
static squeue_set_t **sqset_global_list; /* list 0 is the unbound list */
static uint_t sqset_global_size;
kmutex_t sqset_lock;

static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_t *ip_squeue_create(pri_t);
static squeue_set_t *ip_squeue_set_create(processorid_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
static void ip_squeue_set_destroy(cpu_t *);
static void ip_squeue_clean(void *, mblk_t *, void *);

#define CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

static squeue_t *
ip_squeue_create(pri_t pri)
{
        squeue_t *sqp;

        sqp = squeue_create(ip_squeue_worker_wait, pri);
        ASSERT(sqp != NULL);
        if (ip_squeue_create_callback != NULL)
                ip_squeue_create_callback(sqp);
        return (sqp);
}
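
/*
 * Illustrative sketch (userland, hypothetical names, not built): the
 * creation callback above is a simple factory hook -- consumers register a
 * function once, and the factory invokes it for every object it produces,
 * without knowing who is listening.
 */
#if 0
#include <stdlib.h>

typedef struct widget { int id; } widget_t;

/* A consumer may register a hook to observe every widget created. */
static void (*widget_create_hook)(widget_t *) = NULL;

widget_t *
widget_create(int id)
{
        widget_t *w = malloc(sizeof (*w));

        if (w == NULL)
                return (NULL);
        w->id = id;
        if (widget_create_hook != NULL)
                widget_create_hook(w);  /* e.g. attach debug counters */
        return (w);
}
#endif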

/*
 * Create a new squeue_set. If id == -1, then we're creating the unbound set,
 * which should only happen once when we are first initialized. Otherwise id
 * is the id of the CPU that needs a set, either because we are initializing
 * or because the CPU has come online.
 *
 * If id != -1, then we need at a minimum to provide a default squeue for the
 * new set. We search the unbound set for candidates, and if none are found we
 * create a new one.
 */
static squeue_set_t *
ip_squeue_set_create(processorid_t id)
{
        squeue_set_t *sqs;
        squeue_set_t *src = sqset_global_list[0];
        squeue_t **lastsqp, *sq;
        squeue_t **defaultq_lastp = NULL;

        sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
        sqs->sqs_cpuid = id;

        if (id == -1) {
                ASSERT(sqset_global_size == 0);
                sqset_global_list[0] = sqs;
                sqset_global_size = 1;
                return (sqs);
        }

        /*
         * When we create an squeue set with id != -1, we need to give it a
         * default squeue, in order to support fanout of conns across
         * CPUs. Try to find a former default squeue that matches this
         * cpu id on the unbound squeue set. If no such squeue is found,
         * find some non-default TCP squeue and steal it. If still no such
         * candidate is found, create a new squeue.
         */

        ASSERT(MUTEX_HELD(&cpu_lock));
        mutex_enter(&sqset_lock);
        lastsqp = &src->sqs_head;

        while (*lastsqp) {
                if ((*lastsqp)->sq_bind == id &&
                    (*lastsqp)->sq_state & SQS_DEFAULT) {
                        defaultq_lastp = lastsqp;
                        break;
                }
                if (defaultq_lastp == NULL &&
                    !((*lastsqp)->sq_state & SQS_DEFAULT)) {
                        defaultq_lastp = lastsqp;
                }
                lastsqp = &(*lastsqp)->sq_next;
        }
        if (defaultq_lastp != NULL) {
                /* Remove from src set and set SQS_DEFAULT */
                sq = *defaultq_lastp;
                *defaultq_lastp = sq->sq_next;
                sq->sq_next = NULL;
                if (!(sq->sq_state & SQS_DEFAULT)) {
                        mutex_enter(&sq->sq_lock);
                        sq->sq_state |= SQS_DEFAULT;
                        mutex_exit(&sq->sq_lock);
                }
        } else {
                sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
                sq->sq_state |= SQS_DEFAULT;
        }

        sq->sq_set = sqs;
        sqs->sqs_default = sq;
        squeue_bind(sq, id);    /* this locks squeue mutex */

        ASSERT(sqset_global_size <= NCPU);
        sqset_global_list[sqset_global_size++] = sqs;
        mutex_exit(&sqset_lock);
        return (sqs);
}
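
/*
 * Illustrative sketch (userland, hypothetical names, not built): the
 * squeue_t ** traversal above is the classic pointer-to-pointer idiom for
 * singly linked lists. Keeping a pointer to the link rather than to the
 * node lets the caller unlink a match with one assignment and no special
 * case for the list head.
 */
#if 0
#include <stddef.h>

struct node { struct node *next; int key; };

/* Unlink and return the first node matching key, or NULL. */
static struct node *
list_steal(struct node **headp, int key)
{
        struct node **linkp, *n;

        for (linkp = headp; *linkp != NULL; linkp = &(*linkp)->next) {
                if ((*linkp)->key == key) {
                        n = *linkp;
                        *linkp = n->next;       /* works for the head too */
                        n->next = NULL;
                        return (n);
                }
        }
        return (NULL);
}
#endif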

/*
 * Called by ill_ring_add() to find an squeue to associate with a new ring.
 */
squeue_t *
ip_squeue_getfree(pri_t pri)
{
        squeue_set_t *sqs = sqset_global_list[0];
        squeue_t *sq;

        mutex_enter(&sqset_lock);
        for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
                /*
                 * Select a non-default squeue that is not already
                 * bound to an ill.
                 */
                if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
                        break;
        }

        if (sq == NULL) {
                sq = ip_squeue_create(pri);
                sq->sq_set = sqs;
                sq->sq_next = sqs->sqs_head;
                sqs->sqs_head = sq;
        }

        ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
            SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
            SQS_POLL_THR_QUIESCED)));

        mutex_enter(&sq->sq_lock);
        sq->sq_state |= SQS_ILL_BOUND;
        mutex_exit(&sq->sq_lock);
        mutex_exit(&sqset_lock);

        if (sq->sq_priority != pri) {
                thread_lock(sq->sq_worker);
                (void) thread_change_pri(sq->sq_worker, pri, 0);
                thread_unlock(sq->sq_worker);

                thread_lock(sq->sq_poll_thr);
                (void) thread_change_pri(sq->sq_poll_thr, pri, 0);
                thread_unlock(sq->sq_poll_thr);

                sq->sq_priority = pri;
        }
        return (sq);
}
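
/*
 * Illustrative sketch (userland pthreads, hypothetical names, not built):
 * the get-or-create pattern above -- scan a shared free list under a lock
 * for a reusable object, create one only on a miss, and mark the object
 * busy before dropping the lock so no other thread can claim it.
 */
#if 0
#include <pthread.h>
#include <stdlib.h>

struct obj { struct obj *next; int busy; };

static struct obj *free_head;
static pthread_mutex_t free_lock = PTHREAD_MUTEX_INITIALIZER;

static struct obj *
obj_getfree(void)
{
        struct obj *o;

        pthread_mutex_lock(&free_lock);
        for (o = free_head; o != NULL; o = o->next) {
                if (!o->busy)
                        break;
        }
        if (o == NULL) {
                if ((o = calloc(1, sizeof (*o))) == NULL)
                        abort();        /* sketch: no recovery path */
                o->next = free_head;
                free_head = o;
        }
        o->busy = 1;            /* claim before unlocking */
        pthread_mutex_unlock(&free_lock);
        return (o);
}
#endif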

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
        int i;
        squeue_set_t *sqs;

        ASSERT(sqset_global_list == NULL);

        ip_squeue_create_callback = callback;
        squeue_init();
        mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
        sqset_global_list =
            kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
        sqset_global_size = 0;
        /*
         * We are called at system boot time and we don't
         * expect memory allocation failure.
         */
        sqs = ip_squeue_set_create(-1);
        ASSERT(sqs != NULL);

        mutex_enter(&cpu_lock);
        /* Create a squeue set for each active CPU available */
        for (i = 0; i < NCPU; i++) {
                cpu_t *cp = cpu_get(i);
                if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
                        /*
                         * We are called at system boot time and we don't
                         * expect memory allocation failure then either.
                         */
                        cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
                        ASSERT(cp->cpu_squeue_set != NULL);
                }
        }

        register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
        mutex_exit(&cpu_lock);
}

/*
 * Get a default squeue, either from the current CPU or a CPU derived by hash
 * from the index argument, depending upon the setting of ip_squeue_fanout.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
        squeue_set_t *sqs = NULL;
        squeue_t *sq;

        /*
         * The minimum value of sqset_global_size is 2, one for the unbound
         * squeue set and another for the squeue set of the zeroth CPU.
         * Even though the value could be changing, it can never go below 2,
         * so the assert does not need the lock protection.
         */
        ASSERT(sqset_global_size > 1);

        /* Protect against changes to sqset_global_list */
        mutex_enter(&sqset_lock);

        if (!ip_squeue_fanout)
                sqs = CPU->cpu_squeue_set;

        /*
         * sqset_global_list[0] corresponds to the unbound squeue set.
         * The computation below picks a set other than the unbound set.
         */
        if (sqs == NULL)
                sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
        sq = sqs->sqs_default;

        mutex_exit(&sqset_lock);
        ASSERT(sq);
        return (sq);
}
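
/*
 * Illustrative sketch (userland, not built): the index computation above
 * maps an arbitrary fanout index onto slots 1 through size - 1,
 * deliberately skipping slot 0 (the unbound set).
 */
#if 0
#include <stdio.h>

int
main(void)
{
        unsigned int size = 5;  /* e.g. unbound set + 4 CPU sets */
        unsigned int index;

        for (index = 0; index < 10; index++) {
                /* Always lands in [1, size - 1]; never picks slot 0. */
                printf("index %u -> set %u\n", index,
                    (index % (size - 1)) + 1);
        }
        return (0);
}
#endif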

/*
 * Move squeue from its current set to newset. Not used for default squeues.
 * Bind or unbind the worker thread as appropriate.
 */
static void
ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
{
        squeue_set_t *set;
        squeue_t **lastsqp;
        processorid_t cpuid = newset->sqs_cpuid;

        ASSERT(!(sq->sq_state & SQS_DEFAULT));
        ASSERT(!MUTEX_HELD(&sq->sq_lock));
        ASSERT(MUTEX_HELD(&sqset_lock));

        set = sq->sq_set;
        if (set == newset)
                return;

        lastsqp = &set->sqs_head;
        while (*lastsqp != sq)
                lastsqp = &(*lastsqp)->sq_next;

        *lastsqp = sq->sq_next;
        sq->sq_next = newset->sqs_head;
        newset->sqs_head = sq;
        sq->sq_set = newset;
        if (cpuid == -1)
                squeue_unbind(sq);
        else
                squeue_bind(sq, cpuid);
}

/*
 * Move squeue from its current set to cpuid's set and bind to cpuid.
 */
int
ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
{
        cpu_t *cpu;
        squeue_set_t *set;

        if (sq->sq_state & SQS_DEFAULT)
                return (-1);

        ASSERT(MUTEX_HELD(&cpu_lock));

        cpu = cpu_get(cpuid);
        if (!CPU_ISON(cpu))
                return (-1);

        mutex_enter(&sqset_lock);
        set = cpu->cpu_squeue_set;
        if (set != NULL)
                ip_squeue_set_move(sq, set);
        mutex_exit(&sqset_lock);
        return ((set == NULL) ? -1 : 0);
}

/*
 * The mac layer is calling, asking us to move an squeue to a
 * new CPU. This routine is called with cpu_lock held.
 */
void
ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
{
        ASSERT(ILL_MAC_PERIM_HELD(ill));
        ASSERT(rx_ring->rr_ill == ill);

        mutex_enter(&ill->ill_lock);
        if (rx_ring->rr_ring_state == RR_FREE ||
            rx_ring->rr_ring_state == RR_FREE_INPROG) {
                mutex_exit(&ill->ill_lock);
                return;
        }

        if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
                rx_ring->rr_ring_state = RR_SQUEUE_BOUND;

        mutex_exit(&ill->ill_lock);
}

void *
ip_squeue_add_ring(ill_t *ill, void *mrp)
{
        mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
        ill_rx_ring_t *rx_ring, *ring_tbl;
        int ip_rx_index;
        squeue_t *sq = NULL;
        pri_t pri;

        ASSERT(ILL_MAC_PERIM_HELD(ill));
        ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
        ASSERT(ill->ill_dld_capab != NULL);

        ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;

        mutex_enter(&ill->ill_lock);
        for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
                rx_ring = &ring_tbl[ip_rx_index];
                if (rx_ring->rr_ring_state == RR_FREE)
                        break;
        }

        if (ip_rx_index == ILL_MAX_RINGS) {
                /*
                 * We ran out of ILL_MAX_RINGS worth of rx_ring structures.
                 * If we have devices which can overwhelm this limit,
                 * ILL_MAX_RINGS should be made configurable. Meanwhile it
                 * causes no panic because the driver will pass ip_input a
                 * NULL handle, which will make IP allocate the default
                 * squeue, and polling mode will not be used for this ring.
                 */
                cmn_err(CE_NOTE,
                    "Reached maximum number of receiving rings (%d) for %s\n",
                    ILL_MAX_RINGS, ill->ill_name);
                mutex_exit(&ill->ill_lock);
                return (NULL);
        }

        bzero(rx_ring, sizeof (ill_rx_ring_t));
        rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
        /* XXX: Hard code it to tcp accept for now */
        rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;

        rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
        rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
        rx_ring->rr_intr_disable =
            (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
        rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
        rx_ring->rr_ill = ill;

        pri = mrfp->mrf_flow_priority;

        sq = ip_squeue_getfree(pri);

        mutex_enter(&sq->sq_lock);
        sq->sq_rx_ring = rx_ring;
        rx_ring->rr_sqp = sq;

        sq->sq_state |= SQS_POLL_CAPAB;

        rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
        sq->sq_ill = ill;
        mutex_exit(&sq->sq_lock);
        mutex_exit(&ill->ill_lock);

        DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
            ip_rx_index, void *, mrfp->mrf_rx_arg);

        /* Assign the squeue to the specified CPU as well */
        mutex_enter(&cpu_lock);
        (void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
        mutex_exit(&cpu_lock);

        return (rx_ring);
}
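
/*
 * Illustrative sketch (userland pthreads, hypothetical names, not built):
 * the ring table above is a fixed-size slot allocator -- scan for a FREE
 * slot under the owner's lock, claim it by changing its state, and fail
 * gracefully when the table is full rather than growing it.
 */
#if 0
#include <pthread.h>

#define MAX_SLOTS       16

enum slot_state { SLOT_FREE, SLOT_INUSE };

static enum slot_state slot_tbl[MAX_SLOTS];
static pthread_mutex_t tbl_lock = PTHREAD_MUTEX_INITIALIZER;

/* Returns a slot index, or -1 if the table is full. */
static int
slot_alloc(void)
{
        int i;

        pthread_mutex_lock(&tbl_lock);
        for (i = 0; i < MAX_SLOTS; i++) {
                if (slot_tbl[i] == SLOT_FREE) {
                        slot_tbl[i] = SLOT_INUSE;       /* claim under lock */
                        break;
                }
        }
        pthread_mutex_unlock(&tbl_lock);
        return (i == MAX_SLOTS ? -1 : i);
}
#endif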

/*
 * Sanitize the squeue associated with the ring. Some of the processing
 * needs to be done from inside the perimeter.
 */
void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
        squeue_t *sqp;

        ASSERT(ILL_MAC_PERIM_HELD(ill));
        ASSERT(rx_ring != NULL);

        /* Just clean one squeue */
        mutex_enter(&ill->ill_lock);
        if (rx_ring->rr_ring_state == RR_FREE) {
                mutex_exit(&ill->ill_lock);
                return;
        }
        rx_ring->rr_ring_state = RR_FREE_INPROG;
        sqp = rx_ring->rr_sqp;

        mutex_enter(&sqp->sq_lock);
        sqp->sq_state |= SQS_POLL_CLEANUP;
        cv_signal(&sqp->sq_worker_cv);
        mutex_exit(&ill->ill_lock);
        while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
                cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
        sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND);

        ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
            SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
            SQS_POLL_THR_QUIESCED)));

        cv_signal(&sqp->sq_worker_cv);
        mutex_exit(&sqp->sq_lock);

        /*
         * Logically free the squeue. It goes back to the set of unused
         * squeues.
         */
        mutex_enter(&sqset_lock);
        ip_squeue_set_move(sqp, sqset_global_list[0]);
        mutex_exit(&sqset_lock);

        mutex_enter(&ill->ill_lock);
        rx_ring->rr_ring_state = RR_FREE;
        mutex_exit(&ill->ill_lock);
}

/*
 * Stop the squeue from polling. This needs to be done
 * from inside the perimeter.
 */
void
ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
        squeue_t *sqp;

        ASSERT(ILL_MAC_PERIM_HELD(ill));
        ASSERT(rx_ring != NULL);

        sqp = rx_ring->rr_sqp;
        mutex_enter(&sqp->sq_lock);
        sqp->sq_state |= SQS_POLL_QUIESCE;
        cv_signal(&sqp->sq_worker_cv);
        while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
                cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);

        mutex_exit(&sqp->sq_lock);
}
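
/*
 * Illustrative sketch (userland pthreads, hypothetical names, not built):
 * the cleanup/quiesce/restart operations above all use the same condition-
 * variable handshake -- set a request flag, signal the worker, then wait on
 * a "done" flag that the worker sets once it has acted on the request.
 */
#if 0
#include <pthread.h>

#define REQ_QUIESCE     0x1
#define REQ_DONE        0x2

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t worker_cv = PTHREAD_COND_INITIALIZER;
static pthread_cond_t done_cv = PTHREAD_COND_INITIALIZER;
static unsigned int state;

/* Control side: request quiesce and wait for acknowledgement. */
static void
quiesce(void)
{
        pthread_mutex_lock(&lock);
        state |= REQ_QUIESCE;
        pthread_cond_signal(&worker_cv);
        while (!(state & REQ_DONE))
                pthread_cond_wait(&done_cv, &lock);
        pthread_mutex_unlock(&lock);
}

/* Worker side: wait for a request, honor it, post the done flag. */
static void *
worker(void *arg)
{
        pthread_mutex_lock(&lock);
        while (!(state & REQ_QUIESCE))
                pthread_cond_wait(&worker_cv, &lock);
        /* ... stop polling, drain queued work ... */
        state |= REQ_DONE;
        pthread_cond_signal(&done_cv);
        pthread_mutex_unlock(&lock);
        return (arg);
}
#endif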

/*
 * Restart polling etc. Needs to be inside the perimeter to
 * prevent races.
 */
void
ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
        squeue_t *sqp;

        ASSERT(ILL_MAC_PERIM_HELD(ill));
        ASSERT(rx_ring != NULL);

        sqp = rx_ring->rr_sqp;
        mutex_enter(&sqp->sq_lock);
        /*
         * Handle change in number of rings between the quiesce and
         * restart operations by checking for a previous quiesce before
         * attempting a restart.
         */
        if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
                mutex_exit(&sqp->sq_lock);
                return;
        }
        sqp->sq_state |= SQS_POLL_RESTART;
        cv_signal(&sqp->sq_worker_cv);
        while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
                cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
        sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
        mutex_exit(&sqp->sq_lock);
}

/*
 * Sanitize all squeues associated with the ill.
 */
void
ip_squeue_clean_all(ill_t *ill)
{
        int idx;
        ill_rx_ring_t *rx_ring;

        for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
                rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
                ip_squeue_clean_ring(ill, rx_ring);
        }
}

/*
 * Used by IP to get the squeue associated with a ring. If the squeue isn't
 * yet bound to a CPU, and we're being called directly from the NIC's
 * interrupt, then we know what CPU we want to assign the squeue to, so
 * dispatch that task to a taskq.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
        squeue_t *sqp;

        if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
                return (IP_SQUEUE_GET(lbolt));

        return (sqp);
}

/*
 * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
 * its squeues are unbound and moved to the unbound set.
 */
static void
ip_squeue_set_destroy(cpu_t *cpu)
{
        int i;
        squeue_t *sqp, *lastsqp = NULL;
        squeue_set_t *sqs, *unbound = sqset_global_list[0];

        mutex_enter(&sqset_lock);
        if ((sqs = cpu->cpu_squeue_set) == NULL) {
                mutex_exit(&sqset_lock);
                return;
        }

        /* Move all squeues to the unbound set */

        for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
                squeue_unbind(sqp);
                sqp->sq_set = unbound;
        }
        if (sqs->sqs_head) {
                lastsqp->sq_next = unbound->sqs_head;
                unbound->sqs_head = sqs->sqs_head;
        }

        /* Also move the default squeue to the unbound set */

        sqp = sqs->sqs_default;
        ASSERT(sqp);
        ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);

        sqp->sq_next = unbound->sqs_head;
        unbound->sqs_head = sqp;
        squeue_unbind(sqp);
        sqp->sq_set = unbound;

        for (i = 1; i < sqset_global_size; i++)
                if (sqset_global_list[i] == sqs)
                        break;

        ASSERT(i < sqset_global_size);
        sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
        sqset_global_list[sqset_global_size - 1] = NULL;
        sqset_global_size--;

        mutex_exit(&sqset_lock);
        kmem_free(sqs, sizeof (*sqs));
}

/*
 * Reconfiguration callback
 */
/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
        cpu_t *cp = cpu_get(id);

        ASSERT(MUTEX_HELD(&cpu_lock));
        switch (what) {
        case CPU_CONFIG:
        case CPU_ON:
        case CPU_INIT:
        case CPU_CPUPART_IN:
                if (cp->cpu_squeue_set == NULL)
                        cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
                break;
        case CPU_UNCONFIG:
        case CPU_OFF:
        case CPU_CPUPART_OUT:
                ASSERT((cp->cpu_squeue_set != NULL) ||
                    (cp->cpu_flags & CPU_OFFLINE));
                if (cp->cpu_squeue_set != NULL) {
                        ip_squeue_set_destroy(cp);
                        cp->cpu_squeue_set = NULL;
                }
                break;
        default:
                break;
        }
        return (0);
}
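
/*
 * Illustrative sketch (userland, hypothetical names, not built): the set
 * removal in ip_squeue_set_destroy() above uses the swap-with-last idiom
 * for unordered arrays -- overwrite the departing element with the last
 * one and shrink the count, giving O(1) removal at the cost of element
 * order.
 */
#if 0
#include <stddef.h>

static int
array_remove(void **list, size_t *sizep, void *elem)
{
        size_t i;

        for (i = 0; i < *sizep; i++) {
                if (list[i] == elem) {
                        list[i] = list[*sizep - 1];     /* swap with last */
                        list[*sizep - 1] = NULL;
                        (*sizep)--;
                        return (0);
                }
        }
        return (-1);    /* not found */
}
#endif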