/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * IP interface to squeues.
 *
 * IP uses squeues to force serialization of packets, both incoming and
 * outgoing. Each squeue is associated with a connection instance (conn_t)
 * above, and a soft ring (if enabled) below. Each CPU will have a default
 * squeue for outbound connections, and each soft ring of an interface will
 * have an squeue to which it sends incoming packets. squeues are never
 * destroyed, and if they become unused they are kept around against future
 * needs.
 *
 * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
 * in the system there will be one squeue set, all of whose squeues will be
 * bound to that CPU, plus one additional set known as the unbound set. Sets
 * associated with CPUs will have one default squeue, for outbound
 * connections, and a linked list of squeues used by various NICs for inbound
 * packets. The unbound set also has a linked list of squeues, but no default
 * squeue.
 *
 * When a CPU goes offline its squeue set is destroyed, and all its squeues
 * are moved to the unbound set. When a CPU comes online, a new squeue set is
 * created and the unbound set is searched for a default squeue formerly
 * bound to this CPU. If no default squeue is found, a new one is created.
 *
 * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
 * and not the squeue code. squeue.c will not touch them, and we can modify
 * them without holding the squeue lock because of the guarantee that squeues
 * are never destroyed. The sqset_lock must be held, however.
 *
 * All the squeue sets are protected by a single lock, the sqset_lock. This
 * is also used to protect the sq_next and sq_set fields of an squeue_t.
 *
 * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
 *
 * There are two modes of associating connections with squeues. The first
 * mode associates each connection with the CPU that creates the connection
 * (either at open time or at accept time). The second mode associates each
 * connection with a random CPU, effectively distributing load over all CPUs
 * and all squeues in the system. The mode is controlled by the
 * ip_squeue_fanout variable.
 *
 * NOTE: The fact that each connection is associated with an squeue, and
 * each squeue with a CPU, does not mean that a connection is always
 * processed on that CPU and on that CPU only. Any thread calling
 * squeue_enter() may process the connection on whatever CPU it is scheduled
 * on. The squeue to CPU binding is only relevant for the worker thread.
 *
 * INTERFACE:
 *
 * squeue_t *ip_squeue_get(ill_rx_ring_t)
 *
 * Returns the squeue associated with an ill receive ring. If the ring is
 * not bound to a CPU, and we're currently servicing the interrupt which
 * generated the packet, then bind the squeue to that CPU.
 *
 *
 * DR Notes
 * ========
 *
 * ip_squeue_init() registers a call-back function with the CPU DR
 * subsystem using register_cpu_setup_func(). The call-back function does two
 * things:
 *
 * o When the CPU is going off-line or unconfigured, the worker thread is
 *	unbound from the CPU. This allows the CPU unconfig code to move it to
 *	another CPU.
 *
 * o When the CPU is going online, it creates a new squeue for this CPU if
 *	necessary and binds the squeue worker thread to this CPU.
 *
 * TUNABLES:
 *
 * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
 * pick the default squeue from a random CPU, otherwise use our CPU's default
 * squeue.
 *
 * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
 * /dev/ip.
 *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
 */
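
/*
 * Illustrative sketch (not code from this file): a consumer such as TCP
 * typically assigns a connection its squeue at open or accept time with
 * something like
 *
 *	connp->conn_sqp = IP_SQUEUE_GET(lbolt);
 *
 * where IP_SQUEUE_GET() picks a default squeue, honoring ip_squeue_fanout
 * (see ip_squeue_random() below).
 */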

#include <sys/types.h>
#include <sys/debug.h>
#include <sys/kmem.h>
#include <sys/cpuvar.h>
#include <sys/cmn_err.h>

#include <inet/common.h>
#include <inet/ip.h>
#include <netinet/ip6.h>
#include <inet/ip_if.h>
#include <inet/ip_ire.h>
#include <inet/nd.h>
#include <inet/ipclassifier.h>
#include <sys/conf.h>
#include <sys/sunddi.h>
#include <sys/dlpi.h>
#include <sys/squeue_impl.h>
#include <sys/tihdr.h>
#include <inet/udp_impl.h>
#include <sys/strsubr.h>
#include <sys/zone.h>
#include <sys/dld.h>
#include <sys/atomic.h>

/*
 * List of all created squeue sets. The list and its size are protected by
 * sqset_lock.
 */
static squeue_set_t	**sqset_global_list; /* list 0 is the unbound list */
static uint_t		sqset_global_size;
kmutex_t		sqset_lock;

static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;

static squeue_t *ip_squeue_create(pri_t);
static squeue_set_t *ip_squeue_set_create(processorid_t);
static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
static void ip_squeue_set_destroy(cpu_t *);

#define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))

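/*
 * Create an squeue with the standard worker wait time and the given worker
 * thread priority, running the creation callback (if one was registered via
 * ip_squeue_init()) before returning it.
 */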
static squeue_t *
ip_squeue_create(pri_t pri)
{
	squeue_t *sqp;

	sqp = squeue_create(ip_squeue_worker_wait, pri);
	ASSERT(sqp != NULL);
	if (ip_squeue_create_callback != NULL)
		ip_squeue_create_callback(sqp);
	return (sqp);
}

/*
 * Create a new squeue_set. If id == -1, then we're creating the unbound set,
 * which should only happen once when we are first initialized. Otherwise id
 * is the id of the CPU that needs a set, either because we are initializing
 * or because the CPU has come online.
 *
 * If id != -1, then we need at a minimum to provide a default squeue for the
 * new set. We search the unbound set for candidates, and if none are found we
 * create a new one.
 */
static squeue_set_t *
ip_squeue_set_create(processorid_t id)
{
	squeue_set_t	*sqs;
	squeue_set_t	*src = sqset_global_list[0];
	squeue_t	**lastsqp, *sq;
	squeue_t	**defaultq_lastp = NULL;

	sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
	sqs->sqs_cpuid = id;

	if (id == -1) {
		ASSERT(sqset_global_size == 0);
		sqset_global_list[0] = sqs;
		sqset_global_size = 1;
		return (sqs);
	}

	/*
	 * When we create an squeue set with id != -1, we need to give it a
	 * default squeue, in order to support fanout of conns across
	 * CPUs. Try to find a former default squeue that matches this
	 * cpu id on the unbound squeue set. If no such squeue is found,
	 * find some other non-default squeue and steal it. If still no
	 * candidate is found, create a new squeue.
	 */

	ASSERT(MUTEX_HELD(&cpu_lock));
	mutex_enter(&sqset_lock);
	lastsqp = &src->sqs_head;

	while (*lastsqp) {
		if ((*lastsqp)->sq_bind == id &&
		    (*lastsqp)->sq_state & SQS_DEFAULT) {
			defaultq_lastp = lastsqp;
			break;
		}
		if (defaultq_lastp == NULL &&
		    !((*lastsqp)->sq_state & SQS_DEFAULT)) {
			defaultq_lastp = lastsqp;
		}
		lastsqp = &(*lastsqp)->sq_next;
	}
	if (defaultq_lastp != NULL) {
		/* Remove from src set and set SQS_DEFAULT */
		sq = *defaultq_lastp;
		*defaultq_lastp = sq->sq_next;
		sq->sq_next = NULL;
		if (!(sq->sq_state & SQS_DEFAULT)) {
			mutex_enter(&sq->sq_lock);
			sq->sq_state |= SQS_DEFAULT;
			mutex_exit(&sq->sq_lock);
		}
	} else {
		sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
		sq->sq_state |= SQS_DEFAULT;
	}

	sq->sq_set = sqs;
	sqs->sqs_default = sq;
	squeue_bind(sq, id); /* this locks squeue mutex */

	ASSERT(sqset_global_size <= NCPU);
	sqset_global_list[sqset_global_size++] = sqs;
	mutex_exit(&sqset_lock);
	return (sqs);
}

/*
 * Called by ip_squeue_add_ring() to find an squeue to associate with a new
 * ring.
 */

squeue_t *
ip_squeue_getfree(pri_t pri)
{
	squeue_set_t	*sqs = sqset_global_list[0];
	squeue_t	*sq;

	mutex_enter(&sqset_lock);
	for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
		/*
		 * Select a non-default squeue that is not already in
		 * use by another ill.
		 */
		if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
			break;
	}

	if (sq == NULL) {
		sq = ip_squeue_create(pri);
		sq->sq_set = sqs;
		sq->sq_next = sqs->sqs_head;
		sqs->sqs_head = sq;
	}

	ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
	    SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
	    SQS_POLL_THR_QUIESCED)));

	mutex_enter(&sq->sq_lock);
	sq->sq_state |= SQS_ILL_BOUND;
	mutex_exit(&sq->sq_lock);
	mutex_exit(&sqset_lock);

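	/*
	 * The squeue may be recycled from a ring with a different flow
	 * priority; bring the worker and poll threads in line with the
	 * priority requested for this ring.
	 */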
	if (sq->sq_priority != pri) {
		thread_lock(sq->sq_worker);
		(void) thread_change_pri(sq->sq_worker, pri, 0);
		thread_unlock(sq->sq_worker);

		thread_lock(sq->sq_poll_thr);
		(void) thread_change_pri(sq->sq_poll_thr, pri, 0);
		thread_unlock(sq->sq_poll_thr);

		sq->sq_priority = pri;
	}
	return (sq);
}

/*
 * Initialize IP squeues.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;
	squeue_set_t	*sqs;

	ASSERT(sqset_global_list == NULL);

	ip_squeue_create_callback = callback;
	squeue_init();
	mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
	sqset_global_size = 0;
	/*
	 * We are called at system boot time and we don't
	 * expect memory allocation failure.
	 */
	sqs = ip_squeue_set_create(-1);
	ASSERT(sqs != NULL);

	mutex_enter(&cpu_lock);
	/* Create squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu_get(i);
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			/*
			 * We are called at system boot time and we don't
			 * expect memory allocation failure here.
			 */
			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
			ASSERT(cp->cpu_squeue_set != NULL);
		}
	}

	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
	mutex_exit(&cpu_lock);
}

/*
 * Get a default squeue, either from the current CPU or a CPU derived by hash
 * from the index argument, depending upon the setting of ip_squeue_fanout.
 */
squeue_t *
ip_squeue_random(uint_t index)
{
	squeue_set_t *sqs = NULL;
	squeue_t *sq;

	/*
	 * The minimum value of sqset_global_size is 2, one for the unbound
	 * squeue set and another for the squeue set of the zeroth CPU.
	 * Even though the value could be changing, it can never go below 2,
	 * so the assert does not need the lock protection.
	 */
	ASSERT(sqset_global_size > 1);

	/* Protect against changes to sqset_global_list */
	mutex_enter(&sqset_lock);

	if (!ip_squeue_fanout)
		sqs = CPU->cpu_squeue_set;

	/*
	 * sqset_global_list[0] corresponds to the unbound squeue set.
	 * The computation below picks a set other than the unbound set.
	 */
	if (sqs == NULL)
		sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
	sq = sqs->sqs_default;

	mutex_exit(&sqset_lock);
	ASSERT(sq != NULL);
	return (sq);
}

/*
 * Move an squeue from its current set to newset. Not used for default
 * squeues. Bind or unbind the worker thread as appropriate.
 */

static void
ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
{
	squeue_set_t	*set;
	squeue_t	**lastsqp;
	processorid_t	cpuid = newset->sqs_cpuid;

	ASSERT(!(sq->sq_state & SQS_DEFAULT));
	ASSERT(!MUTEX_HELD(&sq->sq_lock));
	ASSERT(MUTEX_HELD(&sqset_lock));

	set = sq->sq_set;
	if (set == newset)
		return;

	lastsqp = &set->sqs_head;
	while (*lastsqp != sq)
		lastsqp = &(*lastsqp)->sq_next;

	*lastsqp = sq->sq_next;
	sq->sq_next = newset->sqs_head;
	newset->sqs_head = sq;
	sq->sq_set = newset;
	if (cpuid == -1)
		squeue_unbind(sq);
	else
		squeue_bind(sq, cpuid);
}

/*
 * Move an squeue from its current set to cpuid's set and bind to cpuid.
 */

int
ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
{
	cpu_t *cpu;
	squeue_set_t *set;

	if (sq->sq_state & SQS_DEFAULT)
		return (-1);

	ASSERT(MUTEX_HELD(&cpu_lock));

	cpu = cpu_get(cpuid);
	if (!CPU_ISON(cpu))
		return (-1);

	mutex_enter(&sqset_lock);
	set = cpu->cpu_squeue_set;
	if (set != NULL)
		ip_squeue_set_move(sq, set);
	mutex_exit(&sqset_lock);
	return ((set == NULL) ? -1 : 0);
}

/*
 * The mac layer is calling, asking us to move an squeue to a
 * new CPU. This routine is called with cpu_lock held.
 */
void
ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
{
	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring->rr_ill == ill);

	mutex_enter(&ill->ill_lock);
	if (rx_ring->rr_ring_state == RR_FREE ||
	    rx_ring->rr_ring_state == RR_FREE_INPROG) {
		mutex_exit(&ill->ill_lock);
		return;
	}

	if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
		rx_ring->rr_ring_state = RR_SQUEUE_BOUND;

	mutex_exit(&ill->ill_lock);
}

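/*
 * Called by the mac layer, via the dld poll capability, when a receive ring
 * is added. Reserve a free ill_rx_ring_t slot, record the ring's receive and
 * interrupt-control entry points, attach a poll-capable squeue to the ring,
 * and bind that squeue to the CPU the mac layer has chosen. Returns NULL if
 * all ILL_MAX_RINGS slots are in use, in which case the ring falls back to
 * the default squeue path.
 */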
void *
ip_squeue_add_ring(ill_t *ill, void *mrp)
{
	mac_rx_fifo_t		*mrfp = (mac_rx_fifo_t *)mrp;
	ill_rx_ring_t		*rx_ring, *ring_tbl;
	int			ip_rx_index;
	squeue_t		*sq = NULL;
	pri_t			pri;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
	ASSERT(ill->ill_dld_capab != NULL);

	ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;

	mutex_enter(&ill->ill_lock);
	for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
		rx_ring = &ring_tbl[ip_rx_index];
		if (rx_ring->rr_ring_state == RR_FREE)
			break;
	}

	if (ip_rx_index == ILL_MAX_RINGS) {
		/*
		 * We ran out of ILL_MAX_RINGS worth of rx_ring structures.
		 * If we have devices which can exceed this limit,
		 * ILL_MAX_RINGS should be made configurable. Meanwhile this
		 * causes no panic, because the driver will pass ip_input a
		 * NULL handle, which will make IP allocate the default
		 * squeue, and polling mode will not be used for this ring.
		 */
		cmn_err(CE_NOTE,
		    "Reached maximum number of receiving rings (%d) for %s\n",
		    ILL_MAX_RINGS, ill->ill_name);
		mutex_exit(&ill->ill_lock);
		return (NULL);
	}

	bzero(rx_ring, sizeof (ill_rx_ring_t));
	rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
	/* XXX: Hard code it to tcp accept for now */
	rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;

	rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
	rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
	rx_ring->rr_intr_disable =
	    (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
	rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
	rx_ring->rr_ill = ill;

	pri = mrfp->mrf_flow_priority;

	sq = ip_squeue_getfree(pri);

	mutex_enter(&sq->sq_lock);
	sq->sq_rx_ring = rx_ring;
	rx_ring->rr_sqp = sq;

	sq->sq_state |= SQS_POLL_CAPAB;

	rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
	sq->sq_ill = ill;
	mutex_exit(&sq->sq_lock);
	mutex_exit(&ill->ill_lock);

	DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
	    ip_rx_index, void *, mrfp->mrf_rx_arg);

	/* Assign the squeue to the specified CPU as well */
	mutex_enter(&cpu_lock);
	ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
	mutex_exit(&cpu_lock);

	return (rx_ring);
}

/*
 * Sanitize the squeue associated with a ring: stop polling, let the worker
 * run its cleanup, and return the squeue to the unbound set. Some of the
 * processing needs to be done from inside the perimeter.
 */
void
ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	/* Just clean one squeue */
	mutex_enter(&ill->ill_lock);
	if (rx_ring->rr_ring_state == RR_FREE) {
		mutex_exit(&ill->ill_lock);
		return;
	}
	rx_ring->rr_ring_state = RR_FREE_INPROG;
	sqp = rx_ring->rr_sqp;

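	/*
	 * Signal the squeue worker to run the poll-mode cleanup, then wait
	 * for it to report completion. ill_lock can be dropped once the
	 * worker has been signalled, since the ring has already been marked
	 * RR_FREE_INPROG.
	 */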
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state |= SQS_POLL_CLEANUP;
	cv_signal(&sqp->sq_worker_cv);
	mutex_exit(&ill->ill_lock);
	while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
	sqp->sq_state &= ~(SQS_POLL_CLEANUP_DONE | SQS_ILL_BOUND);

	ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
	    SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
	    SQS_POLL_THR_QUIESCED)));

	cv_signal(&sqp->sq_worker_cv);
	mutex_exit(&sqp->sq_lock);

	/*
	 * Logically free the squeue. It goes back to the set of unused
	 * squeues.
	 */
	mutex_enter(&sqset_lock);
	ip_squeue_set_move(sqp, sqset_global_list[0]);
	mutex_exit(&sqset_lock);

	mutex_enter(&ill->ill_lock);
	rx_ring->rr_ring_state = RR_FREE;
	mutex_exit(&ill->ill_lock);
}

/*
 * Stop the squeue from polling. This needs to be done
 * from inside the perimeter.
 */
void
ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	sqp = rx_ring->rr_sqp;
	mutex_enter(&sqp->sq_lock);
	sqp->sq_state |= SQS_POLL_QUIESCE;
	cv_signal(&sqp->sq_worker_cv);
	while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);

	mutex_exit(&sqp->sq_lock);
}

/*
 * Restart polling. This needs to be done from inside the
 * perimeter to prevent races.
 */
void
ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
{
	squeue_t *sqp;

	ASSERT(ILL_MAC_PERIM_HELD(ill));
	ASSERT(rx_ring != NULL);

	sqp = rx_ring->rr_sqp;
	mutex_enter(&sqp->sq_lock);
	/*
	 * Handle change in number of rings between the quiesce and
	 * restart operations by checking for a previous quiesce before
	 * attempting a restart.
	 */
	if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
		mutex_exit(&sqp->sq_lock);
		return;
	}
	sqp->sq_state |= SQS_POLL_RESTART;
	cv_signal(&sqp->sq_worker_cv);
	while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
		cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
	sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
	mutex_exit(&sqp->sq_lock);
}

/*
 * Sanitize all squeues associated with the ill.
 */
void
ip_squeue_clean_all(ill_t *ill)
{
	int idx;
	ill_rx_ring_t	*rx_ring;

	for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
		rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
		ip_squeue_clean_ring(ill, rx_ring);
	}
}

/*
 * Used by IP to get the squeue associated with a ring. If the ring has no
 * squeue associated with it, fall back to a default squeue for the current
 * CPU.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t	*sqp;

	if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
		return (IP_SQUEUE_GET(lbolt));

	return (sqp);
}

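/*
 * Illustrative note (not code from this file): inbound processing typically
 * looks up the ring's squeue with ip_squeue_get() and then hands the packet
 * chain to squeue_enter() for serialized processing on that squeue.
 */
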
/*
 * Called when a CPU goes offline. Its squeue_set_t is destroyed, and all
 * its squeues are unbound and moved to the unbound set.
 */
static void
ip_squeue_set_destroy(cpu_t *cpu)
{
	int i;
	squeue_t *sqp, *lastsqp = NULL;
	squeue_set_t *sqs, *unbound = sqset_global_list[0];

	mutex_enter(&sqset_lock);
	if ((sqs = cpu->cpu_squeue_set) == NULL) {
		mutex_exit(&sqset_lock);
		return;
	}

	/* Move all squeues to the unbound set */

	for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
		squeue_unbind(sqp);
		sqp->sq_set = unbound;
	}
	if (sqs->sqs_head) {
		lastsqp->sq_next = unbound->sqs_head;
		unbound->sqs_head = sqs->sqs_head;
	}

	/* Also move the default squeue to the unbound set */

	sqp = sqs->sqs_default;
	ASSERT(sqp != NULL);
	ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);

	sqp->sq_next = unbound->sqs_head;
	unbound->sqs_head = sqp;
	squeue_unbind(sqp);
	sqp->sq_set = unbound;

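	/*
	 * Remove this set from the global list: find its slot and swap
	 * the last entry into it, shrinking the list by one.
	 */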
	for (i = 1; i < sqset_global_size; i++)
		if (sqset_global_list[i] == sqs)
			break;

	ASSERT(i < sqset_global_size);
	sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
	sqset_global_list[sqset_global_size - 1] = NULL;
	sqset_global_size--;

	mutex_exit(&sqset_lock);
	kmem_free(sqs, sizeof (*sqs));
}

/*
 * CPU reconfiguration (DR) callback, registered with the DR subsystem in
 * ip_squeue_init() via register_cpu_setup_func().
 */
/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu_get(id);

	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));
		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_destroy(cp);
			cp->cpu_squeue_set = NULL;
		}
		break;
	default:
		break;
	}
	return (0);
}
759