xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 0:68f95e015346)
1*0Sstevel@tonic-gate /*
2*0Sstevel@tonic-gate  * CDDL HEADER START
3*0Sstevel@tonic-gate  *
4*0Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*0Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
6*0Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
7*0Sstevel@tonic-gate  * with the License.
8*0Sstevel@tonic-gate  *
9*0Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10*0Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
11*0Sstevel@tonic-gate  * See the License for the specific language governing permissions
12*0Sstevel@tonic-gate  * and limitations under the License.
13*0Sstevel@tonic-gate  *
14*0Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
15*0Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16*0Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
17*0Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
18*0Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
19*0Sstevel@tonic-gate  *
20*0Sstevel@tonic-gate  * CDDL HEADER END
21*0Sstevel@tonic-gate  */
22*0Sstevel@tonic-gate /*
23*0Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24*0Sstevel@tonic-gate  * Use is subject to license terms.
25*0Sstevel@tonic-gate  */
26*0Sstevel@tonic-gate 
27*0Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
28*0Sstevel@tonic-gate 
29*0Sstevel@tonic-gate /*
30*0Sstevel@tonic-gate  * IP interface to squeues.
31*0Sstevel@tonic-gate  *
32*0Sstevel@tonic-gate  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
33*0Sstevel@tonic-gate  * cpu_squeue field of the cpu structure. Each squeue is associated with a
34*0Sstevel@tonic-gate  * connection instance (conn_t).
35*0Sstevel@tonic-gate  *
36*0Sstevel@tonic-gate  * For CPUs available at system startup time the squeue creation and association
37*0Sstevel@tonic-gate  * with CPU happens at MP initialization time. For CPUs added during dynamic
38*0Sstevel@tonic-gate  * reconfiguration, the initialization happens when the new CPU is configured in
39*0Sstevel@tonic-gate  * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
40*0Sstevel@tonic-gate  * return per-CPU squeue or random squeue based on the ip_squeue_fanout
41*0Sstevel@tonic-gate  * variable.
42*0Sstevel@tonic-gate  *
43*0Sstevel@tonic-gate  * There are two modes of associating connection with squeues. The first mode
44*0Sstevel@tonic-gate  * associates each connection with the CPU that creates the connection (either
45*0Sstevel@tonic-gate  * during open time or during accept time). The second mode associates each
46*0Sstevel@tonic-gate  * connection with a random CPU, effectively distributing load over all CPUs
47*0Sstevel@tonic-gate  * and all squeues in the system. The mode is controlled by the
48*0Sstevel@tonic-gate  * ip_squeue_fanout variable.
49*0Sstevel@tonic-gate  *
50*0Sstevel@tonic-gate  * NOTE: The fact that there is an association between each connection and
51*0Sstevel@tonic-gate  * squeue and squeue and CPU does not mean that each connection is always
52*0Sstevel@tonic-gate  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
53*0Sstevel@tonic-gate  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
54*0Sstevel@tonic-gate  * binding is only relevant for the worker thread.
55*0Sstevel@tonic-gate  *
56*0Sstevel@tonic-gate  * The list of all created squeues is kept in squeue_set structure. This list is
57*0Sstevel@tonic-gate  * used when ip_squeue_fanout is set and the load is distributed across all
58*0Sstevel@tonic-gate  * squeues.
59*0Sstevel@tonic-gate  *
60*0Sstevel@tonic-gate  * INTERFACE:
61*0Sstevel@tonic-gate  *
62*0Sstevel@tonic-gate  * squeue_t *ip_squeue_get(hint)
63*0Sstevel@tonic-gate  *
64*0Sstevel@tonic-gate  * 	Find an squeue based on the 'hint' value. The hint is used as an index
65*0Sstevel@tonic-gate  * 	in the array of IP squeues available. The way hint is computed may
66*0Sstevel@tonic-gate  * 	affect the effectiveness of the squeue distribution. Currently squeues
67*0Sstevel@tonic-gate  * 	are assigned in round-robin fashion using lbolt as a hint.
68*0Sstevel@tonic-gate  *
69*0Sstevel@tonic-gate  *
70*0Sstevel@tonic-gate  * DR Notes
71*0Sstevel@tonic-gate  * ========
72*0Sstevel@tonic-gate  *
73*0Sstevel@tonic-gate  * The ip_squeue_init() registers a call-back function with the CPU DR
74*0Sstevel@tonic-gate  * subsystem using register_cpu_setup_func(). The call-back function does two
75*0Sstevel@tonic-gate  * things:
76*0Sstevel@tonic-gate  *
77*0Sstevel@tonic-gate  * o When the CPU is going off-line or unconfigured, the worker thread is
78*0Sstevel@tonic-gate  *	unbound from the CPU. This allows the CPU unconfig code to move it to
79*0Sstevel@tonic-gate  *	another CPU.
80*0Sstevel@tonic-gate  *
81*0Sstevel@tonic-gate  * o When the CPU is going online, it creates a new squeue for this CPU if
82*0Sstevel@tonic-gate  *	necessary and binds the squeue worker thread to this CPU.
83*0Sstevel@tonic-gate  *
 * TUNABLES:
85*0Sstevel@tonic-gate  *
86*0Sstevel@tonic-gate  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
87*0Sstevel@tonic-gate  * 	associated with an squeue instance.
88*0Sstevel@tonic-gate  *
89*0Sstevel@tonic-gate  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
90*0Sstevel@tonic-gate  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
91*0Sstevel@tonic-gate  *	an impact.
92*0Sstevel@tonic-gate  *
93*0Sstevel@tonic-gate  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
94*0Sstevel@tonic-gate  *	otherwise get it from CPU->cpu_squeue.
95*0Sstevel@tonic-gate  *
96*0Sstevel@tonic-gate  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
97*0Sstevel@tonic-gate  * changed using ndd on /dev/tcp or /dev/ip.
98*0Sstevel@tonic-gate  *
99*0Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
100*0Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
101*0Sstevel@tonic-gate  *	thread after queuing a request.
102*0Sstevel@tonic-gate  */
103*0Sstevel@tonic-gate 
104*0Sstevel@tonic-gate #include <sys/types.h>
105*0Sstevel@tonic-gate #include <sys/debug.h>
106*0Sstevel@tonic-gate #include <sys/kmem.h>
107*0Sstevel@tonic-gate #include <sys/cpuvar.h>
108*0Sstevel@tonic-gate 
109*0Sstevel@tonic-gate #include <sys/cmn_err.h>
110*0Sstevel@tonic-gate 
111*0Sstevel@tonic-gate #include <inet/common.h>
112*0Sstevel@tonic-gate #include <inet/ip.h>
113*0Sstevel@tonic-gate #include <inet/ip_if.h>
114*0Sstevel@tonic-gate #include <inet/mi.h>
115*0Sstevel@tonic-gate #include <inet/nd.h>
116*0Sstevel@tonic-gate #include <inet/ipclassifier.h>
117*0Sstevel@tonic-gate #include <sys/types.h>
118*0Sstevel@tonic-gate #include <sys/conf.h>
119*0Sstevel@tonic-gate #include <sys/sunddi.h>
120*0Sstevel@tonic-gate #include <sys/ddi.h>
121*0Sstevel@tonic-gate #include <sys/squeue_impl.h>
122*0Sstevel@tonic-gate 
123*0Sstevel@tonic-gate 
124*0Sstevel@tonic-gate /*
125*0Sstevel@tonic-gate  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
126*0Sstevel@tonic-gate  * mapping between squeue and NIC (or Rx ring) for performance reasons so
127*0Sstevel@tonic-gate  * each squeue can uniquely own a NIC or a Rx ring and do polling
128*0Sstevel@tonic-gate  * (PSARC 2004/630). So we allow up to  MAX_THREAD_PER_CPU squeues per CPU.
129*0Sstevel@tonic-gate  * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
130*0Sstevel@tonic-gate  * can be created dynamically as needed.
131*0Sstevel@tonic-gate  */
132*0Sstevel@tonic-gate #define	MAX_THREAD_PER_CPU	32
133*0Sstevel@tonic-gate #define	MIN_THREAD_PER_CPU	1
134*0Sstevel@tonic-gate uint_t	ip_threads_per_cpu = MIN_THREAD_PER_CPU;
135*0Sstevel@tonic-gate 
136*0Sstevel@tonic-gate /*
137*0Sstevel@tonic-gate  * List of all created squeue sets. The size is protected by cpu_lock
138*0Sstevel@tonic-gate  */
139*0Sstevel@tonic-gate squeue_set_t	**sqset_global_list;
140*0Sstevel@tonic-gate uint_t		sqset_global_size;
141*0Sstevel@tonic-gate 
142*0Sstevel@tonic-gate int ip_squeue_bind = B_TRUE;
143*0Sstevel@tonic-gate int ip_squeue_profile = B_TRUE;
144*0Sstevel@tonic-gate static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
145*0Sstevel@tonic-gate 
146*0Sstevel@tonic-gate /*
147*0Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
148*0Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
149*0Sstevel@tonic-gate  *	thread after queuing a request.
150*0Sstevel@tonic-gate  */
151*0Sstevel@tonic-gate uint_t ip_squeue_worker_wait = 10;
152*0Sstevel@tonic-gate 
153*0Sstevel@tonic-gate static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
154*0Sstevel@tonic-gate static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
155*0Sstevel@tonic-gate 
156*0Sstevel@tonic-gate static void ip_squeue_set_bind(squeue_set_t *);
157*0Sstevel@tonic-gate static void ip_squeue_set_unbind(squeue_set_t *);
158*0Sstevel@tonic-gate 
/*
 * True when CPU c exists and is active.  The argument is parenthesized so
 * the macro is safe for any expression (the original expanded `c' bare,
 * which misparses for arguments like `cpu + i' or `flag ? a : b').
 */
#define	CPU_ISON(c)	((c) != NULL && CPU_ACTIVE(c) && \
	((c)->cpu_flags & CPU_EXISTS))
160*0Sstevel@tonic-gate 
161*0Sstevel@tonic-gate /*
162*0Sstevel@tonic-gate  * Create squeue set containing ip_threads_per_cpu number of squeues
163*0Sstevel@tonic-gate  * for this CPU and bind them all to the CPU.
164*0Sstevel@tonic-gate  */
165*0Sstevel@tonic-gate static squeue_set_t *
166*0Sstevel@tonic-gate ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
167*0Sstevel@tonic-gate {
168*0Sstevel@tonic-gate 	int i;
169*0Sstevel@tonic-gate 	squeue_set_t	*sqs;
170*0Sstevel@tonic-gate 	squeue_t 	*sqp;
171*0Sstevel@tonic-gate 	char 		sqname[64];
172*0Sstevel@tonic-gate 	processorid_t 	id = cp->cpu_id;
173*0Sstevel@tonic-gate 
174*0Sstevel@tonic-gate 	if (reuse) {
175*0Sstevel@tonic-gate 		int i;
176*0Sstevel@tonic-gate 
177*0Sstevel@tonic-gate 		/*
178*0Sstevel@tonic-gate 		 * We may already have an squeue created for this CPU. Try to
179*0Sstevel@tonic-gate 		 * find one and reuse it if possible.
180*0Sstevel@tonic-gate 		 */
181*0Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
182*0Sstevel@tonic-gate 			sqs = sqset_global_list[i];
183*0Sstevel@tonic-gate 			if (id == sqs->sqs_bind)
184*0Sstevel@tonic-gate 				return (sqs);
185*0Sstevel@tonic-gate 		}
186*0Sstevel@tonic-gate 	}
187*0Sstevel@tonic-gate 
188*0Sstevel@tonic-gate 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
189*0Sstevel@tonic-gate 	    (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP);
190*0Sstevel@tonic-gate 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
191*0Sstevel@tonic-gate 	sqs->sqs_list = (squeue_t **)&sqs[1];
192*0Sstevel@tonic-gate 	sqs->sqs_max_size = MAX_THREAD_PER_CPU;
193*0Sstevel@tonic-gate 	sqs->sqs_bind = id;
194*0Sstevel@tonic-gate 
195*0Sstevel@tonic-gate 	for (i = 0; i < ip_threads_per_cpu; i++) {
196*0Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
197*0Sstevel@tonic-gate 
198*0Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
199*0Sstevel@tonic-gate 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
200*0Sstevel@tonic-gate 		    cp->cpu_id, i);
201*0Sstevel@tonic-gate 
202*0Sstevel@tonic-gate 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
203*0Sstevel@tonic-gate 		    minclsyspri);
204*0Sstevel@tonic-gate 
205*0Sstevel@tonic-gate 		ASSERT(sqp != NULL);
206*0Sstevel@tonic-gate 
207*0Sstevel@tonic-gate 		squeue_profile_enable(sqp);
208*0Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
209*0Sstevel@tonic-gate 
210*0Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
211*0Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
212*0Sstevel@tonic-gate 	}
213*0Sstevel@tonic-gate 
214*0Sstevel@tonic-gate 	if (ip_squeue_bind)
215*0Sstevel@tonic-gate 		ip_squeue_set_bind(sqs);
216*0Sstevel@tonic-gate 
217*0Sstevel@tonic-gate 	sqset_global_list[sqset_global_size++] = sqs;
218*0Sstevel@tonic-gate 	ASSERT(sqset_global_size <= NCPU);
219*0Sstevel@tonic-gate 	return (sqs);
220*0Sstevel@tonic-gate }
221*0Sstevel@tonic-gate 
/*
 * Initialize IP squeues: clamp the per-CPU squeue count to its legal
 * range, allocate the global squeue-set list, create a set for every
 * CPU currently on-line, and register the CPU DR callback.  `callback'
 * (may be NULL) is invoked for every squeue created, now or later.
 * Called once at IP start-up; sqset_global_list must not exist yet.
 */
void
ip_squeue_init(void (*callback)(squeue_t *))
{
	int i;

	ASSERT(sqset_global_list == NULL);

	/* Clamp the tunable into [MIN_THREAD_PER_CPU, MAX_THREAD_PER_CPU]. */
	if (ip_threads_per_cpu < MIN_THREAD_PER_CPU)
		ip_threads_per_cpu = MIN_THREAD_PER_CPU;
	else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
		ip_threads_per_cpu = MAX_THREAD_PER_CPU;

	ip_squeue_create_callback = callback;
	squeue_init();
	/* One global-list slot per possible CPU; size guarded by cpu_lock. */
	sqset_global_list =
	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
	sqset_global_size = 0;
	mutex_enter(&cpu_lock);

	/* Create squeue for each active CPU available */
	for (i = 0; i < NCPU; i++) {
		cpu_t *cp = cpu[i];
		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
		}
	}

	/*
	 * Register for DR events while still holding cpu_lock so no CPU
	 * can come or go between the scan above and the registration.
	 */
	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);

	mutex_exit(&cpu_lock);

	if (ip_squeue_profile)
		squeue_profile_start();
}
259*0Sstevel@tonic-gate 
260*0Sstevel@tonic-gate /*
261*0Sstevel@tonic-gate  * Get squeue_t structure based on index.
262*0Sstevel@tonic-gate  * Since the squeue list can only grow, no need to grab any lock.
263*0Sstevel@tonic-gate  */
264*0Sstevel@tonic-gate squeue_t *
265*0Sstevel@tonic-gate ip_squeue_random(uint_t index)
266*0Sstevel@tonic-gate {
267*0Sstevel@tonic-gate 	squeue_set_t *sqs;
268*0Sstevel@tonic-gate 
269*0Sstevel@tonic-gate 	sqs = sqset_global_list[index % sqset_global_size];
270*0Sstevel@tonic-gate 	return (sqs->sqs_list[index % sqs->sqs_size]);
271*0Sstevel@tonic-gate }
272*0Sstevel@tonic-gate 
273*0Sstevel@tonic-gate /* ARGSUSED */
274*0Sstevel@tonic-gate void
275*0Sstevel@tonic-gate ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
276*0Sstevel@tonic-gate {
277*0Sstevel@tonic-gate 	squeue_t	*sqp = arg2;
278*0Sstevel@tonic-gate 	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
279*0Sstevel@tonic-gate 	ill_t		*ill;
280*0Sstevel@tonic-gate 
281*0Sstevel@tonic-gate 	ASSERT(sqp != NULL);
282*0Sstevel@tonic-gate 
283*0Sstevel@tonic-gate 	if (ring == NULL) {
284*0Sstevel@tonic-gate 		return;
285*0Sstevel@tonic-gate 	}
286*0Sstevel@tonic-gate 
287*0Sstevel@tonic-gate 	/*
288*0Sstevel@tonic-gate 	 * Clean up squeue
289*0Sstevel@tonic-gate 	 */
290*0Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
291*0Sstevel@tonic-gate 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
292*0Sstevel@tonic-gate 	sqp->sq_rx_ring = NULL;
293*0Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
294*0Sstevel@tonic-gate 
295*0Sstevel@tonic-gate 	ill = ring->rr_ill;
296*0Sstevel@tonic-gate 
297*0Sstevel@tonic-gate 	/*
298*0Sstevel@tonic-gate 	 * Cleanup the ring
299*0Sstevel@tonic-gate 	 */
300*0Sstevel@tonic-gate 
301*0Sstevel@tonic-gate 	ring->rr_blank = NULL;
302*0Sstevel@tonic-gate 	ring->rr_handle = NULL;
303*0Sstevel@tonic-gate 	ring->rr_sqp = NULL;
304*0Sstevel@tonic-gate 
305*0Sstevel@tonic-gate 	/*
306*0Sstevel@tonic-gate 	 * Signal ill that cleanup is done
307*0Sstevel@tonic-gate 	 */
308*0Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
309*0Sstevel@tonic-gate 	ring->rr_ring_state = ILL_RING_FREE;
310*0Sstevel@tonic-gate 	cv_signal(&ill->ill_cv);
311*0Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
312*0Sstevel@tonic-gate }
313*0Sstevel@tonic-gate 
/*
 * Argument bundle handed through the system taskq from ip_squeue_get()
 * to ip_squeue_extend().  Allocated with kmem_zalloc by the sender and
 * freed by the receiver.
 */
typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;		/* ill owning the ring; waiter ref held */
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;	/* Rx ring needing an squeue */
	cpu_t		*ip_taskq_cpu;		/* CPU that took the interrupt */
} ip_taskq_arg_t;
319*0Sstevel@tonic-gate 
320*0Sstevel@tonic-gate /*
321*0Sstevel@tonic-gate  * Do a Rx ring to squeue binding. Find a unique squeue that is not
322*0Sstevel@tonic-gate  * managing a receive ring. If no such squeue exists, dynamically
323*0Sstevel@tonic-gate  * create a new one in the squeue set.
324*0Sstevel@tonic-gate  *
325*0Sstevel@tonic-gate  * The function runs via the system taskq. The ill passed as an
326*0Sstevel@tonic-gate  * argument can't go away since we hold a ref. The lock order is
327*0Sstevel@tonic-gate  * ill_lock -> sqs_lock -> sq_lock.
328*0Sstevel@tonic-gate  *
329*0Sstevel@tonic-gate  * If we are binding a Rx ring to a squeue attached to the offline CPU,
330*0Sstevel@tonic-gate  * no need to check that because squeues are never destroyed once
331*0Sstevel@tonic-gate  * created.
332*0Sstevel@tonic-gate  */
333*0Sstevel@tonic-gate /* ARGSUSED */
334*0Sstevel@tonic-gate static void
335*0Sstevel@tonic-gate ip_squeue_extend(void *arg)
336*0Sstevel@tonic-gate {
337*0Sstevel@tonic-gate 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
338*0Sstevel@tonic-gate 	ill_t		*ill = sq_arg->ip_taskq_ill;
339*0Sstevel@tonic-gate 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
340*0Sstevel@tonic-gate 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
341*0Sstevel@tonic-gate 	squeue_set_t *sqs;
342*0Sstevel@tonic-gate 	squeue_t 	*sqp = NULL;
343*0Sstevel@tonic-gate 	char		sqname[64];
344*0Sstevel@tonic-gate 	int		i;
345*0Sstevel@tonic-gate 
346*0Sstevel@tonic-gate 	ASSERT(ill != NULL);
347*0Sstevel@tonic-gate 	ASSERT(ill_rx_ring != NULL);
348*0Sstevel@tonic-gate 	kmem_free(arg, sizeof (ip_taskq_arg_t));
349*0Sstevel@tonic-gate 
350*0Sstevel@tonic-gate 	sqs = intr_cpu->cpu_squeue_set;
351*0Sstevel@tonic-gate 
352*0Sstevel@tonic-gate 	/*
353*0Sstevel@tonic-gate 	 * If this ill represents link aggregation, then there might be
354*0Sstevel@tonic-gate 	 * multiple NICs trying to register them selves at the same time
355*0Sstevel@tonic-gate 	 * and in order to ensure that test and assignment of free rings
356*0Sstevel@tonic-gate 	 * is sequential, we need to hold the ill_lock.
357*0Sstevel@tonic-gate 	 */
358*0Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
359*0Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
360*0Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
361*0Sstevel@tonic-gate 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
362*0Sstevel@tonic-gate 		if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) {
363*0Sstevel@tonic-gate 			sqp = sqs->sqs_list[i];
364*0Sstevel@tonic-gate 			break;
365*0Sstevel@tonic-gate 		}
366*0Sstevel@tonic-gate 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
367*0Sstevel@tonic-gate 	}
368*0Sstevel@tonic-gate 
369*0Sstevel@tonic-gate 	if (sqp == NULL) {
370*0Sstevel@tonic-gate 		/* Need to create a new squeue */
371*0Sstevel@tonic-gate 		if (sqs->sqs_size == sqs->sqs_max_size) {
372*0Sstevel@tonic-gate 			/*
373*0Sstevel@tonic-gate 			 * Reached the max limit for squeue
374*0Sstevel@tonic-gate 			 * we can allocate on this CPU. Leave
375*0Sstevel@tonic-gate 			 * ill_ring_state set to ILL_RING_INPROC
376*0Sstevel@tonic-gate 			 * so that ip_squeue_direct will just
377*0Sstevel@tonic-gate 			 * assign the default squeue for this
378*0Sstevel@tonic-gate 			 * ring for future connections.
379*0Sstevel@tonic-gate 			 */
380*0Sstevel@tonic-gate #ifdef DEBUG
381*0Sstevel@tonic-gate 			cmn_err(CE_NOTE, "ip_squeue_add: Reached max "
382*0Sstevel@tonic-gate 			    " threads per CPU for sqp = %p\n", (void *)sqp);
383*0Sstevel@tonic-gate #endif
384*0Sstevel@tonic-gate 			mutex_exit(&sqs->sqs_lock);
385*0Sstevel@tonic-gate 			mutex_exit(&ill->ill_lock);
386*0Sstevel@tonic-gate 			ill_waiter_dcr(ill);
387*0Sstevel@tonic-gate 			return;
388*0Sstevel@tonic-gate 		}
389*0Sstevel@tonic-gate 
390*0Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
391*0Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
392*0Sstevel@tonic-gate 		    "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid,
393*0Sstevel@tonic-gate 		    CPU->cpu_id, sqs->sqs_size);
394*0Sstevel@tonic-gate 
395*0Sstevel@tonic-gate 		sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait,
396*0Sstevel@tonic-gate 		    minclsyspri);
397*0Sstevel@tonic-gate 
398*0Sstevel@tonic-gate 		ASSERT(sqp != NULL);
399*0Sstevel@tonic-gate 
400*0Sstevel@tonic-gate 		squeue_profile_enable(sqp);
401*0Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
402*0Sstevel@tonic-gate 
403*0Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
404*0Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
405*0Sstevel@tonic-gate 
406*0Sstevel@tonic-gate 		if (ip_squeue_bind) {
407*0Sstevel@tonic-gate 			squeue_bind(sqp, -1);
408*0Sstevel@tonic-gate 		}
409*0Sstevel@tonic-gate 		mutex_enter(&sqp->sq_lock);
410*0Sstevel@tonic-gate 	}
411*0Sstevel@tonic-gate 
412*0Sstevel@tonic-gate 	ASSERT(sqp != NULL);
413*0Sstevel@tonic-gate 
414*0Sstevel@tonic-gate 	sqp->sq_rx_ring = ill_rx_ring;
415*0Sstevel@tonic-gate 	ill_rx_ring->rr_sqp = sqp;
416*0Sstevel@tonic-gate 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
417*0Sstevel@tonic-gate 
418*0Sstevel@tonic-gate 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
419*0Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
420*0Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
421*0Sstevel@tonic-gate 
422*0Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
423*0Sstevel@tonic-gate 
424*0Sstevel@tonic-gate 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
425*0Sstevel@tonic-gate 	ill_waiter_dcr(ill);
426*0Sstevel@tonic-gate }
427*0Sstevel@tonic-gate 
/*
 * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
 * owned by a squeue yet, do the assignment. When the NIC registers its
 * Rx rings with IP, we don't know where the interrupts will land and
 * hence we need to wait till this point to do the assignment.
 */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t 	*sqp;
	ill_t 		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	/* No ring info at all: fall back to the default squeue choice. */
	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed so worse we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	/* KM_NOSLEEP: we may be in interrupt context here. */
	interrupt = servicing_interrupt();
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
		taskq_arg == NULL) {
		/*
		 * Do the ring to squeue binding only if we are in interrupt
		 * context and there is no one else trying the bind already.
		 * (rr_ring_state != ILL_RING_INUSE means a bind is already
		 * in progress or the ring is not usable.)
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. Can't really do that in interrupt
	 * context. Assign the default sqp to this connection and
	 * trigger creation of new sqp and binding it to this ring
	 * via taskq. Need to make sure ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	/* Claim the ring so concurrent callers take the early-out above. */
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			/* ip_squeue_extend now owns taskq_arg and the ref. */
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill OR
	 * taskq_dispatch failed probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}
509*0Sstevel@tonic-gate 
510*0Sstevel@tonic-gate /*
511*0Sstevel@tonic-gate  * NDD hooks for setting ip_squeue_xxx tuneables.
512*0Sstevel@tonic-gate  */
513*0Sstevel@tonic-gate 
514*0Sstevel@tonic-gate /* ARGSUSED */
515*0Sstevel@tonic-gate int
516*0Sstevel@tonic-gate ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
517*0Sstevel@tonic-gate     caddr_t addr, cred_t *cr)
518*0Sstevel@tonic-gate {
519*0Sstevel@tonic-gate 	int *bind_enabled = (int *)addr;
520*0Sstevel@tonic-gate 	long new_value;
521*0Sstevel@tonic-gate 	int i;
522*0Sstevel@tonic-gate 
523*0Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
524*0Sstevel@tonic-gate 		return (EINVAL);
525*0Sstevel@tonic-gate 
526*0Sstevel@tonic-gate 	if (ip_squeue_bind == new_value)
527*0Sstevel@tonic-gate 		return (0);
528*0Sstevel@tonic-gate 
529*0Sstevel@tonic-gate 	*bind_enabled = new_value;
530*0Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
531*0Sstevel@tonic-gate 	if (new_value == 0) {
532*0Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
533*0Sstevel@tonic-gate 			ip_squeue_set_unbind(sqset_global_list[i]);
534*0Sstevel@tonic-gate 	} else {
535*0Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
536*0Sstevel@tonic-gate 			ip_squeue_set_bind(sqset_global_list[i]);
537*0Sstevel@tonic-gate 	}
538*0Sstevel@tonic-gate 
539*0Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
540*0Sstevel@tonic-gate 	return (0);
541*0Sstevel@tonic-gate }
542*0Sstevel@tonic-gate 
543*0Sstevel@tonic-gate /*
544*0Sstevel@tonic-gate  * Set squeue profiling.
545*0Sstevel@tonic-gate  * 0 means "disable"
546*0Sstevel@tonic-gate  * 1 means "enable"
547*0Sstevel@tonic-gate  * 2 means "enable and reset"
548*0Sstevel@tonic-gate  */
549*0Sstevel@tonic-gate /* ARGSUSED */
550*0Sstevel@tonic-gate int
551*0Sstevel@tonic-gate ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
552*0Sstevel@tonic-gate     cred_t *cr)
553*0Sstevel@tonic-gate {
554*0Sstevel@tonic-gate 	int *profile_enabled = (int *)cp;
555*0Sstevel@tonic-gate 	long new_value;
556*0Sstevel@tonic-gate 	squeue_set_t *sqs;
557*0Sstevel@tonic-gate 
558*0Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
559*0Sstevel@tonic-gate 		return (EINVAL);
560*0Sstevel@tonic-gate 
561*0Sstevel@tonic-gate 	if (new_value == 0)
562*0Sstevel@tonic-gate 		squeue_profile_stop();
563*0Sstevel@tonic-gate 	else if (new_value == 1)
564*0Sstevel@tonic-gate 		squeue_profile_start();
565*0Sstevel@tonic-gate 	else if (new_value == 2) {
566*0Sstevel@tonic-gate 		int i, j;
567*0Sstevel@tonic-gate 
568*0Sstevel@tonic-gate 		squeue_profile_stop();
569*0Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
570*0Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
571*0Sstevel@tonic-gate 			sqs = sqset_global_list[i];
572*0Sstevel@tonic-gate 			for (j = 0; j < sqs->sqs_size; j++) {
573*0Sstevel@tonic-gate 				squeue_profile_reset(sqs->sqs_list[j]);
574*0Sstevel@tonic-gate 			}
575*0Sstevel@tonic-gate 		}
576*0Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
577*0Sstevel@tonic-gate 
578*0Sstevel@tonic-gate 		new_value = 1;
579*0Sstevel@tonic-gate 		squeue_profile_start();
580*0Sstevel@tonic-gate 	}
581*0Sstevel@tonic-gate 	*profile_enabled = new_value;
582*0Sstevel@tonic-gate 
583*0Sstevel@tonic-gate 	return (0);
584*0Sstevel@tonic-gate }
585*0Sstevel@tonic-gate 
586*0Sstevel@tonic-gate /*
587*0Sstevel@tonic-gate  * Reconfiguration callback
588*0Sstevel@tonic-gate  */
589*0Sstevel@tonic-gate 
590*0Sstevel@tonic-gate /* ARGSUSED */
591*0Sstevel@tonic-gate static int
592*0Sstevel@tonic-gate ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
593*0Sstevel@tonic-gate {
594*0Sstevel@tonic-gate 	cpu_t *cp = cpu[id];
595*0Sstevel@tonic-gate 
596*0Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
597*0Sstevel@tonic-gate 	switch (what) {
598*0Sstevel@tonic-gate 	case CPU_ON:
599*0Sstevel@tonic-gate 	case CPU_INIT:
600*0Sstevel@tonic-gate 	case CPU_CPUPART_IN:
601*0Sstevel@tonic-gate 		if (cp->cpu_squeue_set == NULL) {
602*0Sstevel@tonic-gate 			/* New CPU! */
603*0Sstevel@tonic-gate 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
604*0Sstevel@tonic-gate 		}
605*0Sstevel@tonic-gate 		if (ip_squeue_bind)
606*0Sstevel@tonic-gate 			ip_squeue_set_bind(cp->cpu_squeue_set);
607*0Sstevel@tonic-gate 		break;
608*0Sstevel@tonic-gate 	case CPU_UNCONFIG:
609*0Sstevel@tonic-gate 	case CPU_OFF:
610*0Sstevel@tonic-gate 	case CPU_CPUPART_OUT:
611*0Sstevel@tonic-gate 		ASSERT((cp->cpu_squeue_set != NULL) ||
612*0Sstevel@tonic-gate 		    (cp->cpu_flags & CPU_OFFLINE));
613*0Sstevel@tonic-gate 
614*0Sstevel@tonic-gate 		if (cp->cpu_squeue_set != NULL) {
615*0Sstevel@tonic-gate 			ip_squeue_set_unbind(cp->cpu_squeue_set);
616*0Sstevel@tonic-gate 		}
617*0Sstevel@tonic-gate 		break;
618*0Sstevel@tonic-gate 	default:
619*0Sstevel@tonic-gate 		break;
620*0Sstevel@tonic-gate 	}
621*0Sstevel@tonic-gate 	return (0);
622*0Sstevel@tonic-gate }
623*0Sstevel@tonic-gate 
624*0Sstevel@tonic-gate /* ARGSUSED */
625*0Sstevel@tonic-gate static void
626*0Sstevel@tonic-gate ip_squeue_set_bind(squeue_set_t *sqs)
627*0Sstevel@tonic-gate {
628*0Sstevel@tonic-gate 	int i;
629*0Sstevel@tonic-gate 	squeue_t *sqp;
630*0Sstevel@tonic-gate 
631*0Sstevel@tonic-gate 	if (!ip_squeue_bind)
632*0Sstevel@tonic-gate 		return;
633*0Sstevel@tonic-gate 
634*0Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
635*0Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
636*0Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
637*0Sstevel@tonic-gate 		if (sqp->sq_state & SQS_BOUND)
638*0Sstevel@tonic-gate 			continue;
639*0Sstevel@tonic-gate 		squeue_bind(sqp, -1);
640*0Sstevel@tonic-gate 	}
641*0Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
642*0Sstevel@tonic-gate }
643*0Sstevel@tonic-gate 
644*0Sstevel@tonic-gate static void
645*0Sstevel@tonic-gate ip_squeue_set_unbind(squeue_set_t *sqs)
646*0Sstevel@tonic-gate {
647*0Sstevel@tonic-gate 	int i;
648*0Sstevel@tonic-gate 	squeue_t *sqp;
649*0Sstevel@tonic-gate 
650*0Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
651*0Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
652*0Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
653*0Sstevel@tonic-gate 		if (!(sqp->sq_state & SQS_BOUND))
654*0Sstevel@tonic-gate 			continue;
655*0Sstevel@tonic-gate 		squeue_unbind(sqp);
656*0Sstevel@tonic-gate 	}
657*0Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
658*0Sstevel@tonic-gate }
659