xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 1184:1c788f55a808)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
50Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
60Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
70Sstevel@tonic-gate  * with the License.
80Sstevel@tonic-gate  *
90Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
100Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
110Sstevel@tonic-gate  * See the License for the specific language governing permissions
120Sstevel@tonic-gate  * and limitations under the License.
130Sstevel@tonic-gate  *
140Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
150Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
160Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
170Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
180Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
190Sstevel@tonic-gate  *
200Sstevel@tonic-gate  * CDDL HEADER END
210Sstevel@tonic-gate  */
220Sstevel@tonic-gate /*
230Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
240Sstevel@tonic-gate  * Use is subject to license terms.
250Sstevel@tonic-gate  */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * IP interface to squeues.
310Sstevel@tonic-gate  *
320Sstevel@tonic-gate  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
330Sstevel@tonic-gate  * cpu_squeue field of the cpu structure. Each squeue is associated with a
340Sstevel@tonic-gate  * connection instance (conn_t).
350Sstevel@tonic-gate  *
360Sstevel@tonic-gate  * For CPUs available at system startup time, squeue creation and association
370Sstevel@tonic-gate  * with the CPU happen at MP initialization time. For CPUs added during dynamic
380Sstevel@tonic-gate  * reconfiguration, the initialization happens when the new CPU is configured in
390Sstevel@tonic-gate  * the system. The squeue is chosen using the IP_SQUEUE_GET macro, which will
400Sstevel@tonic-gate  * either return the per-CPU squeue or a random squeue based on the
410Sstevel@tonic-gate  * ip_squeue_fanout variable.
420Sstevel@tonic-gate  *
430Sstevel@tonic-gate  * There are two modes of associating connection with squeues. The first mode
440Sstevel@tonic-gate  * associates each connection with the CPU that creates the connection (either
450Sstevel@tonic-gate  * during open time or during accept time). The second mode associates each
460Sstevel@tonic-gate  * connection with a random CPU, effectively distributing load over all CPUs
470Sstevel@tonic-gate  * and all squeues in the system. The mode is controlled by the
480Sstevel@tonic-gate  * ip_squeue_fanout variable.
490Sstevel@tonic-gate  *
500Sstevel@tonic-gate  * NOTE: The fact that there is an association between each connection and an
510Sstevel@tonic-gate  * squeue, and between each squeue and a CPU, does not mean that each connection
520Sstevel@tonic-gate  * is always processed on this CPU and on this CPU only. Any thread calling
530Sstevel@tonic-gate  * squeue_enter() may process the connection on whatever CPU it is scheduled on.
540Sstevel@tonic-gate  * The squeue to CPU binding is only relevant for the worker thread.
550Sstevel@tonic-gate  *
560Sstevel@tonic-gate  * The list of all created squeues is kept in squeue_set structures. This list is
570Sstevel@tonic-gate  * used when ip_squeue_fanout is set and the load is distributed across all
580Sstevel@tonic-gate  * squeues.
590Sstevel@tonic-gate  *
600Sstevel@tonic-gate  * INTERFACE:
610Sstevel@tonic-gate  *
620Sstevel@tonic-gate  * squeue_t *ip_squeue_get(hint)
630Sstevel@tonic-gate  *
640Sstevel@tonic-gate  * 	Find an squeue based on the 'hint' value. The hint is used as an index
650Sstevel@tonic-gate  * 	in the array of IP squeues available. The way hint is computed may
660Sstevel@tonic-gate  * 	affect the effectiveness of the squeue distribution. Currently squeues
670Sstevel@tonic-gate  * 	are assigned in round-robin fashion using lbolt as a hint.
680Sstevel@tonic-gate  *
690Sstevel@tonic-gate  *
700Sstevel@tonic-gate  * DR Notes
710Sstevel@tonic-gate  * ========
720Sstevel@tonic-gate  *
730Sstevel@tonic-gate  * ip_squeue_init() registers a callback function with the CPU DR
740Sstevel@tonic-gate  * subsystem using register_cpu_setup_func(). The callback function does two
750Sstevel@tonic-gate  * things:
760Sstevel@tonic-gate  *
770Sstevel@tonic-gate  * o When the CPU is going off-line or unconfigured, the worker thread is
780Sstevel@tonic-gate  *	unbound from the CPU. This allows the CPU unconfig code to move it to
790Sstevel@tonic-gate  *	another CPU.
800Sstevel@tonic-gate  *
810Sstevel@tonic-gate  * o When the CPU is going online, it creates a new squeue for this CPU if
820Sstevel@tonic-gate  *	necessary and binds the squeue worker thread to this CPU.
830Sstevel@tonic-gate  *
840Sstevel@tonic-gate  * TUNABLES:
850Sstevel@tonic-gate  *
860Sstevel@tonic-gate  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
870Sstevel@tonic-gate  * 	associated with an squeue instance.
880Sstevel@tonic-gate  *
890Sstevel@tonic-gate  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
900Sstevel@tonic-gate  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
910Sstevel@tonic-gate  *	an impact.
920Sstevel@tonic-gate  *
930Sstevel@tonic-gate  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
940Sstevel@tonic-gate  *	otherwise get it from CPU->cpu_squeue.
950Sstevel@tonic-gate  *
960Sstevel@tonic-gate  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
970Sstevel@tonic-gate  * changed using ndd on /dev/tcp or /dev/ip.
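 *	For example (illustrative usage only; either device node works as
 *	described above):
 *		ndd -set /dev/ip ip_squeue_fanout 1
 *		ndd -get /dev/tcp ip_squeue_bind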
980Sstevel@tonic-gate  *
990Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
1000Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
1010Sstevel@tonic-gate  *	thread after queuing a request.
1020Sstevel@tonic-gate  */
1030Sstevel@tonic-gate 
1040Sstevel@tonic-gate #include <sys/types.h>
1050Sstevel@tonic-gate #include <sys/debug.h>
1060Sstevel@tonic-gate #include <sys/kmem.h>
1070Sstevel@tonic-gate #include <sys/cpuvar.h>
1080Sstevel@tonic-gate 
1090Sstevel@tonic-gate #include <sys/cmn_err.h>
1100Sstevel@tonic-gate 
1110Sstevel@tonic-gate #include <inet/common.h>
1120Sstevel@tonic-gate #include <inet/ip.h>
1130Sstevel@tonic-gate #include <inet/ip_if.h>
1140Sstevel@tonic-gate #include <inet/mi.h>
1150Sstevel@tonic-gate #include <inet/nd.h>
1160Sstevel@tonic-gate #include <inet/ipclassifier.h>
1170Sstevel@tonic-gate #include <sys/types.h>
1180Sstevel@tonic-gate #include <sys/conf.h>
1190Sstevel@tonic-gate #include <sys/sunddi.h>
1200Sstevel@tonic-gate #include <sys/ddi.h>
1210Sstevel@tonic-gate #include <sys/squeue_impl.h>
1220Sstevel@tonic-gate 
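/*
 * Illustrative sketch only (not compiled): the squeue selection policy
 * described in the block comment above. The real IP_SQUEUE_GET() macro is
 * defined elsewhere in the IP headers; this sketch merely assumes it reduces
 * to a choice between the current CPU's squeue and a randomly picked one.
 */
#if 0
static squeue_t *
ip_squeue_get_sketch(uint_t hint)
{
	if (!ip_squeue_fanout)
		return (CPU->cpu_squeue);	/* squeue of the current CPU */
	return (ip_squeue_random(hint));	/* spread over all squeues */
}
#endif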
1230Sstevel@tonic-gate 
1240Sstevel@tonic-gate /*
1250Sstevel@tonic-gate  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
1260Sstevel@tonic-gate  * mapping between squeue and NIC (or Rx ring) for performance reasons so
1270Sstevel@tonic-gate  * each squeue can uniquely own a NIC or a Rx ring and do polling
128*1184Skrgopi  * (PSARC 2004/630). So we allow up to MAX_SQUEUES_PER_CPU squeues per CPU.
129*1184Skrgopi  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
1300Sstevel@tonic-gate  * can be created dynamically as needed.
1310Sstevel@tonic-gate  */
132*1184Skrgopi #define	MAX_SQUEUES_PER_CPU	32
133*1184Skrgopi #define	MIN_SQUEUES_PER_CPU	1
134*1184Skrgopi uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
135*1184Skrgopi 
136*1184Skrgopi #define	IP_NUM_SOFT_RINGS	2
137*1184Skrgopi uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
1380Sstevel@tonic-gate 
1390Sstevel@tonic-gate /*
1400Sstevel@tonic-gate  * List of all created squeue sets. The size is protected by cpu_lock
1410Sstevel@tonic-gate  */
1420Sstevel@tonic-gate squeue_set_t	**sqset_global_list;
1430Sstevel@tonic-gate uint_t		sqset_global_size;
1440Sstevel@tonic-gate 
1450Sstevel@tonic-gate int ip_squeue_bind = B_TRUE;
1460Sstevel@tonic-gate int ip_squeue_profile = B_TRUE;
1470Sstevel@tonic-gate static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
1480Sstevel@tonic-gate 
1490Sstevel@tonic-gate /*
1500Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
1510Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
1520Sstevel@tonic-gate  *	thread after queuing a request.
1530Sstevel@tonic-gate  */
1540Sstevel@tonic-gate uint_t ip_squeue_worker_wait = 10;
1550Sstevel@tonic-gate 
1560Sstevel@tonic-gate static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
1570Sstevel@tonic-gate static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
1580Sstevel@tonic-gate 
1590Sstevel@tonic-gate static void ip_squeue_set_bind(squeue_set_t *);
1600Sstevel@tonic-gate static void ip_squeue_set_unbind(squeue_set_t *);
161*1184Skrgopi static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
1620Sstevel@tonic-gate 
1630Sstevel@tonic-gate #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
1640Sstevel@tonic-gate 
1650Sstevel@tonic-gate /*
166*1184Skrgopi  * Create an squeue set containing ip_squeues_per_cpu squeues
1670Sstevel@tonic-gate  * for this CPU and bind them all to the CPU.
1680Sstevel@tonic-gate  */
1690Sstevel@tonic-gate static squeue_set_t *
1700Sstevel@tonic-gate ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
1710Sstevel@tonic-gate {
1720Sstevel@tonic-gate 	int i;
1730Sstevel@tonic-gate 	squeue_set_t	*sqs;
1740Sstevel@tonic-gate 	squeue_t 	*sqp;
1750Sstevel@tonic-gate 	char 		sqname[64];
1760Sstevel@tonic-gate 	processorid_t 	id = cp->cpu_id;
1770Sstevel@tonic-gate 
1780Sstevel@tonic-gate 	if (reuse) {
1790Sstevel@tonic-gate 		int i;
1800Sstevel@tonic-gate 
1810Sstevel@tonic-gate 		/*
1820Sstevel@tonic-gate 		 * We may already have an squeue created for this CPU. Try to
1830Sstevel@tonic-gate 		 * find one and reuse it if possible.
1840Sstevel@tonic-gate 		 */
1850Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
1860Sstevel@tonic-gate 			sqs = sqset_global_list[i];
1870Sstevel@tonic-gate 			if (id == sqs->sqs_bind)
1880Sstevel@tonic-gate 				return (sqs);
1890Sstevel@tonic-gate 		}
1900Sstevel@tonic-gate 	}
1910Sstevel@tonic-gate 
1920Sstevel@tonic-gate 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
193*1184Skrgopi 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
1940Sstevel@tonic-gate 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
1950Sstevel@tonic-gate 	sqs->sqs_list = (squeue_t **)&sqs[1];
196*1184Skrgopi 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
1970Sstevel@tonic-gate 	sqs->sqs_bind = id;
1980Sstevel@tonic-gate 
199*1184Skrgopi 	for (i = 0; i < ip_squeues_per_cpu; i++) {
2000Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
2010Sstevel@tonic-gate 
2020Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
2030Sstevel@tonic-gate 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
2040Sstevel@tonic-gate 		    cp->cpu_id, i);
2050Sstevel@tonic-gate 
2060Sstevel@tonic-gate 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
2070Sstevel@tonic-gate 		    minclsyspri);
2080Sstevel@tonic-gate 
209*1184Skrgopi 		ASSERT(sqp != NULL);
210*1184Skrgopi 
211*1184Skrgopi 		/*
212*1184Skrgopi 		 * The first squeue in each squeue_set is the DEFAULT
213*1184Skrgopi 		 * squeue.
214*1184Skrgopi 		 */
2150Sstevel@tonic-gate 		sqp->sq_state |= SQS_DEFAULT;
2160Sstevel@tonic-gate 
2170Sstevel@tonic-gate 		squeue_profile_enable(sqp);
2180Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
2190Sstevel@tonic-gate 
2200Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
2210Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
2220Sstevel@tonic-gate 	}
2230Sstevel@tonic-gate 
224405Sakolb 	if (ip_squeue_bind && cpu_is_online(cp))
2250Sstevel@tonic-gate 		ip_squeue_set_bind(sqs);
2260Sstevel@tonic-gate 
2270Sstevel@tonic-gate 	sqset_global_list[sqset_global_size++] = sqs;
2280Sstevel@tonic-gate 	ASSERT(sqset_global_size <= NCPU);
2290Sstevel@tonic-gate 	return (sqs);
2300Sstevel@tonic-gate }
2310Sstevel@tonic-gate 
2320Sstevel@tonic-gate /*
2330Sstevel@tonic-gate  * Initialize IP squeues.
2340Sstevel@tonic-gate  */
2350Sstevel@tonic-gate void
2360Sstevel@tonic-gate ip_squeue_init(void (*callback)(squeue_t *))
2370Sstevel@tonic-gate {
2380Sstevel@tonic-gate 	int i;
2390Sstevel@tonic-gate 
2400Sstevel@tonic-gate 	ASSERT(sqset_global_list == NULL);
2410Sstevel@tonic-gate 
242*1184Skrgopi 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
243*1184Skrgopi 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
244*1184Skrgopi 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
245*1184Skrgopi 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
2460Sstevel@tonic-gate 
2470Sstevel@tonic-gate 	ip_squeue_create_callback = callback;
2480Sstevel@tonic-gate 	squeue_init();
2490Sstevel@tonic-gate 	sqset_global_list =
2500Sstevel@tonic-gate 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
2510Sstevel@tonic-gate 	sqset_global_size = 0;
2520Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
2530Sstevel@tonic-gate 
2540Sstevel@tonic-gate 	/* Create squeue for each active CPU available */
2550Sstevel@tonic-gate 	for (i = 0; i < NCPU; i++) {
2560Sstevel@tonic-gate 		cpu_t *cp = cpu[i];
2570Sstevel@tonic-gate 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
2580Sstevel@tonic-gate 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
2590Sstevel@tonic-gate 		}
2600Sstevel@tonic-gate 	}
2610Sstevel@tonic-gate 
2620Sstevel@tonic-gate 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
2630Sstevel@tonic-gate 
2640Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
2650Sstevel@tonic-gate 
2660Sstevel@tonic-gate 	if (ip_squeue_profile)
2670Sstevel@tonic-gate 		squeue_profile_start();
2680Sstevel@tonic-gate }
2690Sstevel@tonic-gate 
2700Sstevel@tonic-gate /*
2710Sstevel@tonic-gate  * Get squeue_t structure based on index.
2720Sstevel@tonic-gate  * Since the squeue list can only grow, no need to grab any lock.
2730Sstevel@tonic-gate  */
2740Sstevel@tonic-gate squeue_t *
2750Sstevel@tonic-gate ip_squeue_random(uint_t index)
2760Sstevel@tonic-gate {
2770Sstevel@tonic-gate 	squeue_set_t *sqs;
2780Sstevel@tonic-gate 
2790Sstevel@tonic-gate 	sqs = sqset_global_list[index % sqset_global_size];
2800Sstevel@tonic-gate 	return (sqs->sqs_list[index % sqs->sqs_size]);
2810Sstevel@tonic-gate }
2820Sstevel@tonic-gate 
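/*
 * ip_squeue_clean() dissociates an squeue (passed as arg2) from the Rx ring
 * it manages: it clears the SQS_ILL_BOUND/SQS_POLL_CAPAB state, unbinds the
 * soft ring if the ill has the soft ring capability, resets the ring entry
 * and finally signals the ill that the ring is free.
 */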
2830Sstevel@tonic-gate /* ARGSUSED */
2840Sstevel@tonic-gate void
2850Sstevel@tonic-gate ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
2860Sstevel@tonic-gate {
2870Sstevel@tonic-gate 	squeue_t	*sqp = arg2;
2880Sstevel@tonic-gate 	ill_rx_ring_t	*ring;
2890Sstevel@tonic-gate 	ill_t		*ill;
2900Sstevel@tonic-gate 
2910Sstevel@tonic-gate 	ASSERT(sqp != NULL);
2920Sstevel@tonic-gate 	ring = sqp->sq_rx_ring;
2930Sstevel@tonic-gate 	if (ring == NULL) {
2940Sstevel@tonic-gate 		return;
2950Sstevel@tonic-gate 	}
2960Sstevel@tonic-gate 
2970Sstevel@tonic-gate 	/*
2980Sstevel@tonic-gate 	 * Clean up squeue
2990Sstevel@tonic-gate 	 */
3000Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
3010Sstevel@tonic-gate 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
3020Sstevel@tonic-gate 	sqp->sq_rx_ring = NULL;
3030Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
3040Sstevel@tonic-gate 
3050Sstevel@tonic-gate 	ill = ring->rr_ill;
306*1184Skrgopi 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
307*1184Skrgopi 		ASSERT(ring->rr_handle != NULL);
308*1184Skrgopi 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
309*1184Skrgopi 	}
3100Sstevel@tonic-gate 
3110Sstevel@tonic-gate 	/*
3120Sstevel@tonic-gate 	 * Cleanup the ring
3130Sstevel@tonic-gate 	 */
3140Sstevel@tonic-gate 
3150Sstevel@tonic-gate 	ring->rr_blank = NULL;
3160Sstevel@tonic-gate 	ring->rr_handle = NULL;
3170Sstevel@tonic-gate 	ring->rr_sqp = NULL;
3180Sstevel@tonic-gate 
3190Sstevel@tonic-gate 	/*
3200Sstevel@tonic-gate 	 * Signal ill that cleanup is done
3210Sstevel@tonic-gate 	 */
3220Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
3230Sstevel@tonic-gate 	ring->rr_ring_state = ILL_RING_FREE;
3240Sstevel@tonic-gate 	cv_signal(&ill->ill_cv);
3250Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
3260Sstevel@tonic-gate }
3270Sstevel@tonic-gate 
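/*
 * Argument block passed through the system taskq to ip_squeue_extend() and
 * ip_squeue_soft_ring_affinity() below.
 */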
3280Sstevel@tonic-gate typedef struct ip_taskq_arg {
3290Sstevel@tonic-gate 	ill_t		*ip_taskq_ill;
3300Sstevel@tonic-gate 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
3310Sstevel@tonic-gate 	cpu_t		*ip_taskq_cpu;
3320Sstevel@tonic-gate } ip_taskq_arg_t;
3330Sstevel@tonic-gate 
3340Sstevel@tonic-gate /*
3350Sstevel@tonic-gate  * Do a Rx ring to squeue binding. Find a unique squeue that is not
3360Sstevel@tonic-gate  * managing a receive ring. If no such squeue exists, dynamically
3370Sstevel@tonic-gate  * create a new one in the squeue set.
3380Sstevel@tonic-gate  *
3390Sstevel@tonic-gate  * The function runs via the system taskq. The ill passed as an
3400Sstevel@tonic-gate  * argument can't go away since we hold a ref. The lock order is
3410Sstevel@tonic-gate  * ill_lock -> sqs_lock -> sq_lock.
3420Sstevel@tonic-gate  *
3430Sstevel@tonic-gate  * If we are binding a Rx ring to an squeue attached to an offline CPU,
3440Sstevel@tonic-gate  * there is no need to check for that because squeues are never destroyed
3450Sstevel@tonic-gate  * once created.
3460Sstevel@tonic-gate  */
3470Sstevel@tonic-gate /* ARGSUSED */
3480Sstevel@tonic-gate static void
3490Sstevel@tonic-gate ip_squeue_extend(void *arg)
3500Sstevel@tonic-gate {
3510Sstevel@tonic-gate 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
3520Sstevel@tonic-gate 	ill_t		*ill = sq_arg->ip_taskq_ill;
3530Sstevel@tonic-gate 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
3540Sstevel@tonic-gate 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
355*1184Skrgopi 	squeue_set_t 	*sqs;
3560Sstevel@tonic-gate 	squeue_t 	*sqp = NULL;
3570Sstevel@tonic-gate 
3580Sstevel@tonic-gate 	ASSERT(ill != NULL);
3590Sstevel@tonic-gate 	ASSERT(ill_rx_ring != NULL);
3600Sstevel@tonic-gate 	kmem_free(arg, sizeof (ip_taskq_arg_t));
3610Sstevel@tonic-gate 
362*1184Skrgopi 	/*
363*1184Skrgopi 	 * Make sure the CPU that originally took the interrupt still
364*1184Skrgopi 	 * exists.
365*1184Skrgopi 	 */
366*1184Skrgopi 	if (!CPU_ISON(intr_cpu))
367*1184Skrgopi 		intr_cpu = CPU;
368*1184Skrgopi 
3690Sstevel@tonic-gate 	sqs = intr_cpu->cpu_squeue_set;
3700Sstevel@tonic-gate 
3710Sstevel@tonic-gate 	/*
3720Sstevel@tonic-gate 	 * If this ill represents link aggregation, then there might be
3730Sstevel@tonic-gate 	 * multiple NICs trying to register themselves at the same time
3740Sstevel@tonic-gate 	 * and in order to ensure that test and assignment of free rings
3750Sstevel@tonic-gate 	 * is sequential, we need to hold the ill_lock.
3760Sstevel@tonic-gate 	 */
3770Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
378*1184Skrgopi 	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
379*1184Skrgopi 	if (sqp == NULL) {
380*1184Skrgopi 		/*
381*1184Skrgopi 		 * We hit the max limit of squeues allowed per CPU.
382*1184Skrgopi 		 * Assign this rx_ring to the DEFAULT squeue of the
383*1184Skrgopi 		 * interrupted CPU but the squeue will not manage
384*1184Skrgopi 		 * the ring. Also print a warning.
385*1184Skrgopi 		 */
386*1184Skrgopi 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
387*1184Skrgopi 		    "has max number of squeues. System performance might "
388*1184Skrgopi 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
389*1184Skrgopi 
390*1184Skrgopi 		/* the first squeue in the list is the default squeue */
391*1184Skrgopi 		sqp = sqs->sqs_list[0];
392*1184Skrgopi 		ASSERT(sqp != NULL);
393*1184Skrgopi 		ill_rx_ring->rr_sqp = sqp;
394*1184Skrgopi 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
395*1184Skrgopi 
396*1184Skrgopi 		mutex_exit(&ill->ill_lock);
397*1184Skrgopi 		ill_waiter_dcr(ill);
398*1184Skrgopi 		return;
399*1184Skrgopi 	}
400*1184Skrgopi 
401*1184Skrgopi 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
402*1184Skrgopi 	sqp->sq_rx_ring = ill_rx_ring;
403*1184Skrgopi 	ill_rx_ring->rr_sqp = sqp;
404*1184Skrgopi 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
405*1184Skrgopi 
406*1184Skrgopi 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
407*1184Skrgopi 	mutex_exit(&sqp->sq_lock);
408*1184Skrgopi 
409*1184Skrgopi 	mutex_exit(&ill->ill_lock);
410*1184Skrgopi 
411*1184Skrgopi 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
412*1184Skrgopi 	ill_waiter_dcr(ill);
413*1184Skrgopi }
414*1184Skrgopi 
415*1184Skrgopi /*
416*1184Skrgopi  * Distribute the soft rings of a NIC across squeues and CPUs: each soft
417*1184Skrgopi  * ring is assigned an squeue (created if necessary) on a CPU that shares
418*1184Skrgopi  * a core with the CPU taking the interrupts, falling back to other CPUs
419*1184Skrgopi  * when the core does not have enough threads. When squeue binding is
420*1184Skrgopi  * enabled, the soft ring is also given affinity to the squeue's CPU.
421*1184Skrgopi  *
422*1184Skrgopi  * The function runs via the system taskq. The ill passed as an
423*1184Skrgopi  * argument can't go away since we hold a ref. The lock order is
424*1184Skrgopi  * ill_lock -> sqs_lock -> sq_lock. Squeues are never destroyed once
425*1184Skrgopi  * created, so binding to an squeue attached to an offline CPU is safe.
426*1184Skrgopi  */
428*1184Skrgopi /* ARGSUSED */
429*1184Skrgopi static void
430*1184Skrgopi ip_squeue_soft_ring_affinity(void *arg)
431*1184Skrgopi {
432*1184Skrgopi 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
433*1184Skrgopi 	ill_t			*ill = sq_arg->ip_taskq_ill;
434*1184Skrgopi 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
435*1184Skrgopi 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
436*1184Skrgopi 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
437*1184Skrgopi 	cpu_t			*bind_cpu;
438*1184Skrgopi 	int			cpu_id = intr_cpu->cpu_id;
439*1184Skrgopi 	int			min_cpu_id, max_cpu_id;
440*1184Skrgopi 	boolean_t		enough_uniq_cpus = B_FALSE;
441*1184Skrgopi 	boolean_t		enough_cpus = B_FALSE;
442*1184Skrgopi 	squeue_set_t 		*sqs, *last_sqs;
443*1184Skrgopi 	squeue_t 		*sqp = NULL;
444*1184Skrgopi 	int			i, j;
445*1184Skrgopi 
446*1184Skrgopi 	ASSERT(ill != NULL);
447*1184Skrgopi 	kmem_free(arg, sizeof (ip_taskq_arg_t));
448*1184Skrgopi 
449*1184Skrgopi 	/*
450*1184Skrgopi 	 * Make sure the CPU that originally took the interrupt still
451*1184Skrgopi 	 * exists.
452*1184Skrgopi 	 */
453*1184Skrgopi 	if (!CPU_ISON(intr_cpu)) {
454*1184Skrgopi 		intr_cpu = CPU;
455*1184Skrgopi 		cpu_id = intr_cpu->cpu_id;
456*1184Skrgopi 	}
457*1184Skrgopi 
458*1184Skrgopi 	/*
459*1184Skrgopi 	 * If this ill represents link aggregation, then there might be
460*1184Skrgopi 	 * multiple NICs trying to register themselves at the same time
461*1184Skrgopi 	 * and in order to ensure that test and assignment of free rings
462*1184Skrgopi 	 * is sequential, we need to hold the ill_lock.
463*1184Skrgopi 	 */
464*1184Skrgopi 	mutex_enter(&ill->ill_lock);
465*1184Skrgopi 
466*1184Skrgopi 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
467*1184Skrgopi 		mutex_exit(&ill->ill_lock);
468*1184Skrgopi 		return;
469*1184Skrgopi 	}
470*1184Skrgopi 	/*
471*1184Skrgopi 	 * We need to fanout the interrupts from the NIC. We do that by
472*1184Skrgopi 	 * telling the driver underneath to create soft rings and use
473*1184Skrgopi 	 * worker threads (if the driver advertised SOFT_RING capability).
474*1184Skrgopi 	 * It is still a big performance win if we can fanout to the
475*1184Skrgopi 	 * threads on the same core that is taking interrupts.
476*1184Skrgopi 	 *
477*1184Skrgopi 	 * Since we don't know the interrupt to CPU binding, we don't
478*1184Skrgopi 	 * assign any squeues or affinity to worker threads in the NIC.
479*1184Skrgopi 	 * At the time of the first interrupt, we know which CPU is
480*1184Skrgopi 	 * taking interrupts and try to find other threads on the same
481*1184Skrgopi 	 * core. Assuming ip_threads_per_cpu is correct and cpus are
482*1184Skrgopi 	 * numbered sequentially for each core (XXX need something better
483*1184Skrgopi 	 * than this in the future), find the lowest-numbered and
484*1184Skrgopi 	 * highest-numbered thread for that core.
485*1184Skrgopi 	 *
486*1184Skrgopi 	 * If we have one more thread per core than the number of soft rings,
487*1184Skrgopi 	 * then don't assign any worker threads to the H/W thread (cpu)
488*1184Skrgopi 	 * taking interrupts (capability negotiation tries to ensure this)
489*1184Skrgopi 	 *
490*1184Skrgopi 	 * If the number of threads per core is the same as the number of
491*1184Skrgopi 	 * soft rings, then assign the worker affinity and squeue to
492*1184Skrgopi 	 * the same cpu.
493*1184Skrgopi 	 *
494*1184Skrgopi 	 * Otherwise, just fanout to higher-numbered CPUs starting from
495*1184Skrgopi 	 * the interrupted CPU.
496*1184Skrgopi 	 */
497*1184Skrgopi 
498*1184Skrgopi 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
499*1184Skrgopi 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
500*1184Skrgopi 
501*1184Skrgopi 	cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
502*1184Skrgopi 	    min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);
503*1184Skrgopi 
504*1184Skrgopi 	/*
505*1184Skrgopi 	 * Quickly check if there are enough CPUs present for fanout
506*1184Skrgopi 	 * and also that max_cpu_id does not exceed the id of the last active CPU.
507*1184Skrgopi 	 * We use the cpu_id stored in the last squeue_set to get
508*1184Skrgopi 	 * an idea. The scheme is by no means perfect since it doesn't
509*1184Skrgopi 	 * take into account CPU DR operations and the fact that
510*1184Skrgopi 	 * interrupts themselves might change. An ideal scenario
511*1184Skrgopi 	 * would be to ensure that interrupts run on cpus by themselves
512*1184Skrgopi 	 * and worker threads never have affinity to those CPUs. If
513*1184Skrgopi 	 * the interrupts move to a CPU which had a worker thread, it
514*1184Skrgopi 	 * should be changed. Probably callbacks similar to CPU offline
515*1184Skrgopi 	 * are needed to make it work perfectly.
516*1184Skrgopi 	 */
517*1184Skrgopi 	last_sqs = sqset_global_list[sqset_global_size - 1];
518*1184Skrgopi 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
519*1184Skrgopi 		if ((max_cpu_id - min_cpu_id) >
520*1184Skrgopi 		    ill_soft_ring->ill_dls_soft_ring_cnt)
521*1184Skrgopi 			enough_uniq_cpus = B_TRUE;
522*1184Skrgopi 		else if ((max_cpu_id - min_cpu_id) >=
523*1184Skrgopi 		    ill_soft_ring->ill_dls_soft_ring_cnt)
524*1184Skrgopi 			enough_cpus = B_TRUE;
525*1184Skrgopi 	}
526*1184Skrgopi 
527*1184Skrgopi 	j = 0;
528*1184Skrgopi 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
529*1184Skrgopi 		if (enough_uniq_cpus) {
530*1184Skrgopi 			if ((min_cpu_id + i) == cpu_id) {
531*1184Skrgopi 				j++;
532*1184Skrgopi 				continue;
533*1184Skrgopi 			}
534*1184Skrgopi 			bind_cpu = cpu[min_cpu_id + i];
535*1184Skrgopi 		} else if (enough_cpus) {
536*1184Skrgopi 			bind_cpu = cpu[min_cpu_id + i];
537*1184Skrgopi 		} else {
538*1184Skrgopi 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
539*1184Skrgopi 			bind_cpu = cpu[(cpu_id + i) % ncpus];
540*1184Skrgopi 		}
541*1184Skrgopi 
542*1184Skrgopi 		/*
543*1184Skrgopi 		 * Check if the CPU actually exists and is active. If not,
544*1184Skrgopi 		 * use the interrupted CPU. ip_find_unused_squeue() will
545*1184Skrgopi 		 * find the right CPU to fanout anyway.
546*1184Skrgopi 		 */
547*1184Skrgopi 		if (!CPU_ISON(bind_cpu))
548*1184Skrgopi 			bind_cpu = intr_cpu;
549*1184Skrgopi 
550*1184Skrgopi 		sqs = bind_cpu->cpu_squeue_set;
551*1184Skrgopi 		ASSERT(sqs != NULL);
552*1184Skrgopi 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
553*1184Skrgopi 
554*1184Skrgopi 		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
555*1184Skrgopi 		if (sqp == NULL) {
556*1184Skrgopi 			/*
557*1184Skrgopi 			 * We hit the max limit of squeues allowed per CPU.
558*1184Skrgopi 			 * Assign this rx_ring to the DEFAULT squeue of the
559*1184Skrgopi 			 * interrupted CPU but the squeue will not manage
560*1184Skrgopi 			 * the ring. Also print a warning.
561*1184Skrgopi 			 */
562*1184Skrgopi 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
563*1184Skrgopi 			    "%d/%p already has max number of squeues. System "
564*1184Skrgopi 			    "performance might become suboptimal\n",
565*1184Skrgopi 			    sqs->sqs_bind, (void *)sqs);
566*1184Skrgopi 
567*1184Skrgopi 			/* the first squeue in the list is the default squeue */
568*1184Skrgopi 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
569*1184Skrgopi 			ASSERT(sqp != NULL);
570*1184Skrgopi 
571*1184Skrgopi 			ill_rx_ring->rr_sqp = sqp;
572*1184Skrgopi 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
573*1184Skrgopi 			continue;
574*1184Skrgopi 
575*1184Skrgopi 		}
576*1184Skrgopi 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
577*1184Skrgopi 		ill_rx_ring->rr_sqp = sqp;
578*1184Skrgopi 		sqp->sq_rx_ring = ill_rx_ring;
579*1184Skrgopi 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
580*1184Skrgopi 		sqp->sq_state |= SQS_ILL_BOUND;
581*1184Skrgopi 
582*1184Skrgopi 		/* assign affinity to soft ring */
583*1184Skrgopi 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
584*1184Skrgopi 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
585*1184Skrgopi 			    sqp->sq_bind);
586*1184Skrgopi 		}
587*1184Skrgopi 		mutex_exit(&sqp->sq_lock);
588*1184Skrgopi 
589*1184Skrgopi 		cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
590*1184Skrgopi 		    i - j, sqp->sq_bind);
591*1184Skrgopi 	}
592*1184Skrgopi 	mutex_exit(&ill->ill_lock);
593*1184Skrgopi 
594*1184Skrgopi 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
595*1184Skrgopi 	    SOFT_RING_SRC_HASH);
596*1184Skrgopi 
597*1184Skrgopi 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
598*1184Skrgopi 	ill_waiter_dcr(ill);
599*1184Skrgopi }
600*1184Skrgopi 
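/*
 * ip_soft_ring_assignment() is called in interrupt context on the receive
 * path (it always hands the packet chain to ip_input() at the end). If a
 * soft ring assignment is not already in progress, it marks the ill with
 * ILL_SOFT_RING_ASSIGN and dispatches ip_squeue_soft_ring_affinity() via
 * the system taskq; if the ill reference or the taskq dispatch fails, the
 * flag is cleared so the assignment can be retried on a later interrupt.
 */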
601*1184Skrgopi void
602*1184Skrgopi ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
603*1184Skrgopi     mblk_t *mp_chain, size_t hdrlen)
604*1184Skrgopi {
605*1184Skrgopi 	ip_taskq_arg_t	*taskq_arg;
606*1184Skrgopi 	boolean_t	refheld;
607*1184Skrgopi 
608*1184Skrgopi 	ASSERT(servicing_interrupt());
609*1184Skrgopi 	ASSERT(ip_ring == NULL);
610*1184Skrgopi 
611*1184Skrgopi 	mutex_enter(&ill->ill_lock);
612*1184Skrgopi 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
613*1184Skrgopi 		taskq_arg = (ip_taskq_arg_t *)
614*1184Skrgopi 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
615*1184Skrgopi 
616*1184Skrgopi 		if (taskq_arg == NULL)
617*1184Skrgopi 			goto out;
618*1184Skrgopi 
619*1184Skrgopi 		taskq_arg->ip_taskq_ill = ill;
620*1184Skrgopi 		taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
621*1184Skrgopi 		taskq_arg->ip_taskq_cpu = CPU;
622*1184Skrgopi 
623*1184Skrgopi 		/*
624*1184Skrgopi 		 * Set ILL_SOFT_RING_ASSIGN flag. We don't want
625*1184Skrgopi 		 * the next interrupt to schedule a task for calling
626*1184Skrgopi 		 * ip_squeue_soft_ring_affinity() again.
627*1184Skrgopi 		 */
628*1184Skrgopi 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
629*1184Skrgopi 	} else {
630*1184Skrgopi 		mutex_exit(&ill->ill_lock);
631*1184Skrgopi 		goto out;
632*1184Skrgopi 	}
633*1184Skrgopi 	mutex_exit(&ill->ill_lock);
634*1184Skrgopi 	refheld = ill_waiter_inc(ill);
635*1184Skrgopi 	if (refheld) {
636*1184Skrgopi 		if (taskq_dispatch(system_taskq,
637*1184Skrgopi 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
638*1184Skrgopi 			goto out;
639*1184Skrgopi 
640*1184Skrgopi 		/* release ref on ill if taskq dispatch fails */
641*1184Skrgopi 		ill_waiter_dcr(ill);
642*1184Skrgopi 	}
643*1184Skrgopi 	/*
644*1184Skrgopi 	 * Clear ILL_SOFT_RING_ASSIGN so that affinity assignment
645*1184Skrgopi 	 * can be tried again later.
646*1184Skrgopi 	 */
647*1184Skrgopi 	mutex_enter(&ill->ill_lock);
648*1184Skrgopi 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
649*1184Skrgopi 	mutex_exit(&ill->ill_lock);
650*1184Skrgopi 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
651*1184Skrgopi 
652*1184Skrgopi out:
653*1184Skrgopi 	ip_input(ill, ip_ring, mp_chain, hdrlen);
654*1184Skrgopi }
655*1184Skrgopi 
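/*
 * ip_find_unused_squeue() returns an squeue from the given set that is
 * neither the default squeue nor already bound to an Rx ring. When fanout
 * is requested and the given set already has ring-bound squeues, the least
 * populated squeue set in the system is searched instead. If no suitable
 * squeue exists, a new one is created in the set, up to sqs_max_size.
 * Returns with the squeue's sq_lock held, or NULL if the per-CPU limit on
 * squeues has been reached.
 */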
656*1184Skrgopi static squeue_t *
657*1184Skrgopi ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
658*1184Skrgopi {
659*1184Skrgopi 	int 		i;
660*1184Skrgopi 	squeue_set_t	*best_sqs = NULL;
661*1184Skrgopi 	squeue_set_t	*curr_sqs = NULL;
662*1184Skrgopi 	int		min_sq = 0;
663*1184Skrgopi 	squeue_t 	*sqp = NULL;
664*1184Skrgopi 	char		sqname[64];
665*1184Skrgopi 
666*1184Skrgopi 	/*
667*1184Skrgopi 	 * If fanout is set and the passed squeue_set already has some
668*1184Skrgopi 	 * squeues which are managing the NICs, try to find an squeue on
669*1184Skrgopi 	 * an unused CPU.
670*1184Skrgopi 	 */
671*1184Skrgopi 	if (sqs->sqs_size > 1 && fanout) {
672*1184Skrgopi 		/*
673*1184Skrgopi 		 * First check to see if any squeue on the CPU passed
674*1184Skrgopi 		 * is managing a NIC.
675*1184Skrgopi 		 */
676*1184Skrgopi 		for (i = 0; i < sqs->sqs_size; i++) {
677*1184Skrgopi 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
678*1184Skrgopi 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
679*1184Skrgopi 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
680*1184Skrgopi 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
681*1184Skrgopi 				break;
682*1184Skrgopi 			}
683*1184Skrgopi 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
684*1184Skrgopi 		}
685*1184Skrgopi 		if (i != sqs->sqs_size) {
686*1184Skrgopi 			best_sqs = sqset_global_list[sqset_global_size - 1];
687*1184Skrgopi 			min_sq = best_sqs->sqs_size;
688*1184Skrgopi 
689*1184Skrgopi 			for (i = sqset_global_size - 2; i >= 0; i--) {
690*1184Skrgopi 				curr_sqs = sqset_global_list[i];
691*1184Skrgopi 				if (curr_sqs->sqs_size < min_sq) {
692*1184Skrgopi 					best_sqs = curr_sqs;
693*1184Skrgopi 					min_sq = curr_sqs->sqs_size;
694*1184Skrgopi 				}
695*1184Skrgopi 			}
696*1184Skrgopi 
697*1184Skrgopi 			ASSERT(best_sqs != NULL);
698*1184Skrgopi 			sqs = best_sqs;
699*1184Skrgopi 			bind_cpu = cpu[sqs->sqs_bind];
700*1184Skrgopi 		}
701*1184Skrgopi 	}
702*1184Skrgopi 
7030Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
704*1184Skrgopi 
7050Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
7060Sstevel@tonic-gate 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
707*1184Skrgopi 		if ((sqs->sqs_list[i]->sq_state &
708*1184Skrgopi 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
7090Sstevel@tonic-gate 			sqp = sqs->sqs_list[i];
7100Sstevel@tonic-gate 			break;
7110Sstevel@tonic-gate 		}
7120Sstevel@tonic-gate 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
7130Sstevel@tonic-gate 	}
7140Sstevel@tonic-gate 
7150Sstevel@tonic-gate 	if (sqp == NULL) {
7160Sstevel@tonic-gate 		/* Need to create a new squeue */
7170Sstevel@tonic-gate 		if (sqs->sqs_size == sqs->sqs_max_size) {
7180Sstevel@tonic-gate 			/*
7190Sstevel@tonic-gate 			 * Reached the max limit of squeues
720*1184Skrgopi 			 * we can allocate on this CPU.
7210Sstevel@tonic-gate 			 */
7220Sstevel@tonic-gate 			mutex_exit(&sqs->sqs_lock);
723*1184Skrgopi 			return (NULL);
7240Sstevel@tonic-gate 		}
7250Sstevel@tonic-gate 
7260Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
7270Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
728*1184Skrgopi 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
729*1184Skrgopi 		    bind_cpu->cpu_id, sqs->sqs_size);
7300Sstevel@tonic-gate 
731*1184Skrgopi 		sqp = squeue_create(sqname, bind_cpu->cpu_id,
732*1184Skrgopi 		    ip_squeue_worker_wait, minclsyspri);
7330Sstevel@tonic-gate 
7340Sstevel@tonic-gate 		ASSERT(sqp != NULL);
7350Sstevel@tonic-gate 
7360Sstevel@tonic-gate 		squeue_profile_enable(sqp);
7370Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
7380Sstevel@tonic-gate 
7390Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
7400Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
7410Sstevel@tonic-gate 
742*1184Skrgopi 		mutex_enter(&cpu_lock);
743*1184Skrgopi 		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
7440Sstevel@tonic-gate 			squeue_bind(sqp, -1);
7450Sstevel@tonic-gate 		}
746*1184Skrgopi 		mutex_exit(&cpu_lock);
747*1184Skrgopi 
7480Sstevel@tonic-gate 		mutex_enter(&sqp->sq_lock);
7490Sstevel@tonic-gate 	}
7500Sstevel@tonic-gate 
751*1184Skrgopi 	mutex_exit(&sqs->sqs_lock);
7520Sstevel@tonic-gate 	ASSERT(sqp != NULL);
753*1184Skrgopi 	return (sqp);
7540Sstevel@tonic-gate }
7550Sstevel@tonic-gate 
7560Sstevel@tonic-gate /*
7570Sstevel@tonic-gate  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
7580Sstevel@tonic-gate  * owned by a squeue yet, do the assignment. When the NIC registers its
7590Sstevel@tonic-gate  * Rx rings with IP, we don't know where the interrupts will land and
7600Sstevel@tonic-gate  * hence we need to wait till this point to do the assignment.
7610Sstevel@tonic-gate  */
7620Sstevel@tonic-gate squeue_t *
7630Sstevel@tonic-gate ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
7640Sstevel@tonic-gate {
7650Sstevel@tonic-gate 	squeue_t 	*sqp;
7660Sstevel@tonic-gate 	ill_t 		*ill;
7670Sstevel@tonic-gate 	int		interrupt;
7680Sstevel@tonic-gate 	ip_taskq_arg_t	*taskq_arg;
7690Sstevel@tonic-gate 	boolean_t	refheld;
7700Sstevel@tonic-gate 
7710Sstevel@tonic-gate 	if (ill_rx_ring == NULL)
7720Sstevel@tonic-gate 		return (IP_SQUEUE_GET(lbolt));
7730Sstevel@tonic-gate 
7740Sstevel@tonic-gate 	sqp = ill_rx_ring->rr_sqp;
7750Sstevel@tonic-gate 	/*
7760Sstevel@tonic-gate 	 * Do a quick check. If it's not NULL, we are done.
7770Sstevel@tonic-gate 	 * Squeues are never destroyed so worse we will bind
7780Sstevel@tonic-gate 	 * Squeues are never destroyed so at worst we will bind
7790Sstevel@tonic-gate 	 *
7800Sstevel@tonic-gate 	 * This is the fast path case.
7810Sstevel@tonic-gate 	 */
7820Sstevel@tonic-gate 	if (sqp != NULL)
7830Sstevel@tonic-gate 		return (sqp);
7840Sstevel@tonic-gate 
7850Sstevel@tonic-gate 	ill = ill_rx_ring->rr_ill;
7860Sstevel@tonic-gate 	ASSERT(ill != NULL);
7870Sstevel@tonic-gate 
7880Sstevel@tonic-gate 	interrupt = servicing_interrupt();
7890Sstevel@tonic-gate 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
7900Sstevel@tonic-gate 	    KM_NOSLEEP);
7910Sstevel@tonic-gate 
7920Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
7930Sstevel@tonic-gate 	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
7940Sstevel@tonic-gate 		taskq_arg == NULL) {
7950Sstevel@tonic-gate 		/*
7960Sstevel@tonic-gate 		 * Do the ring to squeue binding only if we are in interrupt
7970Sstevel@tonic-gate 		 * context and there is no one else trying the bind already.
7980Sstevel@tonic-gate 		 */
7990Sstevel@tonic-gate 		mutex_exit(&ill->ill_lock);
8000Sstevel@tonic-gate 		if (taskq_arg != NULL)
8010Sstevel@tonic-gate 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
8020Sstevel@tonic-gate 		return (IP_SQUEUE_GET(lbolt));
8030Sstevel@tonic-gate 	}
8040Sstevel@tonic-gate 
8050Sstevel@tonic-gate 	/*
8060Sstevel@tonic-gate 	 * No sqp assigned yet. Can't really do that in interrupt
8070Sstevel@tonic-gate 	 * context. Assign the default sqp to this connection and
8080Sstevel@tonic-gate 	 * trigger creation of new sqp and binding it to this ring
8090Sstevel@tonic-gate 	 * via taskq. Need to make sure ill stays around.
8100Sstevel@tonic-gate 	 */
8110Sstevel@tonic-gate 	taskq_arg->ip_taskq_ill = ill;
8120Sstevel@tonic-gate 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
8130Sstevel@tonic-gate 	taskq_arg->ip_taskq_cpu = CPU;
8140Sstevel@tonic-gate 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
8150Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
8160Sstevel@tonic-gate 	refheld = ill_waiter_inc(ill);
8170Sstevel@tonic-gate 	if (refheld) {
8180Sstevel@tonic-gate 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
8190Sstevel@tonic-gate 		    taskq_arg, TQ_NOSLEEP) != NULL) {
8200Sstevel@tonic-gate 			return (IP_SQUEUE_GET(lbolt));
8210Sstevel@tonic-gate 		}
8220Sstevel@tonic-gate 	}
8230Sstevel@tonic-gate 	/*
8240Sstevel@tonic-gate 	 * The ill is closing and we could not get a reference on the ill OR
8250Sstevel@tonic-gate 	 * taskq_dispatch failed probably due to memory allocation failure.
8260Sstevel@tonic-gate 	 * We will try again next time.
8270Sstevel@tonic-gate 	 */
8280Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
8290Sstevel@tonic-gate 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
8300Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
8310Sstevel@tonic-gate 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
8320Sstevel@tonic-gate 	if (refheld)
8330Sstevel@tonic-gate 		ill_waiter_dcr(ill);
8340Sstevel@tonic-gate 
8350Sstevel@tonic-gate 	return (IP_SQUEUE_GET(lbolt));
8360Sstevel@tonic-gate }
8370Sstevel@tonic-gate 
8380Sstevel@tonic-gate /*
8390Sstevel@tonic-gate  * NDD hooks for setting ip_squeue_xxx tuneables.
8400Sstevel@tonic-gate  */
8410Sstevel@tonic-gate 
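/*
 * Set handler for ip_squeue_bind: on a change of value, bind or unbind the
 * worker threads of every squeue set in the system accordingly.
 */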
8420Sstevel@tonic-gate /* ARGSUSED */
8430Sstevel@tonic-gate int
8440Sstevel@tonic-gate ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
8450Sstevel@tonic-gate     caddr_t addr, cred_t *cr)
8460Sstevel@tonic-gate {
8470Sstevel@tonic-gate 	int *bind_enabled = (int *)addr;
8480Sstevel@tonic-gate 	long new_value;
8490Sstevel@tonic-gate 	int i;
8500Sstevel@tonic-gate 
8510Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
8520Sstevel@tonic-gate 		return (EINVAL);
8530Sstevel@tonic-gate 
8540Sstevel@tonic-gate 	if (ip_squeue_bind == new_value)
8550Sstevel@tonic-gate 		return (0);
8560Sstevel@tonic-gate 
8570Sstevel@tonic-gate 	*bind_enabled = new_value;
8580Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
8590Sstevel@tonic-gate 	if (new_value == 0) {
8600Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
8610Sstevel@tonic-gate 			ip_squeue_set_unbind(sqset_global_list[i]);
8620Sstevel@tonic-gate 	} else {
8630Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
8640Sstevel@tonic-gate 			ip_squeue_set_bind(sqset_global_list[i]);
8650Sstevel@tonic-gate 	}
8660Sstevel@tonic-gate 
8670Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
8680Sstevel@tonic-gate 	return (0);
8690Sstevel@tonic-gate }
8700Sstevel@tonic-gate 
8710Sstevel@tonic-gate /*
8720Sstevel@tonic-gate  * Set squeue profiling.
8730Sstevel@tonic-gate  * 0 means "disable"
8740Sstevel@tonic-gate  * 1 means "enable"
8750Sstevel@tonic-gate  * 2 means "enable and reset"
8760Sstevel@tonic-gate  */
8770Sstevel@tonic-gate /* ARGSUSED */
8780Sstevel@tonic-gate int
8790Sstevel@tonic-gate ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
8800Sstevel@tonic-gate     cred_t *cr)
8810Sstevel@tonic-gate {
8820Sstevel@tonic-gate 	int *profile_enabled = (int *)cp;
8830Sstevel@tonic-gate 	long new_value;
8840Sstevel@tonic-gate 	squeue_set_t *sqs;
8850Sstevel@tonic-gate 
8860Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
8870Sstevel@tonic-gate 		return (EINVAL);
8880Sstevel@tonic-gate 
8890Sstevel@tonic-gate 	if (new_value == 0)
8900Sstevel@tonic-gate 		squeue_profile_stop();
8910Sstevel@tonic-gate 	else if (new_value == 1)
8920Sstevel@tonic-gate 		squeue_profile_start();
8930Sstevel@tonic-gate 	else if (new_value == 2) {
8940Sstevel@tonic-gate 		int i, j;
8950Sstevel@tonic-gate 
8960Sstevel@tonic-gate 		squeue_profile_stop();
8970Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
8980Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
8990Sstevel@tonic-gate 			sqs = sqset_global_list[i];
9000Sstevel@tonic-gate 			for (j = 0; j < sqs->sqs_size; j++) {
9010Sstevel@tonic-gate 				squeue_profile_reset(sqs->sqs_list[j]);
9020Sstevel@tonic-gate 			}
9030Sstevel@tonic-gate 		}
9040Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
9050Sstevel@tonic-gate 
9060Sstevel@tonic-gate 		new_value = 1;
9070Sstevel@tonic-gate 		squeue_profile_start();
9080Sstevel@tonic-gate 	}
9090Sstevel@tonic-gate 	*profile_enabled = new_value;
9100Sstevel@tonic-gate 
9110Sstevel@tonic-gate 	return (0);
9120Sstevel@tonic-gate }
9130Sstevel@tonic-gate 
9140Sstevel@tonic-gate /*
9150Sstevel@tonic-gate  * Reconfiguration callback
9160Sstevel@tonic-gate  */
9170Sstevel@tonic-gate 
9180Sstevel@tonic-gate /* ARGSUSED */
9190Sstevel@tonic-gate static int
9200Sstevel@tonic-gate ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
9210Sstevel@tonic-gate {
9220Sstevel@tonic-gate 	cpu_t *cp = cpu[id];
9230Sstevel@tonic-gate 
9240Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
9250Sstevel@tonic-gate 	switch (what) {
926405Sakolb 	case CPU_CONFIG:
927405Sakolb 		/*
928405Sakolb 		 * A new CPU is added. Create an squeue for it but do not bind
929405Sakolb 		 * it yet.
930405Sakolb 		 */
931405Sakolb 		if (cp->cpu_squeue_set == NULL)
932405Sakolb 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
933405Sakolb 		break;
9340Sstevel@tonic-gate 	case CPU_ON:
9350Sstevel@tonic-gate 	case CPU_INIT:
9360Sstevel@tonic-gate 	case CPU_CPUPART_IN:
9370Sstevel@tonic-gate 		if (cp->cpu_squeue_set == NULL) {
9380Sstevel@tonic-gate 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
9390Sstevel@tonic-gate 		}
9400Sstevel@tonic-gate 		if (ip_squeue_bind)
9410Sstevel@tonic-gate 			ip_squeue_set_bind(cp->cpu_squeue_set);
9420Sstevel@tonic-gate 		break;
9430Sstevel@tonic-gate 	case CPU_UNCONFIG:
9440Sstevel@tonic-gate 	case CPU_OFF:
9450Sstevel@tonic-gate 	case CPU_CPUPART_OUT:
9460Sstevel@tonic-gate 		ASSERT((cp->cpu_squeue_set != NULL) ||
9470Sstevel@tonic-gate 		    (cp->cpu_flags & CPU_OFFLINE));
9480Sstevel@tonic-gate 
9490Sstevel@tonic-gate 		if (cp->cpu_squeue_set != NULL) {
9500Sstevel@tonic-gate 			ip_squeue_set_unbind(cp->cpu_squeue_set);
9510Sstevel@tonic-gate 		}
9520Sstevel@tonic-gate 		break;
9530Sstevel@tonic-gate 	default:
9540Sstevel@tonic-gate 		break;
9550Sstevel@tonic-gate 	}
9560Sstevel@tonic-gate 	return (0);
9570Sstevel@tonic-gate }
9580Sstevel@tonic-gate 
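/*
 * Bind each not-yet-bound squeue worker thread in the set to the set's CPU.
 * Does nothing if ip_squeue_bind is not set.
 */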
9590Sstevel@tonic-gate /* ARGSUSED */
9600Sstevel@tonic-gate static void
9610Sstevel@tonic-gate ip_squeue_set_bind(squeue_set_t *sqs)
9620Sstevel@tonic-gate {
9630Sstevel@tonic-gate 	int i;
9640Sstevel@tonic-gate 	squeue_t *sqp;
9650Sstevel@tonic-gate 
9660Sstevel@tonic-gate 	if (!ip_squeue_bind)
9670Sstevel@tonic-gate 		return;
9680Sstevel@tonic-gate 
9690Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
9700Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
9710Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
9720Sstevel@tonic-gate 		if (sqp->sq_state & SQS_BOUND)
9730Sstevel@tonic-gate 			continue;
9740Sstevel@tonic-gate 		squeue_bind(sqp, -1);
9750Sstevel@tonic-gate 	}
9760Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
9770Sstevel@tonic-gate }
9780Sstevel@tonic-gate 
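/*
 * Undo the CPU binding of every squeue in the set, first dropping the soft
 * ring thread affinity for any squeue that is managing an Rx ring. Called
 * when the CPU is going offline or leaving its partition.
 */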
9790Sstevel@tonic-gate static void
9800Sstevel@tonic-gate ip_squeue_set_unbind(squeue_set_t *sqs)
9810Sstevel@tonic-gate {
9820Sstevel@tonic-gate 	int i;
9830Sstevel@tonic-gate 	squeue_t *sqp;
9840Sstevel@tonic-gate 
9850Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
9860Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
9870Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
988*1184Skrgopi 
989*1184Skrgopi 		/*
990*1184Skrgopi 		 * CPU is going offline. Remove the thread affinity
991*1184Skrgopi 		 * for any soft ring threads the squeue is managing.
992*1184Skrgopi 		 */
993*1184Skrgopi 		if (sqp->sq_state & SQS_ILL_BOUND) {
994*1184Skrgopi 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
995*1184Skrgopi 			ill_t		*ill = ring->rr_ill;
996*1184Skrgopi 
997*1184Skrgopi 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
998*1184Skrgopi 				ASSERT(ring->rr_handle != NULL);
999*1184Skrgopi 				ill->ill_dls_capab->ill_dls_unbind(
1000*1184Skrgopi 					ring->rr_handle);
1001*1184Skrgopi 			}
1002*1184Skrgopi 		}
10030Sstevel@tonic-gate 		if (!(sqp->sq_state & SQS_BOUND))
10040Sstevel@tonic-gate 			continue;
10050Sstevel@tonic-gate 		squeue_unbind(sqp);
10060Sstevel@tonic-gate 	}
10070Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
10080Sstevel@tonic-gate }