xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 1503:9c3595b79c0d)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*1503Sericheng  * Common Development and Distribution License (the "License").
6*1503Sericheng  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*1503Sericheng  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
270Sstevel@tonic-gate 
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate  * IP interface to squeues.
300Sstevel@tonic-gate  *
310Sstevel@tonic-gate  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
320Sstevel@tonic-gate  * cpu_squeue field of the cpu structure. Each squeue is associated with a
330Sstevel@tonic-gate  * connection instance (conn_t).
340Sstevel@tonic-gate  *
350Sstevel@tonic-gate  * For CPUs available at system startup time the squeue creation and association
360Sstevel@tonic-gate  * with CPU happens at MP initialization time. For CPUs added during dynamic
370Sstevel@tonic-gate  * reconfiguration, the initialization happens when the new CPU is configured in
380Sstevel@tonic-gate  * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
390Sstevel@tonic-gate  * return per-CPU squeue or random squeue based on the ip_squeue_fanout
400Sstevel@tonic-gate  * variable.
410Sstevel@tonic-gate  *
420Sstevel@tonic-gate  * There are two modes of associating connection with squeues. The first mode
430Sstevel@tonic-gate  * associates each connection with the CPU that creates the connection (either
440Sstevel@tonic-gate  * during open time or during accept time). The second mode associates each
450Sstevel@tonic-gate  * connection with a random CPU, effectively distributing load over all CPUs
460Sstevel@tonic-gate  * and all squeues in the system. The mode is controlled by the
470Sstevel@tonic-gate  * ip_squeue_fanout variable.
480Sstevel@tonic-gate  *
490Sstevel@tonic-gate  * NOTE: The fact that there is an association between each connection and
500Sstevel@tonic-gate  * squeue and squeue and CPU does not mean that each connection is always
510Sstevel@tonic-gate  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
520Sstevel@tonic-gate  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
530Sstevel@tonic-gate  * binding is only relevant for the worker thread.
540Sstevel@tonic-gate  *
550Sstevel@tonic-gate  * The list of all created squeues is kept in squeue_set structure. This list is
560Sstevel@tonic-gate  * used when ip_squeue_fanout is set and the load is distributed across all
570Sstevel@tonic-gate  * squeues.
580Sstevel@tonic-gate  *
590Sstevel@tonic-gate  * INTERFACE:
600Sstevel@tonic-gate  *
610Sstevel@tonic-gate  * squeue_t *ip_squeue_get(hint)
620Sstevel@tonic-gate  *
630Sstevel@tonic-gate  * 	Find an squeue based on the 'hint' value. The hint is used as an index
640Sstevel@tonic-gate  * 	in the array of IP squeues available. The way hint is computed may
650Sstevel@tonic-gate  * 	affect the effectiveness of the squeue distribution. Currently squeues
660Sstevel@tonic-gate  * 	are assigned in round-robin fashion using lbolt as a hint.
670Sstevel@tonic-gate  *
680Sstevel@tonic-gate  *
690Sstevel@tonic-gate  * DR Notes
700Sstevel@tonic-gate  * ========
710Sstevel@tonic-gate  *
720Sstevel@tonic-gate  * The ip_squeue_init() registers a call-back function with the CPU DR
730Sstevel@tonic-gate  * subsystem using register_cpu_setup_func(). The call-back function does two
740Sstevel@tonic-gate  * things:
750Sstevel@tonic-gate  *
760Sstevel@tonic-gate  * o When the CPU is going off-line or unconfigured, the worker thread is
770Sstevel@tonic-gate  *	unbound from the CPU. This allows the CPU unconfig code to move it to
780Sstevel@tonic-gate  *	another CPU.
790Sstevel@tonic-gate  *
800Sstevel@tonic-gate  * o When the CPU is going online, it creates a new squeue for this CPU if
810Sstevel@tonic-gate  *	necessary and binds the squeue worker thread to this CPU.
820Sstevel@tonic-gate  *
830Sstevel@tonic-gate  * TUNEBALES:
840Sstevel@tonic-gate  *
850Sstevel@tonic-gate  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
860Sstevel@tonic-gate  * 	associated with an squeue instance.
870Sstevel@tonic-gate  *
880Sstevel@tonic-gate  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
890Sstevel@tonic-gate  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
900Sstevel@tonic-gate  *	an impact.
910Sstevel@tonic-gate  *
920Sstevel@tonic-gate  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
930Sstevel@tonic-gate  *	otherwise get it from CPU->cpu_squeue.
940Sstevel@tonic-gate  *
950Sstevel@tonic-gate  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
960Sstevel@tonic-gate  * changed using ndd on /dev/tcp or /dev/ip.
970Sstevel@tonic-gate  *
980Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
990Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
1000Sstevel@tonic-gate  *	thread after queuing a request.
1010Sstevel@tonic-gate  */
1020Sstevel@tonic-gate 
1030Sstevel@tonic-gate #include <sys/types.h>
1040Sstevel@tonic-gate #include <sys/debug.h>
1050Sstevel@tonic-gate #include <sys/kmem.h>
1060Sstevel@tonic-gate #include <sys/cpuvar.h>
1070Sstevel@tonic-gate 
1080Sstevel@tonic-gate #include <sys/cmn_err.h>
1090Sstevel@tonic-gate 
1100Sstevel@tonic-gate #include <inet/common.h>
1110Sstevel@tonic-gate #include <inet/ip.h>
1120Sstevel@tonic-gate #include <inet/ip_if.h>
1130Sstevel@tonic-gate #include <inet/mi.h>
1140Sstevel@tonic-gate #include <inet/nd.h>
1150Sstevel@tonic-gate #include <inet/ipclassifier.h>
1160Sstevel@tonic-gate #include <sys/types.h>
1170Sstevel@tonic-gate #include <sys/conf.h>
1180Sstevel@tonic-gate #include <sys/sunddi.h>
1190Sstevel@tonic-gate #include <sys/ddi.h>
1200Sstevel@tonic-gate #include <sys/squeue_impl.h>
1210Sstevel@tonic-gate 
1220Sstevel@tonic-gate 
1230Sstevel@tonic-gate /*
1240Sstevel@tonic-gate  * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
1250Sstevel@tonic-gate  * mapping between squeue and NIC (or Rx ring) for performance reasons so
1260Sstevel@tonic-gate  * each squeue can uniquely own a NIC or a Rx ring and do polling
1271184Skrgopi  * (PSARC 2004/630). So we allow up to  MAX_SQUEUES_PER_CPU squeues per CPU.
1281184Skrgopi  * We start by creating MIN_SQUEUES_PER_CPU squeues per CPU but more squeues
1290Sstevel@tonic-gate  * can be created dynamically as needed.
1300Sstevel@tonic-gate  */
1311184Skrgopi #define	MAX_SQUEUES_PER_CPU	32
1321184Skrgopi #define	MIN_SQUEUES_PER_CPU	1
1331184Skrgopi uint_t	ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
1341184Skrgopi 
1351184Skrgopi #define	IP_NUM_SOFT_RINGS	2
1361184Skrgopi uint_t ip_soft_rings_cnt = IP_NUM_SOFT_RINGS;
1370Sstevel@tonic-gate 
1380Sstevel@tonic-gate /*
1390Sstevel@tonic-gate  * List of all created squeue sets. The size is protected by cpu_lock
1400Sstevel@tonic-gate  */
1410Sstevel@tonic-gate squeue_set_t	**sqset_global_list;
1420Sstevel@tonic-gate uint_t		sqset_global_size;
1430Sstevel@tonic-gate 
1440Sstevel@tonic-gate int ip_squeue_bind = B_TRUE;
1450Sstevel@tonic-gate int ip_squeue_profile = B_TRUE;
1460Sstevel@tonic-gate static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
1470Sstevel@tonic-gate 
1480Sstevel@tonic-gate /*
1490Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
1500Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
1510Sstevel@tonic-gate  *	thread after queuing a request.
1520Sstevel@tonic-gate  */
1530Sstevel@tonic-gate uint_t ip_squeue_worker_wait = 10;
1540Sstevel@tonic-gate 
1550Sstevel@tonic-gate static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
1560Sstevel@tonic-gate static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
1570Sstevel@tonic-gate 
1580Sstevel@tonic-gate static void ip_squeue_set_bind(squeue_set_t *);
1590Sstevel@tonic-gate static void ip_squeue_set_unbind(squeue_set_t *);
1601184Skrgopi static squeue_t *ip_find_unused_squeue(squeue_set_t *, cpu_t *, boolean_t);
1610Sstevel@tonic-gate 
1620Sstevel@tonic-gate #define	CPU_ISON(c) (c != NULL && CPU_ACTIVE(c) && (c->cpu_flags & CPU_EXISTS))
1630Sstevel@tonic-gate 
1640Sstevel@tonic-gate /*
1651184Skrgopi  * Create squeue set containing ip_squeues_per_cpu number of squeues
1660Sstevel@tonic-gate  * for this CPU and bind them all to the CPU.
1670Sstevel@tonic-gate  */
1680Sstevel@tonic-gate static squeue_set_t *
1690Sstevel@tonic-gate ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
1700Sstevel@tonic-gate {
1710Sstevel@tonic-gate 	int i;
1720Sstevel@tonic-gate 	squeue_set_t	*sqs;
1730Sstevel@tonic-gate 	squeue_t 	*sqp;
1740Sstevel@tonic-gate 	char 		sqname[64];
1750Sstevel@tonic-gate 	processorid_t 	id = cp->cpu_id;
1760Sstevel@tonic-gate 
1770Sstevel@tonic-gate 	if (reuse) {
1780Sstevel@tonic-gate 		int i;
1790Sstevel@tonic-gate 
1800Sstevel@tonic-gate 		/*
1810Sstevel@tonic-gate 		 * We may already have an squeue created for this CPU. Try to
1820Sstevel@tonic-gate 		 * find one and reuse it if possible.
1830Sstevel@tonic-gate 		 */
1840Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
1850Sstevel@tonic-gate 			sqs = sqset_global_list[i];
1860Sstevel@tonic-gate 			if (id == sqs->sqs_bind)
1870Sstevel@tonic-gate 				return (sqs);
1880Sstevel@tonic-gate 		}
1890Sstevel@tonic-gate 	}
1900Sstevel@tonic-gate 
1910Sstevel@tonic-gate 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
1921184Skrgopi 	    (sizeof (squeue_t *) * MAX_SQUEUES_PER_CPU), KM_SLEEP);
1930Sstevel@tonic-gate 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
1940Sstevel@tonic-gate 	sqs->sqs_list = (squeue_t **)&sqs[1];
1951184Skrgopi 	sqs->sqs_max_size = MAX_SQUEUES_PER_CPU;
1960Sstevel@tonic-gate 	sqs->sqs_bind = id;
1970Sstevel@tonic-gate 
1981184Skrgopi 	for (i = 0; i < ip_squeues_per_cpu; i++) {
1990Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
2000Sstevel@tonic-gate 
2010Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
2020Sstevel@tonic-gate 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
2030Sstevel@tonic-gate 		    cp->cpu_id, i);
2040Sstevel@tonic-gate 
2050Sstevel@tonic-gate 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
2060Sstevel@tonic-gate 		    minclsyspri);
2070Sstevel@tonic-gate 
2081184Skrgopi 		/*
2091184Skrgopi 		 * The first squeue in each squeue_set is the DEFAULT
2101184Skrgopi 		 * squeue.
2111184Skrgopi 		 */
2121184Skrgopi 		sqp->sq_state |= SQS_DEFAULT;
2131184Skrgopi 
2140Sstevel@tonic-gate 		ASSERT(sqp != NULL);
2150Sstevel@tonic-gate 
2160Sstevel@tonic-gate 		squeue_profile_enable(sqp);
2170Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
2180Sstevel@tonic-gate 
2190Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
2200Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
2210Sstevel@tonic-gate 	}
2220Sstevel@tonic-gate 
223405Sakolb 	if (ip_squeue_bind && cpu_is_online(cp))
2240Sstevel@tonic-gate 		ip_squeue_set_bind(sqs);
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate 	sqset_global_list[sqset_global_size++] = sqs;
2270Sstevel@tonic-gate 	ASSERT(sqset_global_size <= NCPU);
2280Sstevel@tonic-gate 	return (sqs);
2290Sstevel@tonic-gate }
2300Sstevel@tonic-gate 
2310Sstevel@tonic-gate /*
2320Sstevel@tonic-gate  * Initialize IP squeues.
2330Sstevel@tonic-gate  */
2340Sstevel@tonic-gate void
2350Sstevel@tonic-gate ip_squeue_init(void (*callback)(squeue_t *))
2360Sstevel@tonic-gate {
2370Sstevel@tonic-gate 	int i;
2380Sstevel@tonic-gate 
2390Sstevel@tonic-gate 	ASSERT(sqset_global_list == NULL);
2400Sstevel@tonic-gate 
2411184Skrgopi 	if (ip_squeues_per_cpu < MIN_SQUEUES_PER_CPU)
2421184Skrgopi 		ip_squeues_per_cpu = MIN_SQUEUES_PER_CPU;
2431184Skrgopi 	else if (ip_squeues_per_cpu > MAX_SQUEUES_PER_CPU)
2441184Skrgopi 		ip_squeues_per_cpu = MAX_SQUEUES_PER_CPU;
2450Sstevel@tonic-gate 
2460Sstevel@tonic-gate 	ip_squeue_create_callback = callback;
2470Sstevel@tonic-gate 	squeue_init();
2480Sstevel@tonic-gate 	sqset_global_list =
2490Sstevel@tonic-gate 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
2500Sstevel@tonic-gate 	sqset_global_size = 0;
2510Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
2520Sstevel@tonic-gate 
2530Sstevel@tonic-gate 	/* Create squeue for each active CPU available */
2540Sstevel@tonic-gate 	for (i = 0; i < NCPU; i++) {
2550Sstevel@tonic-gate 		cpu_t *cp = cpu[i];
2560Sstevel@tonic-gate 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
2570Sstevel@tonic-gate 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
2580Sstevel@tonic-gate 		}
2590Sstevel@tonic-gate 	}
2600Sstevel@tonic-gate 
2610Sstevel@tonic-gate 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
2620Sstevel@tonic-gate 
2630Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
2640Sstevel@tonic-gate 
2650Sstevel@tonic-gate 	if (ip_squeue_profile)
2660Sstevel@tonic-gate 		squeue_profile_start();
2670Sstevel@tonic-gate }
2680Sstevel@tonic-gate 
2690Sstevel@tonic-gate /*
2700Sstevel@tonic-gate  * Get squeue_t structure based on index.
2710Sstevel@tonic-gate  * Since the squeue list can only grow, no need to grab any lock.
2720Sstevel@tonic-gate  */
2730Sstevel@tonic-gate squeue_t *
2740Sstevel@tonic-gate ip_squeue_random(uint_t index)
2750Sstevel@tonic-gate {
2760Sstevel@tonic-gate 	squeue_set_t *sqs;
2770Sstevel@tonic-gate 
2780Sstevel@tonic-gate 	sqs = sqset_global_list[index % sqset_global_size];
2790Sstevel@tonic-gate 	return (sqs->sqs_list[index % sqs->sqs_size]);
2800Sstevel@tonic-gate }
2810Sstevel@tonic-gate 
2820Sstevel@tonic-gate /* ARGSUSED */
2830Sstevel@tonic-gate void
2840Sstevel@tonic-gate ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
2850Sstevel@tonic-gate {
2860Sstevel@tonic-gate 	squeue_t	*sqp = arg2;
2870Sstevel@tonic-gate 	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
2880Sstevel@tonic-gate 	ill_t		*ill;
2890Sstevel@tonic-gate 
2900Sstevel@tonic-gate 	ASSERT(sqp != NULL);
2910Sstevel@tonic-gate 
2920Sstevel@tonic-gate 	if (ring == NULL) {
2930Sstevel@tonic-gate 		return;
2940Sstevel@tonic-gate 	}
2950Sstevel@tonic-gate 
2960Sstevel@tonic-gate 	/*
2970Sstevel@tonic-gate 	 * Clean up squeue
2980Sstevel@tonic-gate 	 */
2990Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
3000Sstevel@tonic-gate 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
3010Sstevel@tonic-gate 	sqp->sq_rx_ring = NULL;
3020Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
3030Sstevel@tonic-gate 
3040Sstevel@tonic-gate 	ill = ring->rr_ill;
3051184Skrgopi 	if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
3061184Skrgopi 		ASSERT(ring->rr_handle != NULL);
3071184Skrgopi 		ill->ill_dls_capab->ill_dls_unbind(ring->rr_handle);
3081184Skrgopi 	}
3090Sstevel@tonic-gate 
3100Sstevel@tonic-gate 	/*
3110Sstevel@tonic-gate 	 * Cleanup the ring
3120Sstevel@tonic-gate 	 */
3130Sstevel@tonic-gate 
3140Sstevel@tonic-gate 	ring->rr_blank = NULL;
3150Sstevel@tonic-gate 	ring->rr_handle = NULL;
3160Sstevel@tonic-gate 	ring->rr_sqp = NULL;
3170Sstevel@tonic-gate 
3180Sstevel@tonic-gate 	/*
3190Sstevel@tonic-gate 	 * Signal ill that cleanup is done
3200Sstevel@tonic-gate 	 */
3210Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
3220Sstevel@tonic-gate 	ring->rr_ring_state = ILL_RING_FREE;
3230Sstevel@tonic-gate 	cv_signal(&ill->ill_cv);
3240Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
3250Sstevel@tonic-gate }
3260Sstevel@tonic-gate 
3270Sstevel@tonic-gate typedef struct ip_taskq_arg {
3280Sstevel@tonic-gate 	ill_t		*ip_taskq_ill;
3290Sstevel@tonic-gate 	ill_rx_ring_t	*ip_taskq_ill_rx_ring;
3300Sstevel@tonic-gate 	cpu_t		*ip_taskq_cpu;
3310Sstevel@tonic-gate } ip_taskq_arg_t;
3320Sstevel@tonic-gate 
3330Sstevel@tonic-gate /*
3340Sstevel@tonic-gate  * Do a Rx ring to squeue binding. Find a unique squeue that is not
3350Sstevel@tonic-gate  * managing a receive ring. If no such squeue exists, dynamically
3360Sstevel@tonic-gate  * create a new one in the squeue set.
3370Sstevel@tonic-gate  *
3380Sstevel@tonic-gate  * The function runs via the system taskq. The ill passed as an
3390Sstevel@tonic-gate  * argument can't go away since we hold a ref. The lock order is
3400Sstevel@tonic-gate  * ill_lock -> sqs_lock -> sq_lock.
3410Sstevel@tonic-gate  *
3420Sstevel@tonic-gate  * If we are binding a Rx ring to a squeue attached to the offline CPU,
3430Sstevel@tonic-gate  * no need to check that because squeues are never destroyed once
3440Sstevel@tonic-gate  * created.
3450Sstevel@tonic-gate  */
3460Sstevel@tonic-gate /* ARGSUSED */
3470Sstevel@tonic-gate static void
3480Sstevel@tonic-gate ip_squeue_extend(void *arg)
3490Sstevel@tonic-gate {
3500Sstevel@tonic-gate 	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
3510Sstevel@tonic-gate 	ill_t		*ill = sq_arg->ip_taskq_ill;
3520Sstevel@tonic-gate 	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
3530Sstevel@tonic-gate 	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
3541184Skrgopi 	squeue_set_t 	*sqs;
3550Sstevel@tonic-gate 	squeue_t 	*sqp = NULL;
3560Sstevel@tonic-gate 
3570Sstevel@tonic-gate 	ASSERT(ill != NULL);
3580Sstevel@tonic-gate 	ASSERT(ill_rx_ring != NULL);
3590Sstevel@tonic-gate 	kmem_free(arg, sizeof (ip_taskq_arg_t));
3600Sstevel@tonic-gate 
3611184Skrgopi 	/*
3621184Skrgopi 	 * Make sure the CPU that originally took the interrupt still
3631184Skrgopi 	 * exists.
3641184Skrgopi 	 */
3651184Skrgopi 	if (!CPU_ISON(intr_cpu))
3661184Skrgopi 		intr_cpu = CPU;
3671184Skrgopi 
3680Sstevel@tonic-gate 	sqs = intr_cpu->cpu_squeue_set;
3690Sstevel@tonic-gate 
3700Sstevel@tonic-gate 	/*
3710Sstevel@tonic-gate 	 * If this ill represents link aggregation, then there might be
3720Sstevel@tonic-gate 	 * multiple NICs trying to register them selves at the same time
3730Sstevel@tonic-gate 	 * and in order to ensure that test and assignment of free rings
3740Sstevel@tonic-gate 	 * is sequential, we need to hold the ill_lock.
3750Sstevel@tonic-gate 	 */
3760Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
3771184Skrgopi 	sqp = ip_find_unused_squeue(sqs, intr_cpu, B_FALSE);
3781184Skrgopi 	if (sqp == NULL) {
3791184Skrgopi 		/*
3801184Skrgopi 		 * We hit the max limit of squeues allowed per CPU.
3811184Skrgopi 		 * Assign this rx_ring to DEFAULT squeue of the
3821184Skrgopi 		 * interrupted CPU but the squeue will not manage
3831184Skrgopi 		 * the ring. Also print a warning.
3841184Skrgopi 		 */
3851184Skrgopi 		cmn_err(CE_NOTE, "ip_squeue_extend: CPU/sqset = %d/%p already "
3861184Skrgopi 		    "has max number of squeues. System performance might "
3871184Skrgopi 		    "become suboptimal\n", sqs->sqs_bind, (void *)sqs);
3881184Skrgopi 
3891184Skrgopi 		/* the first squeue in the list is the default squeue */
3901184Skrgopi 		sqp = sqs->sqs_list[0];
3911184Skrgopi 		ASSERT(sqp != NULL);
3921184Skrgopi 		ill_rx_ring->rr_sqp = sqp;
3931184Skrgopi 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
3941184Skrgopi 
3951184Skrgopi 		mutex_exit(&ill->ill_lock);
3961184Skrgopi 		ill_waiter_dcr(ill);
3971184Skrgopi 		return;
3981184Skrgopi 	}
3991184Skrgopi 
4001184Skrgopi 	ASSERT(MUTEX_HELD(&sqp->sq_lock));
4011184Skrgopi 	sqp->sq_rx_ring = ill_rx_ring;
4021184Skrgopi 	ill_rx_ring->rr_sqp = sqp;
4031184Skrgopi 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
4041184Skrgopi 
4051184Skrgopi 	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
4061184Skrgopi 	mutex_exit(&sqp->sq_lock);
4071184Skrgopi 
4081184Skrgopi 	mutex_exit(&ill->ill_lock);
4091184Skrgopi 
4101184Skrgopi 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
4111184Skrgopi 	ill_waiter_dcr(ill);
4121184Skrgopi }
4131184Skrgopi 
4141184Skrgopi /*
4151184Skrgopi  * Do a Rx ring to squeue binding. Find a unique squeue that is not
4161184Skrgopi  * managing a receive ring. If no such squeue exists, dynamically
4171184Skrgopi  * create a new one in the squeue set.
4181184Skrgopi  *
4191184Skrgopi  * The function runs via the system taskq. The ill passed as an
4201184Skrgopi  * argument can't go away since we hold a ref. The lock order is
4211184Skrgopi  * ill_lock -> sqs_lock -> sq_lock.
4221184Skrgopi  *
4231184Skrgopi  * If we are binding a Rx ring to a squeue attached to the offline CPU,
4241184Skrgopi  * no need to check that because squeues are never destroyed once
4251184Skrgopi  * created.
4261184Skrgopi  */
4271184Skrgopi /* ARGSUSED */
4281184Skrgopi static void
4291184Skrgopi ip_squeue_soft_ring_affinity(void *arg)
4301184Skrgopi {
4311184Skrgopi 	ip_taskq_arg_t		*sq_arg = (ip_taskq_arg_t *)arg;
4321184Skrgopi 	ill_t			*ill = sq_arg->ip_taskq_ill;
4331184Skrgopi 	ill_dls_capab_t	*ill_soft_ring = ill->ill_dls_capab;
4341184Skrgopi 	ill_rx_ring_t		*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
4351184Skrgopi 	cpu_t			*intr_cpu = sq_arg->ip_taskq_cpu;
4361184Skrgopi 	cpu_t			*bind_cpu;
4371184Skrgopi 	int			cpu_id = intr_cpu->cpu_id;
4381184Skrgopi 	int			min_cpu_id, max_cpu_id;
4391184Skrgopi 	boolean_t		enough_uniq_cpus = B_FALSE;
4401184Skrgopi 	boolean_t		enough_cpus = B_FALSE;
4411184Skrgopi 	squeue_set_t 		*sqs, *last_sqs;
4421184Skrgopi 	squeue_t 		*sqp = NULL;
4431184Skrgopi 	int			i, j;
4441184Skrgopi 
4451184Skrgopi 	ASSERT(ill != NULL);
4461184Skrgopi 	kmem_free(arg, sizeof (ip_taskq_arg_t));
4471184Skrgopi 
4481184Skrgopi 	/*
4491184Skrgopi 	 * Make sure the CPU that originally took the interrupt still
4501184Skrgopi 	 * exists.
4511184Skrgopi 	 */
4521184Skrgopi 	if (!CPU_ISON(intr_cpu)) {
4531184Skrgopi 		intr_cpu = CPU;
4541184Skrgopi 		cpu_id = intr_cpu->cpu_id;
4551184Skrgopi 	}
4561184Skrgopi 
4571184Skrgopi 	/*
4581184Skrgopi 	 * If this ill represents link aggregation, then there might be
4591184Skrgopi 	 * multiple NICs trying to register them selves at the same time
4601184Skrgopi 	 * and in order to ensure that test and assignment of free rings
4611184Skrgopi 	 * is sequential, we need to hold the ill_lock.
4621184Skrgopi 	 */
4631184Skrgopi 	mutex_enter(&ill->ill_lock);
4641184Skrgopi 
4651184Skrgopi 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
4661184Skrgopi 		mutex_exit(&ill->ill_lock);
4671184Skrgopi 		return;
4681184Skrgopi 	}
4691184Skrgopi 	/*
4701184Skrgopi 	 * We need to fanout the interrupts from the NIC. We do that by
4711184Skrgopi 	 * telling the driver underneath to create soft rings and use
4721184Skrgopi 	 * worker threads (if the driver advertized SOFT_RING capability)
4731184Skrgopi 	 * Its still a big performance win to if we can fanout to the
4741184Skrgopi 	 * threads on the same core that is taking interrupts.
4751184Skrgopi 	 *
4761184Skrgopi 	 * Since we don't know the interrupt to CPU binding, we don't
4771184Skrgopi 	 * assign any squeues or affinity to worker threads in the NIC.
4781184Skrgopi 	 * At the time of the first interrupt, we know which CPU is
4791184Skrgopi 	 * taking interrupts and try to find other threads on the same
4801184Skrgopi 	 * core. Assuming, ip_threads_per_cpu is correct and cpus are
4811184Skrgopi 	 * numbered sequentially for each core (XXX need something better
4821184Skrgopi 	 * than this in future), find the lowest number and highest
4831184Skrgopi 	 * number thread for that core.
4841184Skrgopi 	 *
4851184Skrgopi 	 * If we have one more thread per core than number of soft rings,
4861184Skrgopi 	 * then don't assign any worker threads to the H/W thread (cpu)
4871184Skrgopi 	 * taking interrupts (capability negotiation tries to ensure this)
4881184Skrgopi 	 *
4891184Skrgopi 	 * If the number of threads per core are same as the number of
4901184Skrgopi 	 * soft rings, then assign the worker affinity and squeue to
4911184Skrgopi 	 * the same cpu.
4921184Skrgopi 	 *
4931184Skrgopi 	 * Otherwise, just fanout to higher number CPUs starting from
4941184Skrgopi 	 * the interrupted CPU.
4951184Skrgopi 	 */
4961184Skrgopi 
4971184Skrgopi 	min_cpu_id = (cpu_id / ip_threads_per_cpu) * ip_threads_per_cpu;
4981184Skrgopi 	max_cpu_id = min_cpu_id + ip_threads_per_cpu;
4991184Skrgopi 
5001184Skrgopi 	cmn_err(CE_CONT, "soft_ring_affinity: min/max/intr = %d/%d/%d\n",
5011184Skrgopi 	    min_cpu_id, max_cpu_id, (int)intr_cpu->cpu_id);
5021184Skrgopi 
5031184Skrgopi 	/*
5041184Skrgopi 	 * Quickly check if there are enough CPUs present for fanout
5051184Skrgopi 	 * and also max_cpu_id is less than the id of the active CPU.
5061184Skrgopi 	 * We use the cpu_id stored in the last squeue_set to get
5071184Skrgopi 	 * an idea. The scheme is by no means perfect since it doesn't
5081184Skrgopi 	 * take into account CPU DR operations and the fact that
5091184Skrgopi 	 * interrupts themselves might change. An ideal scenario
5101184Skrgopi 	 * would be to ensure that interrupts run cpus by themselves
5111184Skrgopi 	 * and worker threads never have affinity to those CPUs. If
5121184Skrgopi 	 * the interrupts move to CPU which had a worker thread, it
5131184Skrgopi 	 * should be changed. Probably callbacks similar to CPU offline
5141184Skrgopi 	 * are needed to make it work perfectly.
5151184Skrgopi 	 */
5161184Skrgopi 	last_sqs = sqset_global_list[sqset_global_size - 1];
5171184Skrgopi 	if (ip_threads_per_cpu <= ncpus && max_cpu_id <= last_sqs->sqs_bind) {
5181184Skrgopi 		if ((max_cpu_id - min_cpu_id) >
5191184Skrgopi 		    ill_soft_ring->ill_dls_soft_ring_cnt)
5201184Skrgopi 			enough_uniq_cpus = B_TRUE;
5211184Skrgopi 		else if ((max_cpu_id - min_cpu_id) >=
5221184Skrgopi 		    ill_soft_ring->ill_dls_soft_ring_cnt)
5231184Skrgopi 			enough_cpus = B_TRUE;
5241184Skrgopi 	}
5251184Skrgopi 
5261184Skrgopi 	j = 0;
5271184Skrgopi 	for (i = 0; i < (ill_soft_ring->ill_dls_soft_ring_cnt + j); i++) {
5281184Skrgopi 		if (enough_uniq_cpus) {
5291184Skrgopi 			if ((min_cpu_id + i) == cpu_id) {
5301184Skrgopi 				j++;
5311184Skrgopi 				continue;
5321184Skrgopi 			}
5331184Skrgopi 			bind_cpu = cpu[min_cpu_id + i];
5341184Skrgopi 		} else if (enough_cpus) {
5351184Skrgopi 			bind_cpu = cpu[min_cpu_id + i];
5361184Skrgopi 		} else {
5371184Skrgopi 			/* bind_cpu = cpu[(cpu_id + i) % last_sqs->sqs_bind]; */
5381184Skrgopi 			bind_cpu = cpu[(cpu_id + i) % ncpus];
5391184Skrgopi 		}
5401184Skrgopi 
5411184Skrgopi 		/*
5421184Skrgopi 		 * Check if the CPU actually exist and active. If not,
5431184Skrgopi 		 * use the interrupted CPU. ip_find_unused_squeue() will
5441184Skrgopi 		 * find the right CPU to fanout anyway.
5451184Skrgopi 		 */
5461184Skrgopi 		if (!CPU_ISON(bind_cpu))
5471184Skrgopi 			bind_cpu = intr_cpu;
5481184Skrgopi 
5491184Skrgopi 		sqs = bind_cpu->cpu_squeue_set;
5501184Skrgopi 		ASSERT(sqs != NULL);
5511184Skrgopi 		ill_rx_ring = &ill_soft_ring->ill_ring_tbl[i - j];
5521184Skrgopi 
5531184Skrgopi 		sqp = ip_find_unused_squeue(sqs, bind_cpu, B_TRUE);
5541184Skrgopi 		if (sqp == NULL) {
5551184Skrgopi 			/*
5561184Skrgopi 			 * We hit the max limit of squeues allowed per CPU.
5571184Skrgopi 			 * Assign this rx_ring to DEFAULT squeue of the
5581184Skrgopi 			 * interrupted CPU but thesqueue will not manage
5591184Skrgopi 			 * the ring. Also print a warning.
5601184Skrgopi 			 */
5611184Skrgopi 			cmn_err(CE_NOTE, "ip_squeue_soft_ring: CPU/sqset = "
5621184Skrgopi 			    "%d/%p already has max number of squeues. System "
5631184Skrgopi 			    "performance might become suboptimal\n",
5641184Skrgopi 			    sqs->sqs_bind, (void *)sqs);
5651184Skrgopi 
5661184Skrgopi 			/* the first squeue in the list is the default squeue */
5671184Skrgopi 			sqp = intr_cpu->cpu_squeue_set->sqs_list[0];
5681184Skrgopi 			ASSERT(sqp != NULL);
5691184Skrgopi 
5701184Skrgopi 			ill_rx_ring->rr_sqp = sqp;
5711184Skrgopi 			ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
5721184Skrgopi 			continue;
5731184Skrgopi 
5741184Skrgopi 		}
5751184Skrgopi 		ASSERT(MUTEX_HELD(&sqp->sq_lock));
5761184Skrgopi 		ill_rx_ring->rr_sqp = sqp;
5771184Skrgopi 		sqp->sq_rx_ring = ill_rx_ring;
5781184Skrgopi 		ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
5791184Skrgopi 		sqp->sq_state |= SQS_ILL_BOUND;
5801184Skrgopi 
5811184Skrgopi 		/* assign affinity to soft ring */
5821184Skrgopi 		if (ip_squeue_bind && (sqp->sq_state & SQS_BOUND)) {
5831184Skrgopi 			ill_soft_ring->ill_dls_bind(ill_rx_ring->rr_handle,
5841184Skrgopi 			    sqp->sq_bind);
5851184Skrgopi 		}
5861184Skrgopi 		mutex_exit(&sqp->sq_lock);
5871184Skrgopi 
5881184Skrgopi 		cmn_err(CE_CONT, "soft_ring_affinity: ring = %d, bind = %d\n",
5891184Skrgopi 		    i - j, sqp->sq_bind);
5901184Skrgopi 	}
5911184Skrgopi 	mutex_exit(&ill->ill_lock);
5921184Skrgopi 
5931184Skrgopi 	ill_soft_ring->ill_dls_change_status(ill_soft_ring->ill_tx_handle,
5941184Skrgopi 	    SOFT_RING_SRC_HASH);
5951184Skrgopi 
596*1503Sericheng 	mutex_enter(&ill->ill_lock);
597*1503Sericheng 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
598*1503Sericheng 	mutex_exit(&ill->ill_lock);
599*1503Sericheng 
6001184Skrgopi 	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
6011184Skrgopi 	ill_waiter_dcr(ill);
6021184Skrgopi }
6031184Skrgopi 
6041184Skrgopi void
6051184Skrgopi ip_soft_ring_assignment(ill_t *ill, ill_rx_ring_t *ip_ring,
6061184Skrgopi mblk_t *mp_chain, size_t hdrlen)
6071184Skrgopi {
6081184Skrgopi 	ip_taskq_arg_t	*taskq_arg;
6091184Skrgopi 	boolean_t	refheld;
6101184Skrgopi 
6111184Skrgopi 	ASSERT(servicing_interrupt());
6121184Skrgopi 	ASSERT(ip_ring == NULL);
6131184Skrgopi 
6141184Skrgopi 	mutex_enter(&ill->ill_lock);
6151184Skrgopi 	if (!(ill->ill_state_flags & ILL_SOFT_RING_ASSIGN)) {
6161184Skrgopi 		taskq_arg = (ip_taskq_arg_t *)
6171184Skrgopi 		    kmem_zalloc(sizeof (ip_taskq_arg_t), KM_NOSLEEP);
6181184Skrgopi 
6191184Skrgopi 		if (taskq_arg == NULL)
6201184Skrgopi 			goto out;
6211184Skrgopi 
6221184Skrgopi 		taskq_arg->ip_taskq_ill = ill;
6231184Skrgopi 		taskq_arg->ip_taskq_ill_rx_ring = ip_ring;
6241184Skrgopi 		taskq_arg->ip_taskq_cpu = CPU;
6251184Skrgopi 
6261184Skrgopi 		/*
6271184Skrgopi 		 * Set ILL_SOFT_RING_ASSIGN flag. We don't want
6281184Skrgopi 		 * the next interrupt to schedule a task for calling
6291184Skrgopi 		 * ip_squeue_soft_ring_affinity();
6301184Skrgopi 		 */
6311184Skrgopi 		ill->ill_state_flags |= ILL_SOFT_RING_ASSIGN;
6321184Skrgopi 	} else {
6331184Skrgopi 		mutex_exit(&ill->ill_lock);
6341184Skrgopi 		goto out;
6351184Skrgopi 	}
6361184Skrgopi 	mutex_exit(&ill->ill_lock);
6371184Skrgopi 	refheld = ill_waiter_inc(ill);
6381184Skrgopi 	if (refheld) {
6391184Skrgopi 		if (taskq_dispatch(system_taskq,
6401184Skrgopi 		    ip_squeue_soft_ring_affinity, taskq_arg, TQ_NOSLEEP))
6411184Skrgopi 			goto out;
6421184Skrgopi 
6431184Skrgopi 		/* release ref on ill if taskq dispatch fails */
6441184Skrgopi 		ill_waiter_dcr(ill);
6451184Skrgopi 	}
6461184Skrgopi 	/*
6471184Skrgopi 	 * Turn on CAPAB_SOFT_RING so that affinity assignment
6481184Skrgopi 	 * can be tried again later.
6491184Skrgopi 	 */
6501184Skrgopi 	mutex_enter(&ill->ill_lock);
6511184Skrgopi 	ill->ill_state_flags &= ~ILL_SOFT_RING_ASSIGN;
6521184Skrgopi 	mutex_exit(&ill->ill_lock);
6531184Skrgopi 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
6541184Skrgopi 
6551184Skrgopi out:
6561184Skrgopi 	ip_input(ill, ip_ring, mp_chain, hdrlen);
6571184Skrgopi }
6581184Skrgopi 
6591184Skrgopi static squeue_t *
6601184Skrgopi ip_find_unused_squeue(squeue_set_t *sqs, cpu_t *bind_cpu, boolean_t fanout)
6611184Skrgopi {
6621184Skrgopi 	int 		i;
6631184Skrgopi 	squeue_set_t	*best_sqs = NULL;
6641184Skrgopi 	squeue_set_t	*curr_sqs = NULL;
6651184Skrgopi 	int		min_sq = 0;
6661184Skrgopi 	squeue_t 	*sqp = NULL;
6671184Skrgopi 	char		sqname[64];
6681184Skrgopi 
6691184Skrgopi 	/*
6701184Skrgopi 	 * If fanout is set and the passed squeue_set already has some
6711184Skrgopi 	 * squeues which are managing the NICs, try to find squeues on
6721184Skrgopi 	 * unused CPU.
6731184Skrgopi 	 */
6741184Skrgopi 	if (sqs->sqs_size > 1 && fanout) {
6751184Skrgopi 		/*
6761184Skrgopi 		 * First check to see if any squeue on the CPU passed
6771184Skrgopi 		 * is managing a NIC.
6781184Skrgopi 		 */
6791184Skrgopi 		for (i = 0; i < sqs->sqs_size; i++) {
6801184Skrgopi 			mutex_enter(&sqs->sqs_list[i]->sq_lock);
6811184Skrgopi 			if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) &&
6821184Skrgopi 			    !(sqs->sqs_list[i]->sq_state & SQS_DEFAULT)) {
6831184Skrgopi 				mutex_exit(&sqs->sqs_list[i]->sq_lock);
6841184Skrgopi 				break;
6851184Skrgopi 			}
6861184Skrgopi 			mutex_exit(&sqs->sqs_list[i]->sq_lock);
6871184Skrgopi 		}
6881184Skrgopi 		if (i != sqs->sqs_size) {
6891184Skrgopi 			best_sqs = sqset_global_list[sqset_global_size - 1];
6901184Skrgopi 			min_sq = best_sqs->sqs_size;
6911184Skrgopi 
6921184Skrgopi 			for (i = sqset_global_size - 2; i >= 0; i--) {
6931184Skrgopi 				curr_sqs = sqset_global_list[i];
6941184Skrgopi 				if (curr_sqs->sqs_size < min_sq) {
6951184Skrgopi 					best_sqs = curr_sqs;
6961184Skrgopi 					min_sq = curr_sqs->sqs_size;
6971184Skrgopi 				}
6981184Skrgopi 			}
6991184Skrgopi 
7001184Skrgopi 			ASSERT(best_sqs != NULL);
7011184Skrgopi 			sqs = best_sqs;
7021184Skrgopi 			bind_cpu = cpu[sqs->sqs_bind];
7031184Skrgopi 		}
7041184Skrgopi 	}
7051184Skrgopi 
7060Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
7071184Skrgopi 
7080Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
7090Sstevel@tonic-gate 		mutex_enter(&sqs->sqs_list[i]->sq_lock);
7101184Skrgopi 		if ((sqs->sqs_list[i]->sq_state &
7111184Skrgopi 		    (SQS_DEFAULT|SQS_ILL_BOUND)) == 0) {
7120Sstevel@tonic-gate 			sqp = sqs->sqs_list[i];
7130Sstevel@tonic-gate 			break;
7140Sstevel@tonic-gate 		}
7150Sstevel@tonic-gate 		mutex_exit(&sqs->sqs_list[i]->sq_lock);
7160Sstevel@tonic-gate 	}
7170Sstevel@tonic-gate 
7180Sstevel@tonic-gate 	if (sqp == NULL) {
7190Sstevel@tonic-gate 		/* Need to create a new squeue */
7200Sstevel@tonic-gate 		if (sqs->sqs_size == sqs->sqs_max_size) {
7210Sstevel@tonic-gate 			/*
7220Sstevel@tonic-gate 			 * Reached the max limit for squeue
7231184Skrgopi 			 * we can allocate on this CPU.
7240Sstevel@tonic-gate 			 */
7250Sstevel@tonic-gate 			mutex_exit(&sqs->sqs_lock);
7261184Skrgopi 			return (NULL);
7270Sstevel@tonic-gate 		}
7280Sstevel@tonic-gate 
7290Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
7300Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
7311184Skrgopi 		    "ip_squeue_cpu_%d/%d/%d", bind_cpu->cpu_seqid,
7321184Skrgopi 		    bind_cpu->cpu_id, sqs->sqs_size);
7330Sstevel@tonic-gate 
7341184Skrgopi 		sqp = squeue_create(sqname, bind_cpu->cpu_id,
7351184Skrgopi 		    ip_squeue_worker_wait, minclsyspri);
7360Sstevel@tonic-gate 
7370Sstevel@tonic-gate 		ASSERT(sqp != NULL);
7380Sstevel@tonic-gate 
7390Sstevel@tonic-gate 		squeue_profile_enable(sqp);
7400Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
7410Sstevel@tonic-gate 
7420Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
7430Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
7440Sstevel@tonic-gate 
7451184Skrgopi 		mutex_enter(&cpu_lock);
7461184Skrgopi 		if (ip_squeue_bind && cpu_is_online(bind_cpu)) {
7470Sstevel@tonic-gate 			squeue_bind(sqp, -1);
7480Sstevel@tonic-gate 		}
7491184Skrgopi 		mutex_exit(&cpu_lock);
7501184Skrgopi 
7510Sstevel@tonic-gate 		mutex_enter(&sqp->sq_lock);
7520Sstevel@tonic-gate 	}
7530Sstevel@tonic-gate 
7541184Skrgopi 	mutex_exit(&sqs->sqs_lock);
7550Sstevel@tonic-gate 	ASSERT(sqp != NULL);
7561184Skrgopi 	return (sqp);
7570Sstevel@tonic-gate }
7580Sstevel@tonic-gate 
7590Sstevel@tonic-gate /*
7600Sstevel@tonic-gate  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
7610Sstevel@tonic-gate  * owned by a squeue yet, do the assignment. When the NIC registers it
7620Sstevel@tonic-gate  * Rx rings with IP, we don't know where the interrupts will land and
7630Sstevel@tonic-gate  * hence we need to wait till this point to do the assignment.
7640Sstevel@tonic-gate  */
7650Sstevel@tonic-gate squeue_t *
7660Sstevel@tonic-gate ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
7670Sstevel@tonic-gate {
7680Sstevel@tonic-gate 	squeue_t 	*sqp;
7690Sstevel@tonic-gate 	ill_t 		*ill;
7700Sstevel@tonic-gate 	int		interrupt;
7710Sstevel@tonic-gate 	ip_taskq_arg_t	*taskq_arg;
7720Sstevel@tonic-gate 	boolean_t	refheld;
7730Sstevel@tonic-gate 
7740Sstevel@tonic-gate 	if (ill_rx_ring == NULL)
7750Sstevel@tonic-gate 		return (IP_SQUEUE_GET(lbolt));
7760Sstevel@tonic-gate 
7770Sstevel@tonic-gate 	sqp = ill_rx_ring->rr_sqp;
7780Sstevel@tonic-gate 	/*
7790Sstevel@tonic-gate 	 * Do a quick check. If it's not NULL, we are done.
7800Sstevel@tonic-gate 	 * Squeues are never destroyed so worse we will bind
7810Sstevel@tonic-gate 	 * this connection to a suboptimal squeue.
7820Sstevel@tonic-gate 	 *
7830Sstevel@tonic-gate 	 * This is the fast path case.
7840Sstevel@tonic-gate 	 */
7850Sstevel@tonic-gate 	if (sqp != NULL)
7860Sstevel@tonic-gate 		return (sqp);
7870Sstevel@tonic-gate 
7880Sstevel@tonic-gate 	ill = ill_rx_ring->rr_ill;
7890Sstevel@tonic-gate 	ASSERT(ill != NULL);
7900Sstevel@tonic-gate 
7910Sstevel@tonic-gate 	interrupt = servicing_interrupt();
7920Sstevel@tonic-gate 	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
7930Sstevel@tonic-gate 	    KM_NOSLEEP);
7940Sstevel@tonic-gate 
7950Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
7960Sstevel@tonic-gate 	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
7970Sstevel@tonic-gate 		taskq_arg == NULL) {
7980Sstevel@tonic-gate 		/*
7990Sstevel@tonic-gate 		 * Do the ring to squeue binding only if we are in interrupt
8000Sstevel@tonic-gate 		 * context and there is no one else trying the bind already.
8010Sstevel@tonic-gate 		 */
8020Sstevel@tonic-gate 		mutex_exit(&ill->ill_lock);
8030Sstevel@tonic-gate 		if (taskq_arg != NULL)
8040Sstevel@tonic-gate 			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
8050Sstevel@tonic-gate 		return (IP_SQUEUE_GET(lbolt));
8060Sstevel@tonic-gate 	}
8070Sstevel@tonic-gate 
8080Sstevel@tonic-gate 	/*
8090Sstevel@tonic-gate 	 * No sqp assigned yet. Can't really do that in interrupt
8100Sstevel@tonic-gate 	 * context. Assign the default sqp to this connection and
8110Sstevel@tonic-gate 	 * trigger creation of new sqp and binding it to this ring
8120Sstevel@tonic-gate 	 * via taskq. Need to make sure ill stays around.
8130Sstevel@tonic-gate 	 */
8140Sstevel@tonic-gate 	taskq_arg->ip_taskq_ill = ill;
8150Sstevel@tonic-gate 	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
8160Sstevel@tonic-gate 	taskq_arg->ip_taskq_cpu = CPU;
8170Sstevel@tonic-gate 	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
8180Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
8190Sstevel@tonic-gate 	refheld = ill_waiter_inc(ill);
8200Sstevel@tonic-gate 	if (refheld) {
8210Sstevel@tonic-gate 		if (taskq_dispatch(system_taskq, ip_squeue_extend,
8220Sstevel@tonic-gate 		    taskq_arg, TQ_NOSLEEP) != NULL) {
8230Sstevel@tonic-gate 			return (IP_SQUEUE_GET(lbolt));
8240Sstevel@tonic-gate 		}
8250Sstevel@tonic-gate 	}
8260Sstevel@tonic-gate 	/*
8270Sstevel@tonic-gate 	 * The ill is closing and we could not get a reference on the ill OR
8280Sstevel@tonic-gate 	 * taskq_dispatch failed probably due to memory allocation failure.
8290Sstevel@tonic-gate 	 * We will try again next time.
8300Sstevel@tonic-gate 	 */
8310Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
8320Sstevel@tonic-gate 	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
8330Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
8340Sstevel@tonic-gate 	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
8350Sstevel@tonic-gate 	if (refheld)
8360Sstevel@tonic-gate 		ill_waiter_dcr(ill);
8370Sstevel@tonic-gate 
8380Sstevel@tonic-gate 	return (IP_SQUEUE_GET(lbolt));
8390Sstevel@tonic-gate }
8400Sstevel@tonic-gate 
8410Sstevel@tonic-gate /*
8420Sstevel@tonic-gate  * NDD hooks for setting ip_squeue_xxx tuneables.
8430Sstevel@tonic-gate  */
8440Sstevel@tonic-gate 
8450Sstevel@tonic-gate /* ARGSUSED */
8460Sstevel@tonic-gate int
8470Sstevel@tonic-gate ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
8480Sstevel@tonic-gate     caddr_t addr, cred_t *cr)
8490Sstevel@tonic-gate {
8500Sstevel@tonic-gate 	int *bind_enabled = (int *)addr;
8510Sstevel@tonic-gate 	long new_value;
8520Sstevel@tonic-gate 	int i;
8530Sstevel@tonic-gate 
8540Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
8550Sstevel@tonic-gate 		return (EINVAL);
8560Sstevel@tonic-gate 
8570Sstevel@tonic-gate 	if (ip_squeue_bind == new_value)
8580Sstevel@tonic-gate 		return (0);
8590Sstevel@tonic-gate 
8600Sstevel@tonic-gate 	*bind_enabled = new_value;
8610Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
8620Sstevel@tonic-gate 	if (new_value == 0) {
8630Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
8640Sstevel@tonic-gate 			ip_squeue_set_unbind(sqset_global_list[i]);
8650Sstevel@tonic-gate 	} else {
8660Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
8670Sstevel@tonic-gate 			ip_squeue_set_bind(sqset_global_list[i]);
8680Sstevel@tonic-gate 	}
8690Sstevel@tonic-gate 
8700Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
8710Sstevel@tonic-gate 	return (0);
8720Sstevel@tonic-gate }
8730Sstevel@tonic-gate 
8740Sstevel@tonic-gate /*
8750Sstevel@tonic-gate  * Set squeue profiling.
8760Sstevel@tonic-gate  * 0 means "disable"
8770Sstevel@tonic-gate  * 1 means "enable"
8780Sstevel@tonic-gate  * 2 means "enable and reset"
8790Sstevel@tonic-gate  */
8800Sstevel@tonic-gate /* ARGSUSED */
8810Sstevel@tonic-gate int
8820Sstevel@tonic-gate ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
8830Sstevel@tonic-gate     cred_t *cr)
8840Sstevel@tonic-gate {
8850Sstevel@tonic-gate 	int *profile_enabled = (int *)cp;
8860Sstevel@tonic-gate 	long new_value;
8870Sstevel@tonic-gate 	squeue_set_t *sqs;
8880Sstevel@tonic-gate 
8890Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
8900Sstevel@tonic-gate 		return (EINVAL);
8910Sstevel@tonic-gate 
8920Sstevel@tonic-gate 	if (new_value == 0)
8930Sstevel@tonic-gate 		squeue_profile_stop();
8940Sstevel@tonic-gate 	else if (new_value == 1)
8950Sstevel@tonic-gate 		squeue_profile_start();
8960Sstevel@tonic-gate 	else if (new_value == 2) {
8970Sstevel@tonic-gate 		int i, j;
8980Sstevel@tonic-gate 
8990Sstevel@tonic-gate 		squeue_profile_stop();
9000Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
9010Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
9020Sstevel@tonic-gate 			sqs = sqset_global_list[i];
9030Sstevel@tonic-gate 			for (j = 0; j < sqs->sqs_size; j++) {
9040Sstevel@tonic-gate 				squeue_profile_reset(sqs->sqs_list[j]);
9050Sstevel@tonic-gate 			}
9060Sstevel@tonic-gate 		}
9070Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
9080Sstevel@tonic-gate 
9090Sstevel@tonic-gate 		new_value = 1;
9100Sstevel@tonic-gate 		squeue_profile_start();
9110Sstevel@tonic-gate 	}
9120Sstevel@tonic-gate 	*profile_enabled = new_value;
9130Sstevel@tonic-gate 
9140Sstevel@tonic-gate 	return (0);
9150Sstevel@tonic-gate }
9160Sstevel@tonic-gate 
9170Sstevel@tonic-gate /*
9180Sstevel@tonic-gate  * Reconfiguration callback
9190Sstevel@tonic-gate  */
9200Sstevel@tonic-gate 
9210Sstevel@tonic-gate /* ARGSUSED */
9220Sstevel@tonic-gate static int
9230Sstevel@tonic-gate ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
9240Sstevel@tonic-gate {
9250Sstevel@tonic-gate 	cpu_t *cp = cpu[id];
9260Sstevel@tonic-gate 
9270Sstevel@tonic-gate 	ASSERT(MUTEX_HELD(&cpu_lock));
9280Sstevel@tonic-gate 	switch (what) {
929405Sakolb 	case CPU_CONFIG:
930405Sakolb 		/*
931405Sakolb 		 * A new CPU is added. Create an squeue for it but do not bind
932405Sakolb 		 * it yet.
933405Sakolb 		 */
934405Sakolb 		if (cp->cpu_squeue_set == NULL)
935405Sakolb 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
936405Sakolb 		break;
9370Sstevel@tonic-gate 	case CPU_ON:
9380Sstevel@tonic-gate 	case CPU_INIT:
9390Sstevel@tonic-gate 	case CPU_CPUPART_IN:
9400Sstevel@tonic-gate 		if (cp->cpu_squeue_set == NULL) {
9410Sstevel@tonic-gate 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
9420Sstevel@tonic-gate 		}
9430Sstevel@tonic-gate 		if (ip_squeue_bind)
9440Sstevel@tonic-gate 			ip_squeue_set_bind(cp->cpu_squeue_set);
9450Sstevel@tonic-gate 		break;
9460Sstevel@tonic-gate 	case CPU_UNCONFIG:
9470Sstevel@tonic-gate 	case CPU_OFF:
9480Sstevel@tonic-gate 	case CPU_CPUPART_OUT:
9490Sstevel@tonic-gate 		ASSERT((cp->cpu_squeue_set != NULL) ||
9500Sstevel@tonic-gate 		    (cp->cpu_flags & CPU_OFFLINE));
9510Sstevel@tonic-gate 
9520Sstevel@tonic-gate 		if (cp->cpu_squeue_set != NULL) {
9530Sstevel@tonic-gate 			ip_squeue_set_unbind(cp->cpu_squeue_set);
9540Sstevel@tonic-gate 		}
9550Sstevel@tonic-gate 		break;
9560Sstevel@tonic-gate 	default:
9570Sstevel@tonic-gate 		break;
9580Sstevel@tonic-gate 	}
9590Sstevel@tonic-gate 	return (0);
9600Sstevel@tonic-gate }
9610Sstevel@tonic-gate 
9620Sstevel@tonic-gate /* ARGSUSED */
9630Sstevel@tonic-gate static void
9640Sstevel@tonic-gate ip_squeue_set_bind(squeue_set_t *sqs)
9650Sstevel@tonic-gate {
9660Sstevel@tonic-gate 	int i;
9670Sstevel@tonic-gate 	squeue_t *sqp;
9680Sstevel@tonic-gate 
9690Sstevel@tonic-gate 	if (!ip_squeue_bind)
9700Sstevel@tonic-gate 		return;
9710Sstevel@tonic-gate 
9720Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
9730Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
9740Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
9750Sstevel@tonic-gate 		if (sqp->sq_state & SQS_BOUND)
9760Sstevel@tonic-gate 			continue;
9770Sstevel@tonic-gate 		squeue_bind(sqp, -1);
9780Sstevel@tonic-gate 	}
9790Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
9800Sstevel@tonic-gate }
9810Sstevel@tonic-gate 
9820Sstevel@tonic-gate static void
9830Sstevel@tonic-gate ip_squeue_set_unbind(squeue_set_t *sqs)
9840Sstevel@tonic-gate {
9850Sstevel@tonic-gate 	int i;
9860Sstevel@tonic-gate 	squeue_t *sqp;
9870Sstevel@tonic-gate 
9880Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
9890Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
9900Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
9911184Skrgopi 
9921184Skrgopi 		/*
9931184Skrgopi 		 * CPU is going offline. Remove the thread affinity
9941184Skrgopi 		 * for any soft ring threads the squeue is managing.
9951184Skrgopi 		 */
9961184Skrgopi 		if (sqp->sq_state & SQS_ILL_BOUND) {
9971184Skrgopi 			ill_rx_ring_t	*ring = sqp->sq_rx_ring;
9981184Skrgopi 			ill_t		*ill = ring->rr_ill;
9991184Skrgopi 
10001184Skrgopi 			if (ill->ill_capabilities & ILL_CAPAB_SOFT_RING) {
10011184Skrgopi 				ASSERT(ring->rr_handle != NULL);
10021184Skrgopi 				ill->ill_dls_capab->ill_dls_unbind(
10031184Skrgopi 					ring->rr_handle);
10041184Skrgopi 			}
10051184Skrgopi 		}
10060Sstevel@tonic-gate 		if (!(sqp->sq_state & SQS_BOUND))
10070Sstevel@tonic-gate 			continue;
10080Sstevel@tonic-gate 		squeue_unbind(sqp);
10090Sstevel@tonic-gate 	}
10100Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
10110Sstevel@tonic-gate }
1012