xref: /onnv-gate/usr/src/uts/common/inet/ip/ip_squeue.c (revision 405:7d2868bd61bb)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
50Sstevel@tonic-gate  * Common Development and Distribution License, Version 1.0 only
60Sstevel@tonic-gate  * (the "License").  You may not use this file except in compliance
70Sstevel@tonic-gate  * with the License.
80Sstevel@tonic-gate  *
90Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
100Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
110Sstevel@tonic-gate  * See the License for the specific language governing permissions
120Sstevel@tonic-gate  * and limitations under the License.
130Sstevel@tonic-gate  *
140Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
150Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
160Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
170Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
180Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
190Sstevel@tonic-gate  *
200Sstevel@tonic-gate  * CDDL HEADER END
210Sstevel@tonic-gate  */
220Sstevel@tonic-gate /*
230Sstevel@tonic-gate  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
240Sstevel@tonic-gate  * Use is subject to license terms.
250Sstevel@tonic-gate  */
260Sstevel@tonic-gate 
270Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * IP interface to squeues.
310Sstevel@tonic-gate  *
320Sstevel@tonic-gate  * IP creates an squeue instance for each CPU. The squeue pointer is saved in
330Sstevel@tonic-gate  * cpu_squeue field of the cpu structure. Each squeue is associated with a
340Sstevel@tonic-gate  * connection instance (conn_t).
350Sstevel@tonic-gate  *
360Sstevel@tonic-gate  * For CPUs available at system startup time the squeue creation and association
370Sstevel@tonic-gate  * with CPU happens at MP initialization time. For CPUs added during dynamic
380Sstevel@tonic-gate  * reconfiguration, the initialization happens when the new CPU is configured in
390Sstevel@tonic-gate  * the system. The squeue is chosen using IP_SQUEUE_GET macro which will either
400Sstevel@tonic-gate  * return per-CPU squeue or random squeue based on the ip_squeue_fanout
410Sstevel@tonic-gate  * variable.
420Sstevel@tonic-gate  *
430Sstevel@tonic-gate  * There are two modes of associating connection with squeues. The first mode
440Sstevel@tonic-gate  * associates each connection with the CPU that creates the connection (either
450Sstevel@tonic-gate  * during open time or during accept time). The second mode associates each
460Sstevel@tonic-gate  * connection with a random CPU, effectively distributing load over all CPUs
470Sstevel@tonic-gate  * and all squeues in the system. The mode is controlled by the
480Sstevel@tonic-gate  * ip_squeue_fanout variable.
490Sstevel@tonic-gate  *
500Sstevel@tonic-gate  * NOTE: The fact that there is an association between each connection and
510Sstevel@tonic-gate  * squeue and squeue and CPU does not mean that each connection is always
520Sstevel@tonic-gate  * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
530Sstevel@tonic-gate  * may process the connection on whatever CPU it is scheduled. The squeue to CPU
540Sstevel@tonic-gate  * binding is only relevant for the worker thread.
550Sstevel@tonic-gate  *
560Sstevel@tonic-gate  * The list of all created squeues is kept in squeue_set structure. This list is
570Sstevel@tonic-gate  * used when ip_squeue_fanout is set and the load is distributed across all
580Sstevel@tonic-gate  * squeues.
590Sstevel@tonic-gate  *
600Sstevel@tonic-gate  * INTERFACE:
610Sstevel@tonic-gate  *
620Sstevel@tonic-gate  * squeue_t *ip_squeue_get(hint)
630Sstevel@tonic-gate  *
640Sstevel@tonic-gate  * 	Find an squeue based on the 'hint' value. The hint is used as an index
650Sstevel@tonic-gate  * 	in the array of IP squeues available. The way hint is computed may
660Sstevel@tonic-gate  * 	affect the effectiveness of the squeue distribution. Currently squeues
670Sstevel@tonic-gate  * 	are assigned in round-robin fashion using lbolt as a hint.
680Sstevel@tonic-gate  *
690Sstevel@tonic-gate  *
700Sstevel@tonic-gate  * DR Notes
710Sstevel@tonic-gate  * ========
720Sstevel@tonic-gate  *
730Sstevel@tonic-gate  * The ip_squeue_init() registers a call-back function with the CPU DR
740Sstevel@tonic-gate  * subsystem using register_cpu_setup_func(). The call-back function does two
750Sstevel@tonic-gate  * things:
760Sstevel@tonic-gate  *
770Sstevel@tonic-gate  * o When the CPU is going off-line or unconfigured, the worker thread is
780Sstevel@tonic-gate  *	unbound from the CPU. This allows the CPU unconfig code to move it to
790Sstevel@tonic-gate  *	another CPU.
800Sstevel@tonic-gate  *
810Sstevel@tonic-gate  * o When the CPU is going online, it creates a new squeue for this CPU if
820Sstevel@tonic-gate  *	necessary and binds the squeue worker thread to this CPU.
830Sstevel@tonic-gate  *
840Sstevel@tonic-gate  * TUNABLES:
850Sstevel@tonic-gate  *
860Sstevel@tonic-gate  * ip_squeue_bind: if set to 1 each squeue worker thread is bound to the CPU
870Sstevel@tonic-gate  * 	associated with an squeue instance.
880Sstevel@tonic-gate  *
890Sstevel@tonic-gate  * ip_squeue_profile: if set to 1 squeue profiling is enabled. NOTE: squeue.c
900Sstevel@tonic-gate  *	should be compiled with SQUEUE_PROFILE enabled for this variable to have
910Sstevel@tonic-gate  *	an impact.
920Sstevel@tonic-gate  *
930Sstevel@tonic-gate  * ip_squeue_fanout: if set to 1 use ip_squeue_get() to find an squeue,
940Sstevel@tonic-gate  *	otherwise get it from CPU->cpu_squeue.
950Sstevel@tonic-gate  *
960Sstevel@tonic-gate  * ip_squeue_bind, ip_squeue_profile and ip_squeue_fanout can be accessed and
970Sstevel@tonic-gate  * changed using ndd on /dev/tcp or /dev/ip.
980Sstevel@tonic-gate  *
990Sstevel@tonic-gate  * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
1000Sstevel@tonic-gate  *	created. This is the time squeue code waits before waking up the worker
1010Sstevel@tonic-gate  *	thread after queuing a request.
1020Sstevel@tonic-gate  */
1030Sstevel@tonic-gate 
1040Sstevel@tonic-gate #include <sys/types.h>
1050Sstevel@tonic-gate #include <sys/debug.h>
1060Sstevel@tonic-gate #include <sys/kmem.h>
1070Sstevel@tonic-gate #include <sys/cpuvar.h>
1080Sstevel@tonic-gate 
1090Sstevel@tonic-gate #include <sys/cmn_err.h>
1100Sstevel@tonic-gate 
1110Sstevel@tonic-gate #include <inet/common.h>
1120Sstevel@tonic-gate #include <inet/ip.h>
1130Sstevel@tonic-gate #include <inet/ip_if.h>
1140Sstevel@tonic-gate #include <inet/mi.h>
1150Sstevel@tonic-gate #include <inet/nd.h>
1160Sstevel@tonic-gate #include <inet/ipclassifier.h>
1170Sstevel@tonic-gate #include <sys/types.h>
1180Sstevel@tonic-gate #include <sys/conf.h>
1190Sstevel@tonic-gate #include <sys/sunddi.h>
1200Sstevel@tonic-gate #include <sys/ddi.h>
1210Sstevel@tonic-gate #include <sys/squeue_impl.h>
1220Sstevel@tonic-gate 
1230Sstevel@tonic-gate 
/*
 * We allow multiple NICs to bind to the same CPU but want to preserve 1 <-> 1
 * mapping between squeue and NIC (or Rx ring) for performance reasons so
 * each squeue can uniquely own a NIC or a Rx ring and do polling
 * (PSARC 2004/630). So we allow up to  MAX_THREAD_PER_CPU squeues per CPU.
 * We start by creating MIN_THREAD_PER_CPU squeues per CPU but more squeues
 * can be created dynamically as needed.
 */
#define	MAX_THREAD_PER_CPU	32	/* hard cap on squeues per CPU */
#define	MIN_THREAD_PER_CPU	1	/* squeues created per CPU at init */
/* Tunable; clamped to [MIN_THREAD_PER_CPU, MAX_THREAD_PER_CPU] at init. */
uint_t	ip_threads_per_cpu = MIN_THREAD_PER_CPU;
1350Sstevel@tonic-gate 
/*
 * List of all created squeue sets. The size is protected by cpu_lock
 */
squeue_set_t	**sqset_global_list;	/* NCPU slots, allocated at init */
uint_t		sqset_global_size;	/* number of valid entries above */

/* Tunable: bind each squeue worker thread to its CPU (settable via ndd). */
int ip_squeue_bind = B_TRUE;
/* Tunable: squeue profiling; squeue.c must be built with SQUEUE_PROFILE. */
int ip_squeue_profile = B_TRUE;
/* Invoked for every newly created squeue; set by ip_squeue_init(). */
static void (*ip_squeue_create_callback)(squeue_t *) = NULL;

/*
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 *	created. This is the time squeue code waits before waking up the worker
 *	thread after queuing a request.
 */
uint_t ip_squeue_worker_wait = 10;
1520Sstevel@tonic-gate 
1530Sstevel@tonic-gate static squeue_set_t *ip_squeue_set_create(cpu_t *, boolean_t);
1540Sstevel@tonic-gate static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
1550Sstevel@tonic-gate 
1560Sstevel@tonic-gate static void ip_squeue_set_bind(squeue_set_t *);
1570Sstevel@tonic-gate static void ip_squeue_set_unbind(squeue_set_t *);
1580Sstevel@tonic-gate 
/*
 * A CPU is usable when it exists and is active.  The argument is fully
 * parenthesized so that CPU_ISON(expr) expands correctly for any
 * expression, not just a plain identifier.
 */
#define	CPU_ISON(c)	((c) != NULL && CPU_ACTIVE(c) && \
	((c)->cpu_flags & CPU_EXISTS))
1600Sstevel@tonic-gate 
1610Sstevel@tonic-gate /*
1620Sstevel@tonic-gate  * Create squeue set containing ip_threads_per_cpu number of squeues
1630Sstevel@tonic-gate  * for this CPU and bind them all to the CPU.
1640Sstevel@tonic-gate  */
1650Sstevel@tonic-gate static squeue_set_t *
1660Sstevel@tonic-gate ip_squeue_set_create(cpu_t *cp, boolean_t reuse)
1670Sstevel@tonic-gate {
1680Sstevel@tonic-gate 	int i;
1690Sstevel@tonic-gate 	squeue_set_t	*sqs;
1700Sstevel@tonic-gate 	squeue_t 	*sqp;
1710Sstevel@tonic-gate 	char 		sqname[64];
1720Sstevel@tonic-gate 	processorid_t 	id = cp->cpu_id;
1730Sstevel@tonic-gate 
1740Sstevel@tonic-gate 	if (reuse) {
1750Sstevel@tonic-gate 		int i;
1760Sstevel@tonic-gate 
1770Sstevel@tonic-gate 		/*
1780Sstevel@tonic-gate 		 * We may already have an squeue created for this CPU. Try to
1790Sstevel@tonic-gate 		 * find one and reuse it if possible.
1800Sstevel@tonic-gate 		 */
1810Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
1820Sstevel@tonic-gate 			sqs = sqset_global_list[i];
1830Sstevel@tonic-gate 			if (id == sqs->sqs_bind)
1840Sstevel@tonic-gate 				return (sqs);
1850Sstevel@tonic-gate 		}
1860Sstevel@tonic-gate 	}
1870Sstevel@tonic-gate 
1880Sstevel@tonic-gate 	sqs = kmem_zalloc(sizeof (squeue_set_t) +
1890Sstevel@tonic-gate 	    (sizeof (squeue_t *) * MAX_THREAD_PER_CPU), KM_SLEEP);
1900Sstevel@tonic-gate 	mutex_init(&sqs->sqs_lock, NULL, MUTEX_DEFAULT, NULL);
1910Sstevel@tonic-gate 	sqs->sqs_list = (squeue_t **)&sqs[1];
1920Sstevel@tonic-gate 	sqs->sqs_max_size = MAX_THREAD_PER_CPU;
1930Sstevel@tonic-gate 	sqs->sqs_bind = id;
1940Sstevel@tonic-gate 
1950Sstevel@tonic-gate 	for (i = 0; i < ip_threads_per_cpu; i++) {
1960Sstevel@tonic-gate 		bzero(sqname, sizeof (sqname));
1970Sstevel@tonic-gate 
1980Sstevel@tonic-gate 		(void) snprintf(sqname, sizeof (sqname),
1990Sstevel@tonic-gate 		    "ip_squeue_cpu_%d/%d/%d", cp->cpu_seqid,
2000Sstevel@tonic-gate 		    cp->cpu_id, i);
2010Sstevel@tonic-gate 
2020Sstevel@tonic-gate 		sqp = squeue_create(sqname, id, ip_squeue_worker_wait,
2030Sstevel@tonic-gate 		    minclsyspri);
2040Sstevel@tonic-gate 
2050Sstevel@tonic-gate 		ASSERT(sqp != NULL);
2060Sstevel@tonic-gate 
2070Sstevel@tonic-gate 		squeue_profile_enable(sqp);
2080Sstevel@tonic-gate 		sqs->sqs_list[sqs->sqs_size++] = sqp;
2090Sstevel@tonic-gate 
2100Sstevel@tonic-gate 		if (ip_squeue_create_callback != NULL)
2110Sstevel@tonic-gate 			ip_squeue_create_callback(sqp);
2120Sstevel@tonic-gate 	}
2130Sstevel@tonic-gate 
214*405Sakolb 	if (ip_squeue_bind && cpu_is_online(cp))
2150Sstevel@tonic-gate 		ip_squeue_set_bind(sqs);
2160Sstevel@tonic-gate 
2170Sstevel@tonic-gate 	sqset_global_list[sqset_global_size++] = sqs;
2180Sstevel@tonic-gate 	ASSERT(sqset_global_size <= NCPU);
2190Sstevel@tonic-gate 	return (sqs);
2200Sstevel@tonic-gate }
2210Sstevel@tonic-gate 
2220Sstevel@tonic-gate /*
2230Sstevel@tonic-gate  * Initialize IP squeues.
2240Sstevel@tonic-gate  */
2250Sstevel@tonic-gate void
2260Sstevel@tonic-gate ip_squeue_init(void (*callback)(squeue_t *))
2270Sstevel@tonic-gate {
2280Sstevel@tonic-gate 	int i;
2290Sstevel@tonic-gate 
2300Sstevel@tonic-gate 	ASSERT(sqset_global_list == NULL);
2310Sstevel@tonic-gate 
2320Sstevel@tonic-gate 	if (ip_threads_per_cpu < MIN_THREAD_PER_CPU)
2330Sstevel@tonic-gate 		ip_threads_per_cpu = MIN_THREAD_PER_CPU;
2340Sstevel@tonic-gate 	else if (ip_threads_per_cpu > MAX_THREAD_PER_CPU)
2350Sstevel@tonic-gate 		ip_threads_per_cpu = MAX_THREAD_PER_CPU;
2360Sstevel@tonic-gate 
2370Sstevel@tonic-gate 	ip_squeue_create_callback = callback;
2380Sstevel@tonic-gate 	squeue_init();
2390Sstevel@tonic-gate 	sqset_global_list =
2400Sstevel@tonic-gate 	    kmem_zalloc(sizeof (squeue_set_t *) * NCPU, KM_SLEEP);
2410Sstevel@tonic-gate 	sqset_global_size = 0;
2420Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
2430Sstevel@tonic-gate 
2440Sstevel@tonic-gate 	/* Create squeue for each active CPU available */
2450Sstevel@tonic-gate 	for (i = 0; i < NCPU; i++) {
2460Sstevel@tonic-gate 		cpu_t *cp = cpu[i];
2470Sstevel@tonic-gate 		if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
2480Sstevel@tonic-gate 			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_FALSE);
2490Sstevel@tonic-gate 		}
2500Sstevel@tonic-gate 	}
2510Sstevel@tonic-gate 
2520Sstevel@tonic-gate 	register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
2530Sstevel@tonic-gate 
2540Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
2550Sstevel@tonic-gate 
2560Sstevel@tonic-gate 	if (ip_squeue_profile)
2570Sstevel@tonic-gate 		squeue_profile_start();
2580Sstevel@tonic-gate }
2590Sstevel@tonic-gate 
2600Sstevel@tonic-gate /*
2610Sstevel@tonic-gate  * Get squeue_t structure based on index.
2620Sstevel@tonic-gate  * Since the squeue list can only grow, no need to grab any lock.
2630Sstevel@tonic-gate  */
2640Sstevel@tonic-gate squeue_t *
2650Sstevel@tonic-gate ip_squeue_random(uint_t index)
2660Sstevel@tonic-gate {
2670Sstevel@tonic-gate 	squeue_set_t *sqs;
2680Sstevel@tonic-gate 
2690Sstevel@tonic-gate 	sqs = sqset_global_list[index % sqset_global_size];
2700Sstevel@tonic-gate 	return (sqs->sqs_list[index % sqs->sqs_size]);
2710Sstevel@tonic-gate }
2720Sstevel@tonic-gate 
2730Sstevel@tonic-gate /* ARGSUSED */
2740Sstevel@tonic-gate void
2750Sstevel@tonic-gate ip_squeue_clean(void *arg1, mblk_t *mp, void *arg2)
2760Sstevel@tonic-gate {
2770Sstevel@tonic-gate 	squeue_t	*sqp = arg2;
2780Sstevel@tonic-gate 	ill_rx_ring_t	*ring = sqp->sq_rx_ring;
2790Sstevel@tonic-gate 	ill_t		*ill;
2800Sstevel@tonic-gate 
2810Sstevel@tonic-gate 	ASSERT(sqp != NULL);
2820Sstevel@tonic-gate 
2830Sstevel@tonic-gate 	if (ring == NULL) {
2840Sstevel@tonic-gate 		return;
2850Sstevel@tonic-gate 	}
2860Sstevel@tonic-gate 
2870Sstevel@tonic-gate 	/*
2880Sstevel@tonic-gate 	 * Clean up squeue
2890Sstevel@tonic-gate 	 */
2900Sstevel@tonic-gate 	mutex_enter(&sqp->sq_lock);
2910Sstevel@tonic-gate 	sqp->sq_state &= ~(SQS_ILL_BOUND|SQS_POLL_CAPAB);
2920Sstevel@tonic-gate 	sqp->sq_rx_ring = NULL;
2930Sstevel@tonic-gate 	mutex_exit(&sqp->sq_lock);
2940Sstevel@tonic-gate 
2950Sstevel@tonic-gate 	ill = ring->rr_ill;
2960Sstevel@tonic-gate 
2970Sstevel@tonic-gate 	/*
2980Sstevel@tonic-gate 	 * Cleanup the ring
2990Sstevel@tonic-gate 	 */
3000Sstevel@tonic-gate 
3010Sstevel@tonic-gate 	ring->rr_blank = NULL;
3020Sstevel@tonic-gate 	ring->rr_handle = NULL;
3030Sstevel@tonic-gate 	ring->rr_sqp = NULL;
3040Sstevel@tonic-gate 
3050Sstevel@tonic-gate 	/*
3060Sstevel@tonic-gate 	 * Signal ill that cleanup is done
3070Sstevel@tonic-gate 	 */
3080Sstevel@tonic-gate 	mutex_enter(&ill->ill_lock);
3090Sstevel@tonic-gate 	ring->rr_ring_state = ILL_RING_FREE;
3100Sstevel@tonic-gate 	cv_signal(&ill->ill_cv);
3110Sstevel@tonic-gate 	mutex_exit(&ill->ill_lock);
3120Sstevel@tonic-gate }
3130Sstevel@tonic-gate 
/*
 * Argument package handed from ip_squeue_get() to ip_squeue_extend()
 * through the system taskq; freed by ip_squeue_extend().
 */
typedef struct ip_taskq_arg {
	ill_t		*ip_taskq_ill;		/* ill owning the Rx ring */
	ill_rx_ring_t	*ip_taskq_ill_rx_ring;	/* ring to bind to an squeue */
	cpu_t		*ip_taskq_cpu;		/* CPU the interrupt landed on */
} ip_taskq_arg_t;
3190Sstevel@tonic-gate 
3200Sstevel@tonic-gate /*
3210Sstevel@tonic-gate  * Do a Rx ring to squeue binding. Find a unique squeue that is not
3220Sstevel@tonic-gate  * managing a receive ring. If no such squeue exists, dynamically
3230Sstevel@tonic-gate  * create a new one in the squeue set.
3240Sstevel@tonic-gate  *
3250Sstevel@tonic-gate  * The function runs via the system taskq. The ill passed as an
3260Sstevel@tonic-gate  * argument can't go away since we hold a ref. The lock order is
3270Sstevel@tonic-gate  * ill_lock -> sqs_lock -> sq_lock.
3280Sstevel@tonic-gate  *
3290Sstevel@tonic-gate  * If we are binding a Rx ring to a squeue attached to the offline CPU,
3300Sstevel@tonic-gate  * no need to check that because squeues are never destroyed once
3310Sstevel@tonic-gate  * created.
3320Sstevel@tonic-gate  */
/* ARGSUSED */
static void
ip_squeue_extend(void *arg)
{
	ip_taskq_arg_t	*sq_arg = (ip_taskq_arg_t *)arg;
	ill_t		*ill = sq_arg->ip_taskq_ill;
	ill_rx_ring_t	*ill_rx_ring = sq_arg->ip_taskq_ill_rx_ring;
	cpu_t		*intr_cpu = sq_arg->ip_taskq_cpu;
	squeue_set_t *sqs;
	squeue_t 	*sqp = NULL;
	char		sqname[64];
	int		i;

	ASSERT(ill != NULL);
	ASSERT(ill_rx_ring != NULL);
	/* The argument package was allocated by ip_squeue_get(); free it. */
	kmem_free(arg, sizeof (ip_taskq_arg_t));

	sqs = intr_cpu->cpu_squeue_set;

	/*
	 * If this ill represents link aggregation, then there might be
	 * multiple NICs trying to register themselves at the same time
	 * and in order to ensure that test and assignment of free rings
	 * is sequential, we need to hold the ill_lock.
	 */
	mutex_enter(&ill->ill_lock);
	mutex_enter(&sqs->sqs_lock);
	/* Look for an squeue in this set not yet bound to any Rx ring. */
	for (i = 0; i < sqs->sqs_size; i++) {
		mutex_enter(&sqs->sqs_list[i]->sq_lock);
		if ((sqs->sqs_list[i]->sq_state & SQS_ILL_BOUND) == 0) {
			sqp = sqs->sqs_list[i];
			/* NOTE: sqp's sq_lock is intentionally kept held. */
			break;
		}
		mutex_exit(&sqs->sqs_list[i]->sq_lock);
	}

	if (sqp == NULL) {
		/* Need to create a new squeue */
		if (sqs->sqs_size == sqs->sqs_max_size) {
			/*
			 * Reached the max limit for squeue
			 * we can allocate on this CPU. Leave
			 * ill_ring_state set to ILL_RING_INPROC
			 * so that ip_squeue_direct will just
			 * assign the default squeue for this
			 * ring for future connections.
			 */
#ifdef DEBUG
			cmn_err(CE_NOTE, "ip_squeue_add: Reached max "
			    " threads per CPU for sqp = %p\n", (void *)sqp);
#endif
			mutex_exit(&sqs->sqs_lock);
			mutex_exit(&ill->ill_lock);
			ill_waiter_dcr(ill);
			return;
		}

		bzero(sqname, sizeof (sqname));
		(void) snprintf(sqname, sizeof (sqname),
		    "ip_squeue_cpu_%d/%d/%d", CPU->cpu_seqid,
		    CPU->cpu_id, sqs->sqs_size);

		sqp = squeue_create(sqname, CPU->cpu_id, ip_squeue_worker_wait,
		    minclsyspri);

		ASSERT(sqp != NULL);

		squeue_profile_enable(sqp);
		sqs->sqs_list[sqs->sqs_size++] = sqp;

		if (ip_squeue_create_callback != NULL)
			ip_squeue_create_callback(sqp);

		if (ip_squeue_bind) {
			squeue_bind(sqp, -1);
		}
		/* Enter sq_lock so both paths below hold it uniformly. */
		mutex_enter(&sqp->sq_lock);
	}

	ASSERT(sqp != NULL);

	/* Cross-link the ring and the squeue; ring becomes usable. */
	sqp->sq_rx_ring = ill_rx_ring;
	ill_rx_ring->rr_sqp = sqp;
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;

	sqp->sq_state |= (SQS_ILL_BOUND|SQS_POLL_CAPAB);
	mutex_exit(&sqp->sq_lock);
	mutex_exit(&sqs->sqs_lock);

	mutex_exit(&ill->ill_lock);

	/* ill_waiter_dcr will also signal any waiters on ill_ring_state */
	ill_waiter_dcr(ill);
}
4270Sstevel@tonic-gate 
4280Sstevel@tonic-gate /*
4290Sstevel@tonic-gate  * Find the squeue assigned to manage this Rx ring. If the Rx ring is not
4300Sstevel@tonic-gate  * owned by a squeue yet, do the assignment. When the NIC registers its
4310Sstevel@tonic-gate  * Rx rings with IP, we don't know where the interrupts will land and
4320Sstevel@tonic-gate  * hence we need to wait till this point to do the assignment.
4330Sstevel@tonic-gate  */
squeue_t *
ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
{
	squeue_t 	*sqp;
	ill_t 		*ill;
	int		interrupt;
	ip_taskq_arg_t	*taskq_arg;
	boolean_t	refheld;

	/* No ring information: fall back to the generic squeue choice. */
	if (ill_rx_ring == NULL)
		return (IP_SQUEUE_GET(lbolt));

	sqp = ill_rx_ring->rr_sqp;
	/*
	 * Do a quick check. If it's not NULL, we are done.
	 * Squeues are never destroyed so worse we will bind
	 * this connection to a suboptimal squeue.
	 *
	 * This is the fast path case.
	 */
	if (sqp != NULL)
		return (sqp);

	ill = ill_rx_ring->rr_ill;
	ASSERT(ill != NULL);

	interrupt = servicing_interrupt();
	/* KM_NOSLEEP: we may be in interrupt context; failure handled below */
	taskq_arg = (ip_taskq_arg_t *)kmem_zalloc(sizeof (ip_taskq_arg_t),
	    KM_NOSLEEP);

	mutex_enter(&ill->ill_lock);
	if (!interrupt || ill_rx_ring->rr_ring_state != ILL_RING_INUSE ||
		taskq_arg == NULL) {
		/*
		 * Do the ring to squeue binding only if we are in interrupt
		 * context and there is no one else trying the bind already.
		 */
		mutex_exit(&ill->ill_lock);
		if (taskq_arg != NULL)
			kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
		return (IP_SQUEUE_GET(lbolt));
	}

	/*
	 * No sqp assigned yet. Can't really do that in interrupt
	 * context. Assign the default sqp to this connection and
	 * trigger creation of new sqp and binding it to this ring
	 * via taskq. Need to make sure ill stays around.
	 */
	taskq_arg->ip_taskq_ill = ill;
	taskq_arg->ip_taskq_ill_rx_ring = ill_rx_ring;
	taskq_arg->ip_taskq_cpu = CPU;
	/* INPROC marks the bind as in flight so nobody else starts one. */
	ill_rx_ring->rr_ring_state = ILL_RING_INPROC;
	mutex_exit(&ill->ill_lock);
	refheld = ill_waiter_inc(ill);
	if (refheld) {
		if (taskq_dispatch(system_taskq, ip_squeue_extend,
		    taskq_arg, TQ_NOSLEEP) != NULL) {
			/* taskq now owns taskq_arg and the ill reference. */
			return (IP_SQUEUE_GET(lbolt));
		}
	}
	/*
	 * The ill is closing and we could not get a reference on the ill OR
	 * taskq_dispatch failed probably due to memory allocation failure.
	 * We will try again next time.
	 */
	mutex_enter(&ill->ill_lock);
	ill_rx_ring->rr_ring_state = ILL_RING_INUSE;
	mutex_exit(&ill->ill_lock);
	kmem_free(taskq_arg, sizeof (ip_taskq_arg_t));
	if (refheld)
		ill_waiter_dcr(ill);

	return (IP_SQUEUE_GET(lbolt));
}
5090Sstevel@tonic-gate 
5100Sstevel@tonic-gate /*
5110Sstevel@tonic-gate  * NDD hooks for setting ip_squeue_xxx tuneables.
5120Sstevel@tonic-gate  */
5130Sstevel@tonic-gate 
5140Sstevel@tonic-gate /* ARGSUSED */
5150Sstevel@tonic-gate int
5160Sstevel@tonic-gate ip_squeue_bind_set(queue_t *q, mblk_t *mp, char *value,
5170Sstevel@tonic-gate     caddr_t addr, cred_t *cr)
5180Sstevel@tonic-gate {
5190Sstevel@tonic-gate 	int *bind_enabled = (int *)addr;
5200Sstevel@tonic-gate 	long new_value;
5210Sstevel@tonic-gate 	int i;
5220Sstevel@tonic-gate 
5230Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
5240Sstevel@tonic-gate 		return (EINVAL);
5250Sstevel@tonic-gate 
5260Sstevel@tonic-gate 	if (ip_squeue_bind == new_value)
5270Sstevel@tonic-gate 		return (0);
5280Sstevel@tonic-gate 
5290Sstevel@tonic-gate 	*bind_enabled = new_value;
5300Sstevel@tonic-gate 	mutex_enter(&cpu_lock);
5310Sstevel@tonic-gate 	if (new_value == 0) {
5320Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
5330Sstevel@tonic-gate 			ip_squeue_set_unbind(sqset_global_list[i]);
5340Sstevel@tonic-gate 	} else {
5350Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++)
5360Sstevel@tonic-gate 			ip_squeue_set_bind(sqset_global_list[i]);
5370Sstevel@tonic-gate 	}
5380Sstevel@tonic-gate 
5390Sstevel@tonic-gate 	mutex_exit(&cpu_lock);
5400Sstevel@tonic-gate 	return (0);
5410Sstevel@tonic-gate }
5420Sstevel@tonic-gate 
5430Sstevel@tonic-gate /*
5440Sstevel@tonic-gate  * Set squeue profiling.
5450Sstevel@tonic-gate  * 0 means "disable"
5460Sstevel@tonic-gate  * 1 means "enable"
5470Sstevel@tonic-gate  * 2 means "enable and reset"
5480Sstevel@tonic-gate  */
5490Sstevel@tonic-gate /* ARGSUSED */
5500Sstevel@tonic-gate int
5510Sstevel@tonic-gate ip_squeue_profile_set(queue_t *q, mblk_t *mp, char *value, caddr_t cp,
5520Sstevel@tonic-gate     cred_t *cr)
5530Sstevel@tonic-gate {
5540Sstevel@tonic-gate 	int *profile_enabled = (int *)cp;
5550Sstevel@tonic-gate 	long new_value;
5560Sstevel@tonic-gate 	squeue_set_t *sqs;
5570Sstevel@tonic-gate 
5580Sstevel@tonic-gate 	if (ddi_strtol(value, NULL, 10, &new_value) != 0)
5590Sstevel@tonic-gate 		return (EINVAL);
5600Sstevel@tonic-gate 
5610Sstevel@tonic-gate 	if (new_value == 0)
5620Sstevel@tonic-gate 		squeue_profile_stop();
5630Sstevel@tonic-gate 	else if (new_value == 1)
5640Sstevel@tonic-gate 		squeue_profile_start();
5650Sstevel@tonic-gate 	else if (new_value == 2) {
5660Sstevel@tonic-gate 		int i, j;
5670Sstevel@tonic-gate 
5680Sstevel@tonic-gate 		squeue_profile_stop();
5690Sstevel@tonic-gate 		mutex_enter(&cpu_lock);
5700Sstevel@tonic-gate 		for (i = 0; i < sqset_global_size; i++) {
5710Sstevel@tonic-gate 			sqs = sqset_global_list[i];
5720Sstevel@tonic-gate 			for (j = 0; j < sqs->sqs_size; j++) {
5730Sstevel@tonic-gate 				squeue_profile_reset(sqs->sqs_list[j]);
5740Sstevel@tonic-gate 			}
5750Sstevel@tonic-gate 		}
5760Sstevel@tonic-gate 		mutex_exit(&cpu_lock);
5770Sstevel@tonic-gate 
5780Sstevel@tonic-gate 		new_value = 1;
5790Sstevel@tonic-gate 		squeue_profile_start();
5800Sstevel@tonic-gate 	}
5810Sstevel@tonic-gate 	*profile_enabled = new_value;
5820Sstevel@tonic-gate 
5830Sstevel@tonic-gate 	return (0);
5840Sstevel@tonic-gate }
5850Sstevel@tonic-gate 
5860Sstevel@tonic-gate /*
5870Sstevel@tonic-gate  * Reconfiguration callback
5880Sstevel@tonic-gate  */
5890Sstevel@tonic-gate 
/* ARGSUSED */
static int
ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
{
	cpu_t *cp = cpu[id];

	/* DR framework invokes this callback with cpu_lock held. */
	ASSERT(MUTEX_HELD(&cpu_lock));
	switch (what) {
	case CPU_CONFIG:
		/*
		 * A new CPU is added. Create an squeue for it but do not bind
		 * it yet.
		 */
		if (cp->cpu_squeue_set == NULL)
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		break;
	case CPU_ON:
	case CPU_INIT:
	case CPU_CPUPART_IN:
		/* CPU is (re)joining: ensure it has a set, then bind it. */
		if (cp->cpu_squeue_set == NULL) {
			cp->cpu_squeue_set = ip_squeue_set_create(cp, B_TRUE);
		}
		if (ip_squeue_bind)
			ip_squeue_set_bind(cp->cpu_squeue_set);
		break;
	case CPU_UNCONFIG:
	case CPU_OFF:
	case CPU_CPUPART_OUT:
		/*
		 * CPU is leaving: unbind the worker threads so the CPU can
		 * go away.  The squeue set itself is never destroyed.
		 */
		ASSERT((cp->cpu_squeue_set != NULL) ||
		    (cp->cpu_flags & CPU_OFFLINE));

		if (cp->cpu_squeue_set != NULL) {
			ip_squeue_set_unbind(cp->cpu_squeue_set);
		}
		break;
	default:
		break;
	}
	return (0);
}
6300Sstevel@tonic-gate 
6310Sstevel@tonic-gate /* ARGSUSED */
6320Sstevel@tonic-gate static void
6330Sstevel@tonic-gate ip_squeue_set_bind(squeue_set_t *sqs)
6340Sstevel@tonic-gate {
6350Sstevel@tonic-gate 	int i;
6360Sstevel@tonic-gate 	squeue_t *sqp;
6370Sstevel@tonic-gate 
6380Sstevel@tonic-gate 	if (!ip_squeue_bind)
6390Sstevel@tonic-gate 		return;
6400Sstevel@tonic-gate 
6410Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
6420Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
6430Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
6440Sstevel@tonic-gate 		if (sqp->sq_state & SQS_BOUND)
6450Sstevel@tonic-gate 			continue;
6460Sstevel@tonic-gate 		squeue_bind(sqp, -1);
6470Sstevel@tonic-gate 	}
6480Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
6490Sstevel@tonic-gate }
6500Sstevel@tonic-gate 
6510Sstevel@tonic-gate static void
6520Sstevel@tonic-gate ip_squeue_set_unbind(squeue_set_t *sqs)
6530Sstevel@tonic-gate {
6540Sstevel@tonic-gate 	int i;
6550Sstevel@tonic-gate 	squeue_t *sqp;
6560Sstevel@tonic-gate 
6570Sstevel@tonic-gate 	mutex_enter(&sqs->sqs_lock);
6580Sstevel@tonic-gate 	for (i = 0; i < sqs->sqs_size; i++) {
6590Sstevel@tonic-gate 		sqp = sqs->sqs_list[i];
6600Sstevel@tonic-gate 		if (!(sqp->sq_state & SQS_BOUND))
6610Sstevel@tonic-gate 			continue;
6620Sstevel@tonic-gate 		squeue_unbind(sqp);
6630Sstevel@tonic-gate 	}
6640Sstevel@tonic-gate 	mutex_exit(&sqs->sqs_lock);
6650Sstevel@tonic-gate }
666