10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
51503Sericheng * Common Development and Distribution License (the "License").
61503Sericheng * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate /*
229210SThirumalai.Srinivasan@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
230Sstevel@tonic-gate * Use is subject to license terms.
240Sstevel@tonic-gate */
250Sstevel@tonic-gate
260Sstevel@tonic-gate /*
270Sstevel@tonic-gate * IP interface to squeues.
280Sstevel@tonic-gate *
298275SEric Cheng * IP uses squeues to force serialization of packets, both incoming and
308275SEric Cheng * outgoing. Each squeue is associated with a connection instance (conn_t)
318275SEric Cheng * above, and a soft ring (if enabled) below. Each CPU will have a default
328275SEric Cheng * squeue for outbound connections, and each soft ring of an interface will
338275SEric Cheng * have an squeue to which it sends incoming packets. squeues are never
348275SEric Cheng * destroyed, and if they become unused they are kept around against future
358275SEric Cheng * needs.
360Sstevel@tonic-gate *
378275SEric Cheng * IP organizes its squeues using squeue sets (squeue_set_t). For each CPU
388275SEric Cheng * in the system there will be one squeue set, all of whose squeues will be
398275SEric Cheng * bound to that CPU, plus one additional set known as the unbound set. Sets
408275SEric Cheng * associated with CPUs will have one default squeue, for outbound
418275SEric Cheng * connections, and a linked list of squeues used by various NICs for inbound
428275SEric Cheng * packets. The unbound set also has a linked list of squeues, but no default
438275SEric Cheng * squeue.
448275SEric Cheng *
 * When a CPU goes offline its squeue set is destroyed, and all its squeues
 * are moved to the unbound set. When a CPU comes online, a new squeue set is
 * created and the unbound set is searched for a default squeue formerly bound
 * to this CPU. If none is found, a free non-default squeue from the unbound
 * set is promoted to default; failing that, a new squeue is created.
498275SEric Cheng *
508275SEric Cheng * Two fields of the squeue_t, namely sq_next and sq_set, are owned by IP
518275SEric Cheng * and not the squeue code. squeue.c will not touch them, and we can modify
528275SEric Cheng * them without holding the squeue lock because of the guarantee that squeues
538275SEric Cheng * are never destroyed. ip_squeue locks must be held, however.
548275SEric Cheng *
558275SEric Cheng * All the squeue sets are protected by a single lock, the sqset_lock. This
568275SEric Cheng * is also used to protect the sq_next and sq_set fields of an squeue_t.
578275SEric Cheng *
588275SEric Cheng * The lock order is: cpu_lock --> ill_lock --> sqset_lock --> sq_lock
590Sstevel@tonic-gate *
600Sstevel@tonic-gate * There are two modes of associating connection with squeues. The first mode
610Sstevel@tonic-gate * associates each connection with the CPU that creates the connection (either
620Sstevel@tonic-gate * during open time or during accept time). The second mode associates each
630Sstevel@tonic-gate * connection with a random CPU, effectively distributing load over all CPUs
640Sstevel@tonic-gate * and all squeues in the system. The mode is controlled by the
650Sstevel@tonic-gate * ip_squeue_fanout variable.
660Sstevel@tonic-gate *
670Sstevel@tonic-gate * NOTE: The fact that there is an association between each connection and
680Sstevel@tonic-gate * squeue and squeue and CPU does not mean that each connection is always
690Sstevel@tonic-gate * processed on this CPU and on this CPU only. Any thread calling squeue_enter()
700Sstevel@tonic-gate * may process the connection on whatever CPU it is scheduled. The squeue to CPU
710Sstevel@tonic-gate * binding is only relevant for the worker thread.
720Sstevel@tonic-gate *
730Sstevel@tonic-gate * INTERFACE:
740Sstevel@tonic-gate *
 *   squeue_t *ip_squeue_get(ill_rx_ring_t *)
 *
 *	Returns the squeue associated with an ill receive ring. If the ring
 *	is NULL or has no squeue attached, a default squeue obtained through
 *	IP_SQUEUE_GET() is returned instead.
800Sstevel@tonic-gate *
810Sstevel@tonic-gate *
820Sstevel@tonic-gate * DR Notes
830Sstevel@tonic-gate * ========
840Sstevel@tonic-gate *
850Sstevel@tonic-gate * The ip_squeue_init() registers a call-back function with the CPU DR
860Sstevel@tonic-gate * subsystem using register_cpu_setup_func(). The call-back function does two
870Sstevel@tonic-gate * things:
880Sstevel@tonic-gate *
890Sstevel@tonic-gate * o When the CPU is going off-line or unconfigured, the worker thread is
900Sstevel@tonic-gate * unbound from the CPU. This allows the CPU unconfig code to move it to
910Sstevel@tonic-gate * another CPU.
920Sstevel@tonic-gate *
930Sstevel@tonic-gate * o When the CPU is going online, it creates a new squeue for this CPU if
940Sstevel@tonic-gate * necessary and binds the squeue worker thread to this CPU.
950Sstevel@tonic-gate *
968275SEric Cheng * TUNABLES:
970Sstevel@tonic-gate *
988275SEric Cheng * ip_squeue_fanout: used when TCP calls IP_SQUEUE_GET(). If 1, then
998275SEric Cheng * pick the default squeue from a random CPU, otherwise use our CPU's default
1008275SEric Cheng * squeue.
1010Sstevel@tonic-gate *
1028275SEric Cheng * ip_squeue_fanout can be accessed and changed using ndd on /dev/tcp or
1038275SEric Cheng * /dev/ip.
1040Sstevel@tonic-gate *
 * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
 * created. This is the time squeue code waits before waking up the worker
 * thread after queuing a request.
1080Sstevel@tonic-gate */
1090Sstevel@tonic-gate
1100Sstevel@tonic-gate #include <sys/types.h>
1110Sstevel@tonic-gate #include <sys/debug.h>
1120Sstevel@tonic-gate #include <sys/kmem.h>
1130Sstevel@tonic-gate #include <sys/cpuvar.h>
1140Sstevel@tonic-gate #include <sys/cmn_err.h>
1150Sstevel@tonic-gate
1160Sstevel@tonic-gate #include <inet/common.h>
1170Sstevel@tonic-gate #include <inet/ip.h>
1188275SEric Cheng #include <netinet/ip6.h>
1190Sstevel@tonic-gate #include <inet/ip_if.h>
1208275SEric Cheng #include <inet/ip_ire.h>
1210Sstevel@tonic-gate #include <inet/nd.h>
1220Sstevel@tonic-gate #include <inet/ipclassifier.h>
1230Sstevel@tonic-gate #include <sys/types.h>
1240Sstevel@tonic-gate #include <sys/conf.h>
1250Sstevel@tonic-gate #include <sys/sunddi.h>
1262546Scarlsonj #include <sys/dlpi.h>
1270Sstevel@tonic-gate #include <sys/squeue_impl.h>
1288275SEric Cheng #include <sys/tihdr.h>
1298275SEric Cheng #include <inet/udp_impl.h>
1308275SEric Cheng #include <sys/strsubr.h>
1318275SEric Cheng #include <sys/zone.h>
1328275SEric Cheng #include <sys/dld.h>
1338130SGeorge.Shepherd@Sun.COM #include <sys/atomic.h>
1340Sstevel@tonic-gate
1350Sstevel@tonic-gate /*
1368275SEric Cheng * List of all created squeue sets. The list and its size are protected by
1378275SEric Cheng * sqset_lock.
1380Sstevel@tonic-gate */
1398275SEric Cheng static squeue_set_t **sqset_global_list; /* list 0 is the unbound list */
1408275SEric Cheng static uint_t sqset_global_size;
1418275SEric Cheng kmutex_t sqset_lock;
1421184Skrgopi
1430Sstevel@tonic-gate static void (*ip_squeue_create_callback)(squeue_t *) = NULL;
1440Sstevel@tonic-gate
1450Sstevel@tonic-gate /*
1460Sstevel@tonic-gate * ip_squeue_worker_wait: global value for the sq_wait field for all squeues
1470Sstevel@tonic-gate * created. This is the time squeue code waits before waking up the worker
1480Sstevel@tonic-gate * thread after queuing a request.
1490Sstevel@tonic-gate */
1500Sstevel@tonic-gate uint_t ip_squeue_worker_wait = 10;
1510Sstevel@tonic-gate
1528275SEric Cheng static squeue_t *ip_squeue_create(pri_t);
1538275SEric Cheng static squeue_set_t *ip_squeue_set_create(processorid_t);
1540Sstevel@tonic-gate static int ip_squeue_cpu_setup(cpu_setup_t, int, void *);
1558275SEric Cheng static void ip_squeue_set_move(squeue_t *, squeue_set_t *);
1568275SEric Cheng static void ip_squeue_set_destroy(cpu_t *);
1574360Smeem static void ip_squeue_clean(void *, mblk_t *, void *);
1580Sstevel@tonic-gate
/*
 * True when the cpu pointer 'c' is non-NULL, satisfies CPU_ACTIVE() and has
 * CPU_EXISTS set in its cpu_flags. Every use of the argument is parenthesized
 * so that an arbitrary pointer expression may be passed. The argument is
 * still evaluated more than once, so it must be free of side effects.
 */
#define	CPU_ISON(c)	\
	((c) != NULL && CPU_ACTIVE(c) && ((c)->cpu_flags & CPU_EXISTS))
1600Sstevel@tonic-gate
1618275SEric Cheng static squeue_t *
ip_squeue_create(pri_t pri)1628275SEric Cheng ip_squeue_create(pri_t pri)
1638275SEric Cheng {
1648275SEric Cheng squeue_t *sqp;
1658275SEric Cheng
1668275SEric Cheng sqp = squeue_create(ip_squeue_worker_wait, pri);
1678275SEric Cheng ASSERT(sqp != NULL);
1688275SEric Cheng if (ip_squeue_create_callback != NULL)
1698275SEric Cheng ip_squeue_create_callback(sqp);
1708275SEric Cheng return (sqp);
1718275SEric Cheng }
1728275SEric Cheng
1730Sstevel@tonic-gate /*
1748275SEric Cheng * Create a new squeue_set. If id == -1, then we're creating the unbound set,
1758275SEric Cheng * which should only happen once when we are first initialized. Otherwise id
1768275SEric Cheng * is the id of the CPU that needs a set, either because we are initializing
1778275SEric Cheng * or because the CPU has come online.
1788275SEric Cheng *
1798275SEric Cheng * If id != -1, then we need at a minimum to provide a default squeue for the
1808275SEric Cheng * new set. We search the unbound set for candidates, and if none are found we
1818275SEric Cheng * create a new one.
1820Sstevel@tonic-gate */
1830Sstevel@tonic-gate static squeue_set_t *
ip_squeue_set_create(processorid_t id)1848275SEric Cheng ip_squeue_set_create(processorid_t id)
1850Sstevel@tonic-gate {
1860Sstevel@tonic-gate squeue_set_t *sqs;
1878275SEric Cheng squeue_set_t *src = sqset_global_list[0];
1888275SEric Cheng squeue_t **lastsqp, *sq;
1898275SEric Cheng squeue_t **defaultq_lastp = NULL;
1900Sstevel@tonic-gate
1918275SEric Cheng sqs = kmem_zalloc(sizeof (squeue_set_t), KM_SLEEP);
1928275SEric Cheng sqs->sqs_cpuid = id;
1930Sstevel@tonic-gate
1948275SEric Cheng if (id == -1) {
1958275SEric Cheng ASSERT(sqset_global_size == 0);
1968275SEric Cheng sqset_global_list[0] = sqs;
1978275SEric Cheng sqset_global_size = 1;
1988275SEric Cheng return (sqs);
1990Sstevel@tonic-gate }
2000Sstevel@tonic-gate
2018275SEric Cheng /*
2028275SEric Cheng * When we create an squeue set id != -1, we need to give it a
2038275SEric Cheng * default squeue, in order to support fanout of conns across
2048275SEric Cheng * CPUs. Try to find a former default squeue that matches this
2058275SEric Cheng * cpu id on the unbound squeue set. If no such squeue is found,
2069979SThirumalai.Srinivasan@Sun.COM * find some non-default TCP squeue that is free. If still no such
2078275SEric Cheng * candidate is found, create a new squeue.
2088275SEric Cheng */
2090Sstevel@tonic-gate
2108275SEric Cheng ASSERT(MUTEX_HELD(&cpu_lock));
2118275SEric Cheng mutex_enter(&sqset_lock);
2128275SEric Cheng lastsqp = &src->sqs_head;
2130Sstevel@tonic-gate
2148275SEric Cheng while (*lastsqp) {
2158275SEric Cheng if ((*lastsqp)->sq_bind == id &&
2168275SEric Cheng (*lastsqp)->sq_state & SQS_DEFAULT) {
2179979SThirumalai.Srinivasan@Sun.COM /*
2189979SThirumalai.Srinivasan@Sun.COM * Exact match. Former default squeue of cpu 'id'
2199979SThirumalai.Srinivasan@Sun.COM */
2209979SThirumalai.Srinivasan@Sun.COM ASSERT(!((*lastsqp)->sq_state & SQS_ILL_BOUND));
2218275SEric Cheng defaultq_lastp = lastsqp;
2228275SEric Cheng break;
2238275SEric Cheng }
2248275SEric Cheng if (defaultq_lastp == NULL &&
2259979SThirumalai.Srinivasan@Sun.COM !((*lastsqp)->sq_state & (SQS_ILL_BOUND | SQS_DEFAULT))) {
2269979SThirumalai.Srinivasan@Sun.COM /*
2279979SThirumalai.Srinivasan@Sun.COM * A free non-default TCP squeue
2289979SThirumalai.Srinivasan@Sun.COM */
2298275SEric Cheng defaultq_lastp = lastsqp;
2308275SEric Cheng }
2318275SEric Cheng lastsqp = &(*lastsqp)->sq_next;
2329979SThirumalai.Srinivasan@Sun.COM }
2330Sstevel@tonic-gate
2349979SThirumalai.Srinivasan@Sun.COM if (defaultq_lastp != NULL) {
2358275SEric Cheng /* Remove from src set and set SQS_DEFAULT */
2368275SEric Cheng sq = *defaultq_lastp;
2378275SEric Cheng *defaultq_lastp = sq->sq_next;
2388275SEric Cheng sq->sq_next = NULL;
2398275SEric Cheng if (!(sq->sq_state & SQS_DEFAULT)) {
2408275SEric Cheng mutex_enter(&sq->sq_lock);
2418275SEric Cheng sq->sq_state |= SQS_DEFAULT;
2428275SEric Cheng mutex_exit(&sq->sq_lock);
2438275SEric Cheng }
2448275SEric Cheng } else {
2458275SEric Cheng sq = ip_squeue_create(SQUEUE_DEFAULT_PRIORITY);
2468275SEric Cheng sq->sq_state |= SQS_DEFAULT;
2470Sstevel@tonic-gate }
2480Sstevel@tonic-gate
2498275SEric Cheng sq->sq_set = sqs;
2508275SEric Cheng sqs->sqs_default = sq;
2518275SEric Cheng squeue_bind(sq, id); /* this locks squeue mutex */
2528275SEric Cheng
2538275SEric Cheng ASSERT(sqset_global_size <= NCPU);
2548275SEric Cheng sqset_global_list[sqset_global_size++] = sqs;
2558275SEric Cheng mutex_exit(&sqset_lock);
2568275SEric Cheng return (sqs);
2578275SEric Cheng }
2588275SEric Cheng
2598275SEric Cheng /*
2608275SEric Cheng * Called by ill_ring_add() to find an squeue to associate with a new ring.
2618275SEric Cheng */
2628275SEric Cheng
2638275SEric Cheng squeue_t *
ip_squeue_getfree(pri_t pri)2648275SEric Cheng ip_squeue_getfree(pri_t pri)
2658275SEric Cheng {
2668275SEric Cheng squeue_set_t *sqs = sqset_global_list[0];
2678275SEric Cheng squeue_t *sq;
2688275SEric Cheng
2698275SEric Cheng mutex_enter(&sqset_lock);
2708275SEric Cheng for (sq = sqs->sqs_head; sq != NULL; sq = sq->sq_next) {
2718275SEric Cheng /*
2729979SThirumalai.Srinivasan@Sun.COM * Select a non-default TCP squeue that is free i.e. not
2739979SThirumalai.Srinivasan@Sun.COM * bound to any ill.
2748275SEric Cheng */
2758275SEric Cheng if (!(sq->sq_state & (SQS_DEFAULT | SQS_ILL_BOUND)))
2768275SEric Cheng break;
2778275SEric Cheng }
2780Sstevel@tonic-gate
2798275SEric Cheng if (sq == NULL) {
2808275SEric Cheng sq = ip_squeue_create(pri);
2818275SEric Cheng sq->sq_set = sqs;
2828275SEric Cheng sq->sq_next = sqs->sqs_head;
2838275SEric Cheng sqs->sqs_head = sq;
2848275SEric Cheng }
2858275SEric Cheng
2868275SEric Cheng ASSERT(!(sq->sq_state & (SQS_POLL_THR_CONTROL | SQS_WORKER_THR_CONTROL |
2878275SEric Cheng SQS_POLL_CLEANUP_DONE | SQS_POLL_QUIESCE_DONE |
2888275SEric Cheng SQS_POLL_THR_QUIESCED)));
2898275SEric Cheng
2908275SEric Cheng mutex_enter(&sq->sq_lock);
2918275SEric Cheng sq->sq_state |= SQS_ILL_BOUND;
2928275SEric Cheng mutex_exit(&sq->sq_lock);
2938275SEric Cheng mutex_exit(&sqset_lock);
2948275SEric Cheng
2958275SEric Cheng if (sq->sq_priority != pri) {
2968275SEric Cheng thread_lock(sq->sq_worker);
2978275SEric Cheng (void) thread_change_pri(sq->sq_worker, pri, 0);
2988275SEric Cheng thread_unlock(sq->sq_worker);
2998275SEric Cheng
3008275SEric Cheng thread_lock(sq->sq_poll_thr);
3018275SEric Cheng (void) thread_change_pri(sq->sq_poll_thr, pri, 0);
3028275SEric Cheng thread_unlock(sq->sq_poll_thr);
3038275SEric Cheng
3048275SEric Cheng sq->sq_priority = pri;
3058275SEric Cheng }
3068275SEric Cheng return (sq);
3070Sstevel@tonic-gate }
3080Sstevel@tonic-gate
3090Sstevel@tonic-gate /*
3100Sstevel@tonic-gate * Initialize IP squeues.
3110Sstevel@tonic-gate */
3120Sstevel@tonic-gate void
ip_squeue_init(void (* callback)(squeue_t *))3130Sstevel@tonic-gate ip_squeue_init(void (*callback)(squeue_t *))
3140Sstevel@tonic-gate {
3150Sstevel@tonic-gate int i;
3168275SEric Cheng squeue_set_t *sqs;
3170Sstevel@tonic-gate
3180Sstevel@tonic-gate ASSERT(sqset_global_list == NULL);
3190Sstevel@tonic-gate
3200Sstevel@tonic-gate ip_squeue_create_callback = callback;
3210Sstevel@tonic-gate squeue_init();
3228275SEric Cheng mutex_init(&sqset_lock, NULL, MUTEX_DEFAULT, NULL);
3230Sstevel@tonic-gate sqset_global_list =
3248275SEric Cheng kmem_zalloc(sizeof (squeue_set_t *) * (NCPU+1), KM_SLEEP);
3250Sstevel@tonic-gate sqset_global_size = 0;
3268275SEric Cheng /*
3278275SEric Cheng * We are called at system boot time and we don't
3288275SEric Cheng * expect memory allocation failure.
3298275SEric Cheng */
3308275SEric Cheng sqs = ip_squeue_set_create(-1);
3318275SEric Cheng ASSERT(sqs != NULL);
3328275SEric Cheng
3330Sstevel@tonic-gate mutex_enter(&cpu_lock);
3340Sstevel@tonic-gate /* Create squeue for each active CPU available */
3350Sstevel@tonic-gate for (i = 0; i < NCPU; i++) {
3368275SEric Cheng cpu_t *cp = cpu_get(i);
3370Sstevel@tonic-gate if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL) {
3388275SEric Cheng /*
3398275SEric Cheng * We are called at system boot time and we don't
3408275SEric Cheng * expect memory allocation failure then
3418275SEric Cheng */
3428275SEric Cheng cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
3438275SEric Cheng ASSERT(cp->cpu_squeue_set != NULL);
3440Sstevel@tonic-gate }
3450Sstevel@tonic-gate }
3460Sstevel@tonic-gate
3470Sstevel@tonic-gate register_cpu_setup_func(ip_squeue_cpu_setup, NULL);
3480Sstevel@tonic-gate mutex_exit(&cpu_lock);
3490Sstevel@tonic-gate }
3500Sstevel@tonic-gate
3510Sstevel@tonic-gate /*
3528275SEric Cheng * Get a default squeue, either from the current CPU or a CPU derived by hash
3538275SEric Cheng * from the index argument, depending upon the setting of ip_squeue_fanout.
3540Sstevel@tonic-gate */
3550Sstevel@tonic-gate squeue_t *
ip_squeue_random(uint_t index)3560Sstevel@tonic-gate ip_squeue_random(uint_t index)
3570Sstevel@tonic-gate {
3588275SEric Cheng squeue_set_t *sqs = NULL;
3598275SEric Cheng squeue_t *sq;
3608275SEric Cheng
3618275SEric Cheng /*
3628275SEric Cheng * The minimum value of sqset_global_size is 2, one for the unbound
3638275SEric Cheng * squeue set and another for the squeue set of the zeroth CPU.
3648275SEric Cheng * Even though the value could be changing, it can never go below 2,
3658275SEric Cheng * so the assert does not need the lock protection.
3668275SEric Cheng */
3678275SEric Cheng ASSERT(sqset_global_size > 1);
3688275SEric Cheng
3698275SEric Cheng /* Protect against changes to sqset_global_list */
3708275SEric Cheng mutex_enter(&sqset_lock);
3710Sstevel@tonic-gate
3728275SEric Cheng if (!ip_squeue_fanout)
3738275SEric Cheng sqs = CPU->cpu_squeue_set;
3748275SEric Cheng
3758275SEric Cheng /*
3768275SEric Cheng * sqset_global_list[0] corresponds to the unbound squeue set.
3778275SEric Cheng * The computation below picks a set other than the unbound set.
3788275SEric Cheng */
3798275SEric Cheng if (sqs == NULL)
3808275SEric Cheng sqs = sqset_global_list[(index % (sqset_global_size - 1)) + 1];
3818275SEric Cheng sq = sqs->sqs_default;
3828275SEric Cheng
3838275SEric Cheng mutex_exit(&sqset_lock);
3848275SEric Cheng ASSERT(sq);
3858275SEric Cheng return (sq);
3860Sstevel@tonic-gate }
3870Sstevel@tonic-gate
3888275SEric Cheng /*
3898275SEric Cheng * Move squeue from its current set to newset. Not used for default squeues.
3908275SEric Cheng * Bind or unbind the worker thread as appropriate.
3918275SEric Cheng */
3928275SEric Cheng
3934360Smeem static void
ip_squeue_set_move(squeue_t * sq,squeue_set_t * newset)3948275SEric Cheng ip_squeue_set_move(squeue_t *sq, squeue_set_t *newset)
3950Sstevel@tonic-gate {
3968275SEric Cheng squeue_set_t *set;
3978275SEric Cheng squeue_t **lastsqp;
3988275SEric Cheng processorid_t cpuid = newset->sqs_cpuid;
3998275SEric Cheng
4008275SEric Cheng ASSERT(!(sq->sq_state & SQS_DEFAULT));
4018275SEric Cheng ASSERT(!MUTEX_HELD(&sq->sq_lock));
4028275SEric Cheng ASSERT(MUTEX_HELD(&sqset_lock));
4038275SEric Cheng
4048275SEric Cheng set = sq->sq_set;
4058275SEric Cheng if (set == newset)
4068275SEric Cheng return;
4078275SEric Cheng
4088275SEric Cheng lastsqp = &set->sqs_head;
4098275SEric Cheng while (*lastsqp != sq)
4108275SEric Cheng lastsqp = &(*lastsqp)->sq_next;
4118275SEric Cheng
4128275SEric Cheng *lastsqp = sq->sq_next;
4138275SEric Cheng sq->sq_next = newset->sqs_head;
4148275SEric Cheng newset->sqs_head = sq;
4158275SEric Cheng sq->sq_set = newset;
4168275SEric Cheng if (cpuid == -1)
4178275SEric Cheng squeue_unbind(sq);
4188275SEric Cheng else
4198275SEric Cheng squeue_bind(sq, cpuid);
4208275SEric Cheng }
4218275SEric Cheng
4228275SEric Cheng /*
4238275SEric Cheng * Move squeue from its current set to cpuid's set and bind to cpuid.
4248275SEric Cheng */
4250Sstevel@tonic-gate
4268275SEric Cheng int
ip_squeue_cpu_move(squeue_t * sq,processorid_t cpuid)4278275SEric Cheng ip_squeue_cpu_move(squeue_t *sq, processorid_t cpuid)
4288275SEric Cheng {
4298275SEric Cheng cpu_t *cpu;
4308275SEric Cheng squeue_set_t *set;
4318275SEric Cheng
4328275SEric Cheng if (sq->sq_state & SQS_DEFAULT)
4338275SEric Cheng return (-1);
4348275SEric Cheng
4358275SEric Cheng ASSERT(MUTEX_HELD(&cpu_lock));
4368275SEric Cheng
4378275SEric Cheng cpu = cpu_get(cpuid);
4388275SEric Cheng if (!CPU_ISON(cpu))
4398275SEric Cheng return (-1);
4400Sstevel@tonic-gate
4418275SEric Cheng mutex_enter(&sqset_lock);
4428275SEric Cheng set = cpu->cpu_squeue_set;
4438275SEric Cheng if (set != NULL)
4448275SEric Cheng ip_squeue_set_move(sq, set);
4458275SEric Cheng mutex_exit(&sqset_lock);
4468275SEric Cheng return ((set == NULL) ? -1 : 0);
4478275SEric Cheng }
4488275SEric Cheng
4498275SEric Cheng /*
4508275SEric Cheng * The mac layer is calling, asking us to move an squeue to a
4518275SEric Cheng * new CPU. This routine is called with cpu_lock held.
4528275SEric Cheng */
4538275SEric Cheng void
ip_squeue_bind_ring(ill_t * ill,ill_rx_ring_t * rx_ring,processorid_t cpuid)4548275SEric Cheng ip_squeue_bind_ring(ill_t *ill, ill_rx_ring_t *rx_ring, processorid_t cpuid)
4558275SEric Cheng {
4568275SEric Cheng ASSERT(ILL_MAC_PERIM_HELD(ill));
4578275SEric Cheng ASSERT(rx_ring->rr_ill == ill);
4588275SEric Cheng
4598275SEric Cheng mutex_enter(&ill->ill_lock);
4608275SEric Cheng if (rx_ring->rr_ring_state == RR_FREE ||
4618275SEric Cheng rx_ring->rr_ring_state == RR_FREE_INPROG) {
4628275SEric Cheng mutex_exit(&ill->ill_lock);
4630Sstevel@tonic-gate return;
4640Sstevel@tonic-gate }
4650Sstevel@tonic-gate
4668275SEric Cheng if (ip_squeue_cpu_move(rx_ring->rr_sqp, cpuid) != -1)
4678275SEric Cheng rx_ring->rr_ring_state = RR_SQUEUE_BOUND;
4688275SEric Cheng
4698275SEric Cheng mutex_exit(&ill->ill_lock);
4708275SEric Cheng }
4718275SEric Cheng
4728275SEric Cheng void *
ip_squeue_add_ring(ill_t * ill,void * mrp)4738275SEric Cheng ip_squeue_add_ring(ill_t *ill, void *mrp)
4748275SEric Cheng {
4758275SEric Cheng mac_rx_fifo_t *mrfp = (mac_rx_fifo_t *)mrp;
4768275SEric Cheng ill_rx_ring_t *rx_ring, *ring_tbl;
4778275SEric Cheng int ip_rx_index;
4788275SEric Cheng squeue_t *sq = NULL;
4798275SEric Cheng pri_t pri;
4808275SEric Cheng
4818275SEric Cheng ASSERT(ILL_MAC_PERIM_HELD(ill));
4828275SEric Cheng ASSERT(mrfp->mrf_type == MAC_RX_FIFO);
4838275SEric Cheng ASSERT(ill->ill_dld_capab != NULL);
4848275SEric Cheng
4858275SEric Cheng ring_tbl = ill->ill_dld_capab->idc_poll.idp_ring_tbl;
4860Sstevel@tonic-gate
4878275SEric Cheng mutex_enter(&ill->ill_lock);
4888275SEric Cheng for (ip_rx_index = 0; ip_rx_index < ILL_MAX_RINGS; ip_rx_index++) {
4898275SEric Cheng rx_ring = &ring_tbl[ip_rx_index];
4908275SEric Cheng if (rx_ring->rr_ring_state == RR_FREE)
4918275SEric Cheng break;
4928275SEric Cheng }
4938275SEric Cheng
4948275SEric Cheng if (ip_rx_index == ILL_MAX_RINGS) {
4958275SEric Cheng /*
4968275SEric Cheng * We ran out of ILL_MAX_RINGS worth rx_ring structures. If
4978275SEric Cheng * we have devices which can overwhelm this limit,
4988275SEric Cheng * ILL_MAX_RING should be made configurable. Meanwhile it
4998275SEric Cheng * cause no panic because driver will pass ip_input a NULL
5008275SEric Cheng * handle which will make IP allocate the default squeue and
5018275SEric Cheng * Polling mode will not be used for this ring.
5028275SEric Cheng */
5038275SEric Cheng cmn_err(CE_NOTE,
5048275SEric Cheng "Reached maximum number of receiving rings (%d) for %s\n",
5058275SEric Cheng ILL_MAX_RINGS, ill->ill_name);
5068275SEric Cheng mutex_exit(&ill->ill_lock);
5078275SEric Cheng return (NULL);
5081184Skrgopi }
5090Sstevel@tonic-gate
5108275SEric Cheng bzero(rx_ring, sizeof (ill_rx_ring_t));
5118275SEric Cheng rx_ring->rr_rx = (ip_mac_rx_t)mrfp->mrf_receive;
5128275SEric Cheng /* XXX: Hard code it to tcp accept for now */
5138275SEric Cheng rx_ring->rr_ip_accept = (ip_accept_t)ip_accept_tcp;
5148275SEric Cheng
5158275SEric Cheng rx_ring->rr_intr_handle = mrfp->mrf_intr_handle;
5168275SEric Cheng rx_ring->rr_intr_enable = (ip_mac_intr_enable_t)mrfp->mrf_intr_enable;
5178275SEric Cheng rx_ring->rr_intr_disable =
5188275SEric Cheng (ip_mac_intr_disable_t)mrfp->mrf_intr_disable;
5198275SEric Cheng rx_ring->rr_rx_handle = mrfp->mrf_rx_arg;
5208275SEric Cheng rx_ring->rr_ill = ill;
5218275SEric Cheng
5228275SEric Cheng pri = mrfp->mrf_flow_priority;
5238275SEric Cheng
5248275SEric Cheng sq = ip_squeue_getfree(pri);
5258275SEric Cheng
5268275SEric Cheng mutex_enter(&sq->sq_lock);
5278275SEric Cheng sq->sq_rx_ring = rx_ring;
5288275SEric Cheng rx_ring->rr_sqp = sq;
5298275SEric Cheng
5308275SEric Cheng sq->sq_state |= SQS_POLL_CAPAB;
5318275SEric Cheng
5328275SEric Cheng rx_ring->rr_ring_state = RR_SQUEUE_UNBOUND;
5338275SEric Cheng sq->sq_ill = ill;
5348275SEric Cheng mutex_exit(&sq->sq_lock);
5358275SEric Cheng mutex_exit(&ill->ill_lock);
5368275SEric Cheng
5378275SEric Cheng DTRACE_PROBE4(ill__ring__add, char *, ill->ill_name, ill_t *, ill, int,
5388275SEric Cheng ip_rx_index, void *, mrfp->mrf_rx_arg);
5398275SEric Cheng
5408275SEric Cheng /* Assign the squeue to the specified CPU as well */
5418275SEric Cheng mutex_enter(&cpu_lock);
5428275SEric Cheng (void) ip_squeue_bind_ring(ill, rx_ring, mrfp->mrf_cpu_id);
5438275SEric Cheng mutex_exit(&cpu_lock);
5440Sstevel@tonic-gate
5458275SEric Cheng return (rx_ring);
5468275SEric Cheng }
5478275SEric Cheng
5488275SEric Cheng /*
5498275SEric Cheng * sanitize the squeue etc. Some of the processing
5508275SEric Cheng * needs to be done from inside the perimeter.
5518275SEric Cheng */
5528275SEric Cheng void
ip_squeue_clean_ring(ill_t * ill,ill_rx_ring_t * rx_ring)5538275SEric Cheng ip_squeue_clean_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
5548275SEric Cheng {
5558275SEric Cheng squeue_t *sqp;
5568275SEric Cheng
5578275SEric Cheng ASSERT(ILL_MAC_PERIM_HELD(ill));
5588275SEric Cheng ASSERT(rx_ring != NULL);
5598275SEric Cheng
5608275SEric Cheng /* Just clean one squeue */
5618275SEric Cheng mutex_enter(&ill->ill_lock);
5628275SEric Cheng if (rx_ring->rr_ring_state == RR_FREE) {
5638275SEric Cheng mutex_exit(&ill->ill_lock);
5648275SEric Cheng return;
5658275SEric Cheng }
5668275SEric Cheng rx_ring->rr_ring_state = RR_FREE_INPROG;
5678275SEric Cheng sqp = rx_ring->rr_sqp;
5688275SEric Cheng
5698275SEric Cheng mutex_enter(&sqp->sq_lock);
5708275SEric Cheng sqp->sq_state |= SQS_POLL_CLEANUP;
5718275SEric Cheng cv_signal(&sqp->sq_worker_cv);
5728275SEric Cheng mutex_exit(&ill->ill_lock);
5738275SEric Cheng while (!(sqp->sq_state & SQS_POLL_CLEANUP_DONE))
5748275SEric Cheng cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
5759979SThirumalai.Srinivasan@Sun.COM sqp->sq_state &= ~SQS_POLL_CLEANUP_DONE;
5768275SEric Cheng
5778275SEric Cheng ASSERT(!(sqp->sq_state & (SQS_POLL_THR_CONTROL |
5788275SEric Cheng SQS_WORKER_THR_CONTROL | SQS_POLL_QUIESCE_DONE |
5798275SEric Cheng SQS_POLL_THR_QUIESCED)));
5808275SEric Cheng
5818275SEric Cheng cv_signal(&sqp->sq_worker_cv);
5828275SEric Cheng mutex_exit(&sqp->sq_lock);
5830Sstevel@tonic-gate
5840Sstevel@tonic-gate /*
5859979SThirumalai.Srinivasan@Sun.COM * Move the squeue to sqset_global_list[0] which holds the set of
5869979SThirumalai.Srinivasan@Sun.COM * squeues not bound to any cpu. Note that the squeue is still
5879979SThirumalai.Srinivasan@Sun.COM * considered bound to an ill as long as SQS_ILL_BOUND is set.
5880Sstevel@tonic-gate */
5898275SEric Cheng mutex_enter(&sqset_lock);
5908275SEric Cheng ip_squeue_set_move(sqp, sqset_global_list[0]);
5918275SEric Cheng mutex_exit(&sqset_lock);
5928275SEric Cheng
5939979SThirumalai.Srinivasan@Sun.COM /*
5949979SThirumalai.Srinivasan@Sun.COM * CPU going offline can also trigger a move of the squeue to the
5959979SThirumalai.Srinivasan@Sun.COM * unbound set sqset_global_list[0]. However the squeue won't be
5969979SThirumalai.Srinivasan@Sun.COM * recycled for the next use as long as the SQS_ILL_BOUND flag
5979979SThirumalai.Srinivasan@Sun.COM * is set. Hence we clear the SQS_ILL_BOUND flag only towards the
5989979SThirumalai.Srinivasan@Sun.COM * end after the move.
5999979SThirumalai.Srinivasan@Sun.COM */
6009979SThirumalai.Srinivasan@Sun.COM mutex_enter(&sqp->sq_lock);
6019979SThirumalai.Srinivasan@Sun.COM sqp->sq_state &= ~SQS_ILL_BOUND;
6029979SThirumalai.Srinivasan@Sun.COM mutex_exit(&sqp->sq_lock);
6039979SThirumalai.Srinivasan@Sun.COM
6040Sstevel@tonic-gate mutex_enter(&ill->ill_lock);
6058275SEric Cheng rx_ring->rr_ring_state = RR_FREE;
6060Sstevel@tonic-gate mutex_exit(&ill->ill_lock);
6070Sstevel@tonic-gate }
6080Sstevel@tonic-gate
6094360Smeem /*
6108275SEric Cheng * Stop the squeue from polling. This needs to be done
6118275SEric Cheng * from inside the perimeter.
6124360Smeem */
6138275SEric Cheng void
ip_squeue_quiesce_ring(ill_t * ill,ill_rx_ring_t * rx_ring)6148275SEric Cheng ip_squeue_quiesce_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
6154360Smeem {
6164360Smeem squeue_t *sqp;
6174360Smeem
6188275SEric Cheng ASSERT(ILL_MAC_PERIM_HELD(ill));
6194360Smeem ASSERT(rx_ring != NULL);
6204360Smeem
6218275SEric Cheng sqp = rx_ring->rr_sqp;
6228275SEric Cheng mutex_enter(&sqp->sq_lock);
6238275SEric Cheng sqp->sq_state |= SQS_POLL_QUIESCE;
6248275SEric Cheng cv_signal(&sqp->sq_worker_cv);
6258275SEric Cheng while (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE))
6268275SEric Cheng cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
6278275SEric Cheng
6288275SEric Cheng mutex_exit(&sqp->sq_lock);
6298275SEric Cheng }
6308275SEric Cheng
6318275SEric Cheng /*
6328275SEric Cheng * Restart polling etc. Needs to be inside the perimeter to
6338275SEric Cheng * prevent races.
6348275SEric Cheng */
6358275SEric Cheng void
ip_squeue_restart_ring(ill_t * ill,ill_rx_ring_t * rx_ring)6368275SEric Cheng ip_squeue_restart_ring(ill_t *ill, ill_rx_ring_t *rx_ring)
6378275SEric Cheng {
6388275SEric Cheng squeue_t *sqp;
6398275SEric Cheng
6408275SEric Cheng ASSERT(ILL_MAC_PERIM_HELD(ill));
6418275SEric Cheng ASSERT(rx_ring != NULL);
6428275SEric Cheng
6438275SEric Cheng sqp = rx_ring->rr_sqp;
6448275SEric Cheng mutex_enter(&sqp->sq_lock);
6454360Smeem /*
6468275SEric Cheng * Handle change in number of rings between the quiesce and
6478275SEric Cheng * restart operations by checking for a previous quiesce before
6488275SEric Cheng * attempting a restart.
6494360Smeem */
6508275SEric Cheng if (!(sqp->sq_state & SQS_POLL_QUIESCE_DONE)) {
6518275SEric Cheng mutex_exit(&sqp->sq_lock);
6524360Smeem return;
6534360Smeem }
6548275SEric Cheng sqp->sq_state |= SQS_POLL_RESTART;
6558275SEric Cheng cv_signal(&sqp->sq_worker_cv);
6568275SEric Cheng while (!(sqp->sq_state & SQS_POLL_RESTART_DONE))
6578275SEric Cheng cv_wait(&sqp->sq_ctrlop_done_cv, &sqp->sq_lock);
6588275SEric Cheng sqp->sq_state &= ~SQS_POLL_RESTART_DONE;
6598275SEric Cheng mutex_exit(&sqp->sq_lock);
6604360Smeem }
6614360Smeem
6628275SEric Cheng /*
6638275SEric Cheng * sanitize all squeues associated with the ill.
6648275SEric Cheng */
6654360Smeem void
ip_squeue_clean_all(ill_t * ill)6664360Smeem ip_squeue_clean_all(ill_t *ill)
6674360Smeem {
6684360Smeem int idx;
6698275SEric Cheng ill_rx_ring_t *rx_ring;
6704360Smeem
6714360Smeem for (idx = 0; idx < ILL_MAX_RINGS; idx++) {
6728275SEric Cheng rx_ring = &ill->ill_dld_capab->idc_poll.idp_ring_tbl[idx];
6738275SEric Cheng ip_squeue_clean_ring(ill, rx_ring);
6744360Smeem }
6751184Skrgopi }
6761184Skrgopi
6771184Skrgopi /*
6788275SEric Cheng * Used by IP to get the squeue associated with a ring. If the squeue isn't
6798275SEric Cheng * yet bound to a CPU, and we're being called directly from the NIC's
6808275SEric Cheng * interrupt, then we know what CPU we want to assign the squeue to, so
6818275SEric Cheng * dispatch that task to a taskq.
6820Sstevel@tonic-gate */
6830Sstevel@tonic-gate squeue_t *
ip_squeue_get(ill_rx_ring_t * ill_rx_ring)6840Sstevel@tonic-gate ip_squeue_get(ill_rx_ring_t *ill_rx_ring)
6850Sstevel@tonic-gate {
6860Sstevel@tonic-gate squeue_t *sqp;
6870Sstevel@tonic-gate
6888275SEric Cheng if ((ill_rx_ring == NULL) || ((sqp = ill_rx_ring->rr_sqp) == NULL))
689*11066Srafael.vanoni@sun.com return (IP_SQUEUE_GET(CPU_PSEUDO_RANDOM()));
6900Sstevel@tonic-gate
6918275SEric Cheng return (sqp);
6920Sstevel@tonic-gate }
6930Sstevel@tonic-gate
6940Sstevel@tonic-gate /*
6958275SEric Cheng * Called when a CPU goes offline. It's squeue_set_t is destroyed, and all
6968275SEric Cheng * squeues are unboudn and moved to the unbound set.
6970Sstevel@tonic-gate */
6988275SEric Cheng static void
ip_squeue_set_destroy(cpu_t * cpu)6998275SEric Cheng ip_squeue_set_destroy(cpu_t *cpu)
7000Sstevel@tonic-gate {
7010Sstevel@tonic-gate int i;
7028275SEric Cheng squeue_t *sqp, *lastsqp = NULL;
7038275SEric Cheng squeue_set_t *sqs, *unbound = sqset_global_list[0];
7040Sstevel@tonic-gate
7058275SEric Cheng mutex_enter(&sqset_lock);
7068275SEric Cheng if ((sqs = cpu->cpu_squeue_set) == NULL) {
7078275SEric Cheng mutex_exit(&sqset_lock);
7088275SEric Cheng return;
7090Sstevel@tonic-gate }
7100Sstevel@tonic-gate
7118275SEric Cheng /* Move all squeues to unbound set */
7120Sstevel@tonic-gate
7138275SEric Cheng for (sqp = sqs->sqs_head; sqp; lastsqp = sqp, sqp = sqp->sq_next) {
7148275SEric Cheng squeue_unbind(sqp);
7158275SEric Cheng sqp->sq_set = unbound;
7168275SEric Cheng }
7178275SEric Cheng if (sqs->sqs_head) {
7188275SEric Cheng lastsqp->sq_next = unbound->sqs_head;
7198275SEric Cheng unbound->sqs_head = sqs->sqs_head;
7208275SEric Cheng }
7210Sstevel@tonic-gate
7228275SEric Cheng /* Also move default squeue to unbound set */
7238275SEric Cheng
7248275SEric Cheng sqp = sqs->sqs_default;
7259979SThirumalai.Srinivasan@Sun.COM ASSERT(sqp != NULL);
7268275SEric Cheng ASSERT((sqp->sq_state & (SQS_DEFAULT|SQS_ILL_BOUND)) == SQS_DEFAULT);
7270Sstevel@tonic-gate
7288275SEric Cheng sqp->sq_next = unbound->sqs_head;
7298275SEric Cheng unbound->sqs_head = sqp;
7308275SEric Cheng squeue_unbind(sqp);
7318275SEric Cheng sqp->sq_set = unbound;
7320Sstevel@tonic-gate
7338275SEric Cheng for (i = 1; i < sqset_global_size; i++)
7348275SEric Cheng if (sqset_global_list[i] == sqs)
7358275SEric Cheng break;
7360Sstevel@tonic-gate
7378275SEric Cheng ASSERT(i < sqset_global_size);
7388275SEric Cheng sqset_global_list[i] = sqset_global_list[sqset_global_size - 1];
7398275SEric Cheng sqset_global_list[sqset_global_size - 1] = NULL;
7408275SEric Cheng sqset_global_size--;
7410Sstevel@tonic-gate
7428275SEric Cheng mutex_exit(&sqset_lock);
7438275SEric Cheng kmem_free(sqs, sizeof (*sqs));
7440Sstevel@tonic-gate }
7450Sstevel@tonic-gate
7460Sstevel@tonic-gate /*
7470Sstevel@tonic-gate * Reconfiguration callback
7480Sstevel@tonic-gate */
7490Sstevel@tonic-gate /* ARGSUSED */
7500Sstevel@tonic-gate static int
ip_squeue_cpu_setup(cpu_setup_t what,int id,void * arg)7510Sstevel@tonic-gate ip_squeue_cpu_setup(cpu_setup_t what, int id, void *arg)
7520Sstevel@tonic-gate {
7538275SEric Cheng cpu_t *cp = cpu_get(id);
7540Sstevel@tonic-gate
7550Sstevel@tonic-gate ASSERT(MUTEX_HELD(&cpu_lock));
7560Sstevel@tonic-gate switch (what) {
757405Sakolb case CPU_CONFIG:
7580Sstevel@tonic-gate case CPU_ON:
7590Sstevel@tonic-gate case CPU_INIT:
7600Sstevel@tonic-gate case CPU_CPUPART_IN:
7619210SThirumalai.Srinivasan@Sun.COM if (CPU_ISON(cp) && cp->cpu_squeue_set == NULL)
7628275SEric Cheng cp->cpu_squeue_set = ip_squeue_set_create(cp->cpu_id);
7630Sstevel@tonic-gate break;
7640Sstevel@tonic-gate case CPU_UNCONFIG:
7650Sstevel@tonic-gate case CPU_OFF:
7660Sstevel@tonic-gate case CPU_CPUPART_OUT:
7670Sstevel@tonic-gate if (cp->cpu_squeue_set != NULL) {
7688275SEric Cheng ip_squeue_set_destroy(cp);
7698275SEric Cheng cp->cpu_squeue_set = NULL;
7700Sstevel@tonic-gate }
7710Sstevel@tonic-gate break;
7720Sstevel@tonic-gate default:
7730Sstevel@tonic-gate break;
7740Sstevel@tonic-gate }
7750Sstevel@tonic-gate return (0);
7760Sstevel@tonic-gate }
777