xref: /onnv-gate/usr/src/uts/common/io/mac/mac_soft_ring.c (revision 11878:ac93462db6d7)
18275SEric Cheng /*
28275SEric Cheng  * CDDL HEADER START
38275SEric Cheng  *
48275SEric Cheng  * The contents of this file are subject to the terms of the
58275SEric Cheng  * Common Development and Distribution License (the "License").
68275SEric Cheng  * You may not use this file except in compliance with the License.
78275SEric Cheng  *
88275SEric Cheng  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
98275SEric Cheng  * or http://www.opensolaris.org/os/licensing.
108275SEric Cheng  * See the License for the specific language governing permissions
118275SEric Cheng  * and limitations under the License.
128275SEric Cheng  *
138275SEric Cheng  * When distributing Covered Code, include this CDDL HEADER in each
148275SEric Cheng  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
158275SEric Cheng  * If applicable, add the following below this CDDL HEADER, with the
168275SEric Cheng  * fields enclosed by brackets "[]" replaced with your own identifying
178275SEric Cheng  * information: Portions Copyright [yyyy] [name of copyright owner]
188275SEric Cheng  *
198275SEric Cheng  * CDDL HEADER END
208275SEric Cheng  */
218275SEric Cheng /*
22*11878SVenu.Iyer@Sun.COM  * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
238275SEric Cheng  * Use is subject to license terms.
248275SEric Cheng  */
258275SEric Cheng 
268275SEric Cheng /*
278275SEric Cheng  * General Soft rings - Simulating Rx rings in S/W.
288275SEric Cheng  *
298275SEric Cheng  * Soft ring is a data abstraction containing a queue and a worker
308275SEric Cheng  * thread and represents a hardware Rx ring in software. Each soft
318275SEric Cheng  * ring set can have a collection of soft rings for separating
328275SEric Cheng  * L3/L4 specific traffic (IPv4 from IPv6 or TCP from UDP) or for
338275SEric Cheng  * allowing a higher degree of parallelism by sending traffic to
348275SEric Cheng  * one of the soft rings for a SRS (using a hash on src IP or port).
358275SEric Cheng  * Each soft ring worker thread can be bound to a different CPU
368275SEric Cheng  * allowing the processing for each soft ring to happen in parallel
378275SEric Cheng  * and independent from each other.
388275SEric Cheng  *
398275SEric Cheng  * Protocol soft rings:
408275SEric Cheng  *
 * Each SRS has at a minimum 3 soft rings: one each for IPv4 TCP,
 * IPv4 UDP and the rest (OTH - for IPv6 and everything else). The
 * SRS does dynamic polling and enforces link level bandwidth but
 * it does so for all traffic (IPv4 and IPv6 and all protocols) on
 * that link. However, each protocol layer wants a different
 * behaviour. For instance IPv4 TCP has per CPU squeues which
 * enforce their own polling and flow control so IPv4 TCP traffic
 * needs to go to a separate soft ring which can be polled by the
 * TCP squeue. It also allows TCP squeue to push back flow control
 * all the way to NIC hardware (if it puts its corresponding soft
 * ring in the poll mode and soft ring queue builds up, the
 * shared srs_poll_pkt_cnt goes up and SRS automatically stops
 * more packets from entering the system).
548275SEric Cheng  *
 * Similarly, UDP benefits from a DLS bypass and packet chaining,
 * so sending it to a separate soft ring is desired. All the rest of
 * the traffic (including IPv6) is sent to the OTH soft ring. The IPv6
 * traffic currently goes through the OTH soft ring and via DLS because
 * it needs more processing to be done. Irrespective of the sap
 * (IPv4 or IPv6) or the transport, the dynamic polling, B/W enforcement,
 * cpu assignment, fanout, etc apply to all traffic since they
 * are implemented by the SRS which is agnostic to sap or transport.
638275SEric Cheng  *
648275SEric Cheng  * Fanout soft rings:
658275SEric Cheng  *
 * On a multithreaded system, we can assign more CPUs and multithread
 * the stack by creating a soft ring per CPU and spreading traffic
 * based on a hash computed on src IP etc. Since we still need to
 * keep the protocol separation, we create a set of 3 soft rings per
 * CPU (specified by cpu list or degree of fanout).
718275SEric Cheng  *
728275SEric Cheng  * NOTE: See the block level comment on top of mac_sched.c
738275SEric Cheng  */
748275SEric Cheng 
758275SEric Cheng #include <sys/types.h>
768275SEric Cheng #include <sys/callb.h>
778275SEric Cheng #include <sys/sdt.h>
788275SEric Cheng #include <sys/strsubr.h>
798275SEric Cheng #include <sys/strsun.h>
808275SEric Cheng #include <sys/vlan.h>
818275SEric Cheng #include <inet/ipsec_impl.h>
828275SEric Cheng #include <inet/ip_impl.h>
838275SEric Cheng #include <inet/sadb.h>
848275SEric Cheng #include <inet/ipsecesp.h>
858275SEric Cheng #include <inet/ipsecah.h>
868275SEric Cheng 
878275SEric Cheng #include <sys/mac_impl.h>
888275SEric Cheng #include <sys/mac_client_impl.h>
898275SEric Cheng #include <sys/mac_soft_ring.h>
908275SEric Cheng #include <sys/mac_flow_impl.h>
91*11878SVenu.Iyer@Sun.COM #include <sys/mac_stat.h>
928275SEric Cheng 
938275SEric Cheng static void mac_rx_soft_ring_drain(mac_soft_ring_t *);
948275SEric Cheng static void mac_soft_ring_fire(void *);
958275SEric Cheng static void mac_soft_ring_worker(mac_soft_ring_t *);
968275SEric Cheng static void mac_tx_soft_ring_drain(mac_soft_ring_t *);
978275SEric Cheng 
988275SEric Cheng uint32_t mac_tx_soft_ring_max_q_cnt = 100000;
998275SEric Cheng uint32_t mac_tx_soft_ring_hiwat = 1000;
1008275SEric Cheng 
1018275SEric Cheng extern kmem_cache_t *mac_soft_ring_cache;
1028275SEric Cheng 
/*
 * ADD_SOFTRING_TO_SET
 *
 * Append softring to the tail of mac_srs's doubly linked soft ring list
 * and increment srs_soft_ring_count.
 *
 * The macro only writes the link pointers it needs: s_ring_prev of the
 * new ring is set only when the list is non-empty and its s_ring_next is
 * never written, so the caller must pass a ring whose link pointers are
 * already NULL. Both arguments are evaluated more than once; do not pass
 * expressions with side effects. The body is wrapped in do/while (0) so
 * that the macro behaves as a single statement (e.g. in an if/else).
 */
#define	ADD_SOFTRING_TO_SET(mac_srs, softring) do {			\
	if ((mac_srs)->srs_soft_ring_head == NULL) {			\
		(mac_srs)->srs_soft_ring_head = (softring);		\
		(mac_srs)->srs_soft_ring_tail = (softring);		\
	} else {							\
		/* ADD to the list */					\
		(softring)->s_ring_prev =				\
		    (mac_srs)->srs_soft_ring_tail;			\
		(mac_srs)->srs_soft_ring_tail->s_ring_next =		\
		    (softring);						\
		(mac_srs)->srs_soft_ring_tail = (softring);		\
	}								\
	(mac_srs)->srs_soft_ring_count++;				\
} while (0)
1168275SEric Cheng 
1178275SEric Cheng /*
1188275SEric Cheng  * mac_soft_ring_worker_wakeup
1198275SEric Cheng  *
1208275SEric Cheng  * Wake up the soft ring worker thread to process the queue as long
1218275SEric Cheng  * as no one else is processing it and upper layer (client) is still
1228275SEric Cheng  * ready to receive packets.
1238275SEric Cheng  */
1248275SEric Cheng void
mac_soft_ring_worker_wakeup(mac_soft_ring_t * ringp)1258275SEric Cheng mac_soft_ring_worker_wakeup(mac_soft_ring_t *ringp)
1268275SEric Cheng {
1278275SEric Cheng 	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
1288275SEric Cheng 	if (!(ringp->s_ring_state & S_RING_PROC) &&
1298275SEric Cheng 	    !(ringp->s_ring_state & S_RING_BLANK) &&
1308275SEric Cheng 	    (ringp->s_ring_tid == NULL)) {
1318275SEric Cheng 		if (ringp->s_ring_wait != 0) {
1328275SEric Cheng 			ringp->s_ring_tid =
1338275SEric Cheng 			    timeout(mac_soft_ring_fire, ringp,
1348275SEric Cheng 			    ringp->s_ring_wait);
1358275SEric Cheng 		} else {
1368275SEric Cheng 			/* Schedule the worker thread. */
1378275SEric Cheng 			cv_signal(&ringp->s_ring_async);
1388275SEric Cheng 		}
1398275SEric Cheng 	}
1408275SEric Cheng }
1418275SEric Cheng 
1428275SEric Cheng /*
1438275SEric Cheng  * mac_soft_ring_create
1448275SEric Cheng  *
1458275SEric Cheng  * Create a soft ring, do the necessary setup and bind the worker
1468275SEric Cheng  * thread to the assigned CPU.
1478275SEric Cheng  */
1488275SEric Cheng mac_soft_ring_t *
mac_soft_ring_create(int id,clock_t wait,uint16_t type,pri_t pri,mac_client_impl_t * mcip,mac_soft_ring_set_t * mac_srs,processorid_t cpuid,mac_direct_rx_t rx_func,void * x_arg1,mac_resource_handle_t x_arg2)149*11878SVenu.Iyer@Sun.COM mac_soft_ring_create(int id, clock_t wait, uint16_t type,
1508275SEric Cheng     pri_t pri, mac_client_impl_t *mcip, mac_soft_ring_set_t *mac_srs,
1518275SEric Cheng     processorid_t cpuid, mac_direct_rx_t rx_func, void *x_arg1,
1528275SEric Cheng     mac_resource_handle_t x_arg2)
1538275SEric Cheng {
1548275SEric Cheng 	mac_soft_ring_t 	*ringp;
1558400SNicolas.Droux@Sun.COM 	char 			name[S_RING_NAMELEN];
1568275SEric Cheng 
1578275SEric Cheng 	bzero(name, 64);
1588275SEric Cheng 	ringp = kmem_cache_alloc(mac_soft_ring_cache, KM_SLEEP);
1598275SEric Cheng 
1608275SEric Cheng 	if (type & ST_RING_TCP) {
1618275SEric Cheng 		(void) snprintf(name, sizeof (name),
1628400SNicolas.Droux@Sun.COM 		    "mac_tcp_soft_ring_%d_%p", id, (void *)mac_srs);
1638275SEric Cheng 	} else if (type & ST_RING_UDP) {
1648275SEric Cheng 		(void) snprintf(name, sizeof (name),
1658400SNicolas.Droux@Sun.COM 		    "mac_udp_soft_ring_%d_%p", id, (void *)mac_srs);
166*11878SVenu.Iyer@Sun.COM 	} else if (type & ST_RING_OTH) {
1678275SEric Cheng 		(void) snprintf(name, sizeof (name),
1688400SNicolas.Droux@Sun.COM 		    "mac_oth_soft_ring_%d_%p", id, (void *)mac_srs);
169*11878SVenu.Iyer@Sun.COM 	} else {
170*11878SVenu.Iyer@Sun.COM 		ASSERT(type & ST_RING_TX);
171*11878SVenu.Iyer@Sun.COM 		(void) snprintf(name, sizeof (name),
172*11878SVenu.Iyer@Sun.COM 		    "mac_tx_soft_ring_%d_%p", id, (void *)mac_srs);
1738275SEric Cheng 	}
1748275SEric Cheng 
1758275SEric Cheng 	bzero(ringp, sizeof (mac_soft_ring_t));
1768275SEric Cheng 	(void) strncpy(ringp->s_ring_name, name, S_RING_NAMELEN + 1);
1778275SEric Cheng 	ringp->s_ring_name[S_RING_NAMELEN] = '\0';
1788275SEric Cheng 	mutex_init(&ringp->s_ring_lock, NULL, MUTEX_DEFAULT, NULL);
1798275SEric Cheng 	ringp->s_ring_notify_cb_info.mcbi_lockp = &ringp->s_ring_lock;
1808275SEric Cheng 
1818275SEric Cheng 	ringp->s_ring_type = type;
1828275SEric Cheng 	ringp->s_ring_wait = MSEC_TO_TICK(wait);
1838275SEric Cheng 	ringp->s_ring_mcip = mcip;
1848275SEric Cheng 	ringp->s_ring_set = mac_srs;
1858275SEric Cheng 
1868275SEric Cheng 	/*
1878275SEric Cheng 	 * Protect against access from DR callbacks (mac_walk_srs_bind/unbind)
1888275SEric Cheng 	 * which can't grab the mac perimeter
1898275SEric Cheng 	 */
1908275SEric Cheng 	mutex_enter(&mac_srs->srs_lock);
1918275SEric Cheng 	ADD_SOFTRING_TO_SET(mac_srs, ringp);
1928275SEric Cheng 	mutex_exit(&mac_srs->srs_lock);
1938275SEric Cheng 
1948275SEric Cheng 	/*
1958275SEric Cheng 	 * set the bind CPU to -1 to indicate
1968275SEric Cheng 	 * no thread affinity set
1978275SEric Cheng 	 */
1988275SEric Cheng 	ringp->s_ring_cpuid = ringp->s_ring_cpuid_save = -1;
1998275SEric Cheng 	ringp->s_ring_worker = thread_create(NULL, 0,
2008275SEric Cheng 	    mac_soft_ring_worker, ringp, 0, &p0, TS_RUN, pri);
2018275SEric Cheng 	if (type & ST_RING_TX) {
2028275SEric Cheng 		ringp->s_ring_drain_func = mac_tx_soft_ring_drain;
2038275SEric Cheng 		ringp->s_ring_tx_arg1 = x_arg1;
2048275SEric Cheng 		ringp->s_ring_tx_arg2 = x_arg2;
2058275SEric Cheng 		ringp->s_ring_tx_max_q_cnt = mac_tx_soft_ring_max_q_cnt;
2068275SEric Cheng 		ringp->s_ring_tx_hiwat =
2078275SEric Cheng 		    (mac_tx_soft_ring_hiwat > mac_tx_soft_ring_max_q_cnt) ?
2088275SEric Cheng 		    mac_tx_soft_ring_max_q_cnt : mac_tx_soft_ring_hiwat;
209*11878SVenu.Iyer@Sun.COM 		if (mcip->mci_state_flags & MCIS_IS_AGGR) {
210*11878SVenu.Iyer@Sun.COM 			mac_srs_tx_t *tx = &mac_srs->srs_tx;
211*11878SVenu.Iyer@Sun.COM 
212*11878SVenu.Iyer@Sun.COM 			ASSERT(tx->st_soft_rings[
213*11878SVenu.Iyer@Sun.COM 			    ((mac_ring_t *)x_arg2)->mr_index] == NULL);
214*11878SVenu.Iyer@Sun.COM 			tx->st_soft_rings[((mac_ring_t *)x_arg2)->mr_index] =
215*11878SVenu.Iyer@Sun.COM 			    ringp;
216*11878SVenu.Iyer@Sun.COM 		}
2178275SEric Cheng 	} else {
2188275SEric Cheng 		ringp->s_ring_drain_func = mac_rx_soft_ring_drain;
2198275SEric Cheng 		ringp->s_ring_rx_func = rx_func;
2208275SEric Cheng 		ringp->s_ring_rx_arg1 = x_arg1;
2218275SEric Cheng 		ringp->s_ring_rx_arg2 = x_arg2;
2228833SVenu.Iyer@Sun.COM 		if (mac_srs->srs_state & SRS_SOFTRING_QUEUE)
2238833SVenu.Iyer@Sun.COM 			ringp->s_ring_type |= ST_RING_WORKER_ONLY;
2248275SEric Cheng 	}
2258275SEric Cheng 	if (cpuid != -1)
2268275SEric Cheng 		(void) mac_soft_ring_bind(ringp, cpuid);
2278275SEric Cheng 
228*11878SVenu.Iyer@Sun.COM 	mac_soft_ring_stat_create(ringp);
229*11878SVenu.Iyer@Sun.COM 
2308275SEric Cheng 	return (ringp);
2318275SEric Cheng }
2328275SEric Cheng 
2338275SEric Cheng /*
2348275SEric Cheng  * mac_soft_ring_free
2358275SEric Cheng  *
2368275SEric Cheng  * Free the soft ring once we are done with it.
2378275SEric Cheng  */
2388275SEric Cheng void
mac_soft_ring_free(mac_soft_ring_t * softring)239*11878SVenu.Iyer@Sun.COM mac_soft_ring_free(mac_soft_ring_t *softring)
2408275SEric Cheng {
2418275SEric Cheng 	ASSERT((softring->s_ring_state &
2428275SEric Cheng 	    (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE | S_RING_PROC)) ==
2438275SEric Cheng 	    (S_RING_CONDEMNED | S_RING_CONDEMNED_DONE));
2448275SEric Cheng 	mac_pkt_drop(NULL, NULL, softring->s_ring_first, B_FALSE);
245*11878SVenu.Iyer@Sun.COM 	softring->s_ring_tx_arg2 = NULL;
246*11878SVenu.Iyer@Sun.COM 	mac_soft_ring_stat_delete(softring);
2478275SEric Cheng 	mac_callback_free(softring->s_ring_notify_cb_list);
2488275SEric Cheng 	kmem_cache_free(mac_soft_ring_cache, softring);
2498275SEric Cheng }
2508275SEric Cheng 
2518275SEric Cheng int mac_soft_ring_thread_bind = 1;
2528275SEric Cheng 
2538275SEric Cheng /*
2548275SEric Cheng  * mac_soft_ring_bind
2558275SEric Cheng  *
2568275SEric Cheng  * Bind a soft ring worker thread to supplied CPU.
2578275SEric Cheng  */
2588275SEric Cheng cpu_t *
mac_soft_ring_bind(mac_soft_ring_t * ringp,processorid_t cpuid)2598275SEric Cheng mac_soft_ring_bind(mac_soft_ring_t *ringp, processorid_t cpuid)
2608275SEric Cheng {
2618275SEric Cheng 	cpu_t *cp;
2628275SEric Cheng 	boolean_t clear = B_FALSE;
2638275SEric Cheng 
2648275SEric Cheng 	ASSERT(MUTEX_HELD(&cpu_lock));
2658275SEric Cheng 
2668275SEric Cheng 	if (mac_soft_ring_thread_bind == 0) {
2678275SEric Cheng 		DTRACE_PROBE1(mac__soft__ring__no__cpu__bound,
2688275SEric Cheng 		    mac_soft_ring_t *, ringp);
2698275SEric Cheng 		return (NULL);
2708275SEric Cheng 	}
2718275SEric Cheng 
2728275SEric Cheng 	cp = cpu_get(cpuid);
2738275SEric Cheng 	if (cp == NULL || !cpu_is_online(cp))
2748275SEric Cheng 		return (NULL);
2758275SEric Cheng 
2768275SEric Cheng 	mutex_enter(&ringp->s_ring_lock);
2778275SEric Cheng 	ringp->s_ring_state |= S_RING_BOUND;
2788275SEric Cheng 	if (ringp->s_ring_cpuid != -1)
2798275SEric Cheng 		clear = B_TRUE;
2808275SEric Cheng 	ringp->s_ring_cpuid = cpuid;
2818275SEric Cheng 	mutex_exit(&ringp->s_ring_lock);
2828275SEric Cheng 
2838275SEric Cheng 	if (clear)
2848275SEric Cheng 		thread_affinity_clear(ringp->s_ring_worker);
2858275SEric Cheng 
2868275SEric Cheng 	DTRACE_PROBE2(mac__soft__ring__cpu__bound, mac_soft_ring_t *,
2878275SEric Cheng 	    ringp, processorid_t, cpuid);
2888275SEric Cheng 
2898275SEric Cheng 	thread_affinity_set(ringp->s_ring_worker, cpuid);
2908275SEric Cheng 
2918275SEric Cheng 	return (cp);
2928275SEric Cheng }
2938275SEric Cheng 
2948275SEric Cheng /*
2958275SEric Cheng  * mac_soft_ring_unbind
2968275SEric Cheng  *
2978275SEric Cheng  * Un Bind a soft ring worker thread.
2988275SEric Cheng  */
2998275SEric Cheng void
mac_soft_ring_unbind(mac_soft_ring_t * ringp)3008275SEric Cheng mac_soft_ring_unbind(mac_soft_ring_t *ringp)
3018275SEric Cheng {
3028275SEric Cheng 	ASSERT(MUTEX_HELD(&cpu_lock));
3038275SEric Cheng 
3048275SEric Cheng 	mutex_enter(&ringp->s_ring_lock);
3058275SEric Cheng 	if (!(ringp->s_ring_state & S_RING_BOUND)) {
3068275SEric Cheng 		ASSERT(ringp->s_ring_cpuid == -1);
3078275SEric Cheng 		mutex_exit(&ringp->s_ring_lock);
3088275SEric Cheng 		return;
3098275SEric Cheng 	}
3108275SEric Cheng 
3118275SEric Cheng 	ringp->s_ring_cpuid = -1;
3128275SEric Cheng 	ringp->s_ring_state &= ~S_RING_BOUND;
3138275SEric Cheng 	thread_affinity_clear(ringp->s_ring_worker);
3148275SEric Cheng 	mutex_exit(&ringp->s_ring_lock);
3158275SEric Cheng }
3168275SEric Cheng 
3178275SEric Cheng /*
3188275SEric Cheng  * PRIVATE FUNCTIONS
3198275SEric Cheng  */
3208275SEric Cheng 
3218275SEric Cheng static void
mac_soft_ring_fire(void * arg)3228275SEric Cheng mac_soft_ring_fire(void *arg)
3238275SEric Cheng {
3248275SEric Cheng 	mac_soft_ring_t	*ringp = arg;
3258275SEric Cheng 
3268275SEric Cheng 	mutex_enter(&ringp->s_ring_lock);
3278275SEric Cheng 	if (ringp->s_ring_tid == 0) {
3288275SEric Cheng 		mutex_exit(&ringp->s_ring_lock);
3298275SEric Cheng 		return;
3308275SEric Cheng 	}
3318275SEric Cheng 
3328275SEric Cheng 	ringp->s_ring_tid = 0;
3338275SEric Cheng 
3348275SEric Cheng 	if (!(ringp->s_ring_state & S_RING_PROC)) {
3358275SEric Cheng 		cv_signal(&ringp->s_ring_async);
3368275SEric Cheng 	}
3378275SEric Cheng 	mutex_exit(&ringp->s_ring_lock);
3388275SEric Cheng }
3398275SEric Cheng 
3408275SEric Cheng /*
3418275SEric Cheng  * mac_rx_soft_ring_drain
3428275SEric Cheng  *
3438275SEric Cheng  * Called when worker thread model (ST_RING_WORKER_ONLY) of processing
3448275SEric Cheng  * incoming packets is used. s_ring_first contain the queued packets.
3458275SEric Cheng  * s_ring_rx_func contains the upper level (client) routine where the
3468275SEric Cheng  * packets are destined and s_ring_rx_arg1/s_ring_rx_arg2 are the
3478275SEric Cheng  * cookie meant for the client.
3488275SEric Cheng  */
3498275SEric Cheng /* ARGSUSED */
3508275SEric Cheng static void
mac_rx_soft_ring_drain(mac_soft_ring_t * ringp)3518275SEric Cheng mac_rx_soft_ring_drain(mac_soft_ring_t *ringp)
3528275SEric Cheng {
3538275SEric Cheng 	mblk_t		*mp;
3548275SEric Cheng 	void		*arg1;
3558275SEric Cheng 	mac_resource_handle_t arg2;
3568275SEric Cheng 	timeout_id_t 	tid;
3578275SEric Cheng 	mac_direct_rx_t	proc;
3588275SEric Cheng 	size_t		sz;
3598275SEric Cheng 	int		cnt;
3608275SEric Cheng 	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
3618275SEric Cheng 
3628275SEric Cheng 	ringp->s_ring_run = curthread;
3638275SEric Cheng 	ASSERT(mutex_owned(&ringp->s_ring_lock));
3648275SEric Cheng 	ASSERT(!(ringp->s_ring_state & S_RING_PROC));
3658275SEric Cheng 
3668275SEric Cheng 	if ((tid = ringp->s_ring_tid) != 0)
3678275SEric Cheng 		ringp->s_ring_tid = 0;
3688275SEric Cheng 
3698275SEric Cheng 	ringp->s_ring_state |= S_RING_PROC;
3708275SEric Cheng 
3718275SEric Cheng 	proc = ringp->s_ring_rx_func;
3728275SEric Cheng 	arg1 = ringp->s_ring_rx_arg1;
3738275SEric Cheng 	arg2 = ringp->s_ring_rx_arg2;
3748275SEric Cheng 
3758275SEric Cheng 	while ((ringp->s_ring_first != NULL) &&
3768275SEric Cheng 	    !(ringp->s_ring_state & S_RING_PAUSE)) {
3778275SEric Cheng 		mp = ringp->s_ring_first;
3788275SEric Cheng 		ringp->s_ring_first = NULL;
3798275SEric Cheng 		ringp->s_ring_last = NULL;
3808275SEric Cheng 		cnt = ringp->s_ring_count;
3818275SEric Cheng 		ringp->s_ring_count = 0;
3828275SEric Cheng 		sz = ringp->s_ring_size;
3838275SEric Cheng 		ringp->s_ring_size = 0;
3848275SEric Cheng 		mutex_exit(&ringp->s_ring_lock);
3858275SEric Cheng 
3868275SEric Cheng 		if (tid != 0) {
3878275SEric Cheng 			(void) untimeout(tid);
3888275SEric Cheng 			tid = 0;
3898275SEric Cheng 		}
3908275SEric Cheng 
3918275SEric Cheng 		(*proc)(arg1, arg2, mp, NULL);
3928275SEric Cheng 
3938275SEric Cheng 		/*
3948275SEric Cheng 		 * If we have a soft ring set which is doing
3958275SEric Cheng 		 * bandwidth control, we need to decrement its
3968275SEric Cheng 		 * srs_size so it can have a accurate idea of
3978275SEric Cheng 		 * what is the real data queued between SRS and
3988275SEric Cheng 		 * its soft rings. We decrement the size for a
3998275SEric Cheng 		 * packet only when it gets processed by both
4008275SEric Cheng 		 * SRS and the soft ring.
4018275SEric Cheng 		 */
4028275SEric Cheng 		mutex_enter(&mac_srs->srs_lock);
4038275SEric Cheng 		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
4048275SEric Cheng 		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
4058275SEric Cheng 		mutex_exit(&mac_srs->srs_lock);
4068275SEric Cheng 
4078275SEric Cheng 		mutex_enter(&ringp->s_ring_lock);
4088275SEric Cheng 	}
4098275SEric Cheng 	ringp->s_ring_state &= ~S_RING_PROC;
4108275SEric Cheng 	if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
4118275SEric Cheng 		cv_signal(&ringp->s_ring_client_cv);
4128275SEric Cheng 	ringp->s_ring_run = NULL;
4138275SEric Cheng }
4148275SEric Cheng 
4158275SEric Cheng /*
4168275SEric Cheng  * mac_soft_ring_worker
4178275SEric Cheng  *
4188275SEric Cheng  * The soft ring worker routine to process any queued packets. In
4198275SEric Cheng  * normal case, the worker thread is bound to a CPU. It the soft
4208275SEric Cheng  * ring is dealing with TCP packets, then the worker thread will
4218275SEric Cheng  * be bound to the same CPU as the TCP squeue.
4228275SEric Cheng  */
4238275SEric Cheng static void
mac_soft_ring_worker(mac_soft_ring_t * ringp)4248275SEric Cheng mac_soft_ring_worker(mac_soft_ring_t *ringp)
4258275SEric Cheng {
4268275SEric Cheng 	kmutex_t *lock = &ringp->s_ring_lock;
4278275SEric Cheng 	kcondvar_t *async = &ringp->s_ring_async;
4288275SEric Cheng 	mac_soft_ring_set_t *srs = ringp->s_ring_set;
4298275SEric Cheng 	callb_cpr_t cprinfo;
4308275SEric Cheng 
4318275SEric Cheng 	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_soft_ring");
4328275SEric Cheng 	mutex_enter(lock);
4338275SEric Cheng start:
4348275SEric Cheng 	for (;;) {
4358275SEric Cheng 		while (((ringp->s_ring_first == NULL ||
4369883SRajagopal.Kunhappan@Sun.COM 		    (ringp->s_ring_state & (S_RING_BLOCK|S_RING_BLANK))) &&
4378275SEric Cheng 		    !(ringp->s_ring_state & S_RING_PAUSE)) ||
4388275SEric Cheng 		    (ringp->s_ring_state & S_RING_PROC)) {
4398275SEric Cheng 
4408275SEric Cheng 			CALLB_CPR_SAFE_BEGIN(&cprinfo);
4418275SEric Cheng 			cv_wait(async, lock);
4428275SEric Cheng 			CALLB_CPR_SAFE_END(&cprinfo, lock);
4438275SEric Cheng 		}
4448275SEric Cheng 
4458275SEric Cheng 		/*
4468275SEric Cheng 		 * Either we have work to do, or we have been asked to
4478275SEric Cheng 		 * shutdown temporarily or permanently
4488275SEric Cheng 		 */
4498275SEric Cheng 		if (ringp->s_ring_state & S_RING_PAUSE)
4508275SEric Cheng 			goto done;
4518275SEric Cheng 
4528275SEric Cheng 		ringp->s_ring_drain_func(ringp);
4538275SEric Cheng 	}
4548275SEric Cheng done:
4558275SEric Cheng 	mutex_exit(lock);
4568275SEric Cheng 	mutex_enter(&srs->srs_lock);
4578275SEric Cheng 	mutex_enter(lock);
4588275SEric Cheng 
4598275SEric Cheng 	ringp->s_ring_state |= S_RING_QUIESCE_DONE;
4608275SEric Cheng 	if (!(ringp->s_ring_state & S_RING_CONDEMNED)) {
4618275SEric Cheng 		srs->srs_soft_ring_quiesced_count++;
4628275SEric Cheng 		cv_broadcast(&srs->srs_async);
4638275SEric Cheng 		mutex_exit(&srs->srs_lock);
4648275SEric Cheng 		while (!(ringp->s_ring_state &
4658275SEric Cheng 		    (S_RING_RESTART | S_RING_CONDEMNED)))
4668275SEric Cheng 			cv_wait(&ringp->s_ring_async, &ringp->s_ring_lock);
4678275SEric Cheng 		mutex_exit(lock);
4688275SEric Cheng 		mutex_enter(&srs->srs_lock);
4698275SEric Cheng 		mutex_enter(lock);
4708275SEric Cheng 		srs->srs_soft_ring_quiesced_count--;
4718275SEric Cheng 		if (ringp->s_ring_state & S_RING_RESTART) {
4728275SEric Cheng 			ASSERT(!(ringp->s_ring_state & S_RING_CONDEMNED));
4738275SEric Cheng 			ringp->s_ring_state &= ~(S_RING_RESTART |
4748275SEric Cheng 			    S_RING_QUIESCE | S_RING_QUIESCE_DONE);
4758275SEric Cheng 			cv_broadcast(&srs->srs_async);
4768275SEric Cheng 			mutex_exit(&srs->srs_lock);
4778275SEric Cheng 			goto start;
4788275SEric Cheng 		}
4798275SEric Cheng 	}
4808275SEric Cheng 	ASSERT(ringp->s_ring_state & S_RING_CONDEMNED);
4818275SEric Cheng 	ringp->s_ring_state |= S_RING_CONDEMNED_DONE;
4828275SEric Cheng 	CALLB_CPR_EXIT(&cprinfo);
4838275SEric Cheng 	srs->srs_soft_ring_condemned_count++;
4848275SEric Cheng 	cv_broadcast(&srs->srs_async);
4858275SEric Cheng 	mutex_exit(&srs->srs_lock);
4868275SEric Cheng 	thread_exit();
4878275SEric Cheng }
4888275SEric Cheng 
4898275SEric Cheng /*
4908275SEric Cheng  * mac_soft_ring_intr_enable and mac_soft_ring_intr_disable
4918275SEric Cheng  *
4928275SEric Cheng  * these functions are called to toggle the sending of packets to the
4938275SEric Cheng  * client. They are called by the client. the client gets the name
4948275SEric Cheng  * of these routine and corresponding cookie (pointing to softring)
4958275SEric Cheng  * during capability negotiation at setup time.
4968275SEric Cheng  *
4978275SEric Cheng  * Enabling is allow the processing thread to send packets to the
4988275SEric Cheng  * client while disabling does the opposite.
4998275SEric Cheng  */
5008275SEric Cheng void
mac_soft_ring_intr_enable(void * arg)5018275SEric Cheng mac_soft_ring_intr_enable(void *arg)
5028275SEric Cheng {
5038275SEric Cheng 	mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
5048275SEric Cheng 	mutex_enter(&ringp->s_ring_lock);
5058275SEric Cheng 	ringp->s_ring_state &= ~S_RING_BLANK;
5068275SEric Cheng 	if (ringp->s_ring_first != NULL)
5078275SEric Cheng 		mac_soft_ring_worker_wakeup(ringp);
5088275SEric Cheng 	mutex_exit(&ringp->s_ring_lock);
5098275SEric Cheng }
5108275SEric Cheng 
5119883SRajagopal.Kunhappan@Sun.COM boolean_t
mac_soft_ring_intr_disable(void * arg)5128275SEric Cheng mac_soft_ring_intr_disable(void *arg)
5138275SEric Cheng {
5148275SEric Cheng 	mac_soft_ring_t *ringp = (mac_soft_ring_t *)arg;
5159883SRajagopal.Kunhappan@Sun.COM 	boolean_t sring_blanked = B_FALSE;
5168275SEric Cheng 	/*
5178275SEric Cheng 	 * Stop worker thread from sending packets above.
5188275SEric Cheng 	 * Squeue will poll soft ring when it needs packets.
5198275SEric Cheng 	 */
5208275SEric Cheng 	mutex_enter(&ringp->s_ring_lock);
5219883SRajagopal.Kunhappan@Sun.COM 	if (!(ringp->s_ring_state & S_RING_PROC)) {
5229883SRajagopal.Kunhappan@Sun.COM 		ringp->s_ring_state |= S_RING_BLANK;
5239883SRajagopal.Kunhappan@Sun.COM 		sring_blanked = B_TRUE;
5249883SRajagopal.Kunhappan@Sun.COM 	}
5258275SEric Cheng 	mutex_exit(&ringp->s_ring_lock);
5269883SRajagopal.Kunhappan@Sun.COM 	return (sring_blanked);
5278275SEric Cheng }
5288275SEric Cheng 
5298275SEric Cheng /*
5308275SEric Cheng  * mac_soft_ring_poll
5318275SEric Cheng  *
5328275SEric Cheng  * This routine is called by the client to poll for packets from
5338275SEric Cheng  * the soft ring. The function name and cookie corresponding to
5348275SEric Cheng  * the soft ring is exchanged during capability negotiation during
5358275SEric Cheng  * setup.
5368275SEric Cheng  */
5378275SEric Cheng mblk_t *
mac_soft_ring_poll(mac_soft_ring_t * ringp,int bytes_to_pickup)5388275SEric Cheng mac_soft_ring_poll(mac_soft_ring_t *ringp, int bytes_to_pickup)
5398275SEric Cheng {
5408275SEric Cheng 	mblk_t	*head, *tail;
5418275SEric Cheng 	mblk_t	*mp;
5428275SEric Cheng 	size_t	sz = 0;
5438275SEric Cheng 	int	cnt = 0;
5448275SEric Cheng 	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
5458275SEric Cheng 
5468275SEric Cheng 	ASSERT(mac_srs != NULL);
5478275SEric Cheng 
5488275SEric Cheng 	mutex_enter(&ringp->s_ring_lock);
5498275SEric Cheng 	head = tail = mp = ringp->s_ring_first;
5508275SEric Cheng 	if (head == NULL) {
5518275SEric Cheng 		mutex_exit(&ringp->s_ring_lock);
5528275SEric Cheng 		return (NULL);
5538275SEric Cheng 	}
5548275SEric Cheng 
5558275SEric Cheng 	if (ringp->s_ring_size <= bytes_to_pickup) {
5568275SEric Cheng 		head = ringp->s_ring_first;
5578275SEric Cheng 		ringp->s_ring_first = NULL;
5588275SEric Cheng 		ringp->s_ring_last = NULL;
5598275SEric Cheng 		cnt = ringp->s_ring_count;
5608275SEric Cheng 		ringp->s_ring_count = 0;
5618275SEric Cheng 		sz = ringp->s_ring_size;
5628275SEric Cheng 		ringp->s_ring_size = 0;
5638275SEric Cheng 	} else {
5648275SEric Cheng 		while (mp && sz <= bytes_to_pickup) {
5658275SEric Cheng 			sz += msgdsize(mp);
5668275SEric Cheng 			cnt++;
5678275SEric Cheng 			tail = mp;
5688275SEric Cheng 			mp = mp->b_next;
5698275SEric Cheng 		}
5708275SEric Cheng 		ringp->s_ring_count -= cnt;
5718275SEric Cheng 		ringp->s_ring_size -= sz;
5728275SEric Cheng 		tail->b_next = NULL;
5738275SEric Cheng 		if (mp == NULL) {
5748275SEric Cheng 			ringp->s_ring_first = NULL;
5758275SEric Cheng 			ringp->s_ring_last = NULL;
5768275SEric Cheng 			ASSERT(ringp->s_ring_count == 0);
5778275SEric Cheng 		} else {
5788275SEric Cheng 			ringp->s_ring_first = mp;
5798275SEric Cheng 		}
5808275SEric Cheng 	}
5818275SEric Cheng 
5828275SEric Cheng 	mutex_exit(&ringp->s_ring_lock);
5838275SEric Cheng 	/*
5848275SEric Cheng 	 * Update the shared count and size counters so
5858275SEric Cheng 	 * that SRS has a accurate idea of queued packets.
5868275SEric Cheng 	 */
5878275SEric Cheng 	mutex_enter(&mac_srs->srs_lock);
5888275SEric Cheng 	MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
5898275SEric Cheng 	MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
5908275SEric Cheng 	mutex_exit(&mac_srs->srs_lock);
5918275SEric Cheng 	return (head);
5928275SEric Cheng }
5938275SEric Cheng 
5948275SEric Cheng /*
5958275SEric Cheng  * mac_soft_ring_dls_bypass
5968275SEric Cheng  *
5978275SEric Cheng  * Enable direct client (IP) callback function from the softrings.
5988275SEric Cheng  * Callers need to make sure they don't need any DLS layer processing
5998275SEric Cheng  */
6008275SEric Cheng void
mac_soft_ring_dls_bypass(void * arg,mac_direct_rx_t rx_func,void * rx_arg1)6018275SEric Cheng mac_soft_ring_dls_bypass(void *arg, mac_direct_rx_t rx_func, void *rx_arg1)
6028275SEric Cheng {
6038275SEric Cheng 	mac_soft_ring_t		*softring = arg;
6048275SEric Cheng 	mac_soft_ring_set_t	*srs;
6058275SEric Cheng 
6068275SEric Cheng 	ASSERT(rx_func != NULL);
6078275SEric Cheng 
6088275SEric Cheng 	mutex_enter(&softring->s_ring_lock);
6098275SEric Cheng 	softring->s_ring_rx_func = rx_func;
6108275SEric Cheng 	softring->s_ring_rx_arg1 = rx_arg1;
6118275SEric Cheng 	mutex_exit(&softring->s_ring_lock);
6128275SEric Cheng 
6138275SEric Cheng 	srs = softring->s_ring_set;
6148275SEric Cheng 	mutex_enter(&srs->srs_lock);
6158275SEric Cheng 	srs->srs_type |= SRST_DLS_BYPASS;
6168275SEric Cheng 	mutex_exit(&srs->srs_lock);
6178275SEric Cheng }
6188275SEric Cheng 
6198275SEric Cheng /*
6208275SEric Cheng  * mac_soft_ring_signal
6218275SEric Cheng  *
6228275SEric Cheng  * Typically used to set the soft ring state to QUIESCE, CONDEMNED, or
6238275SEric Cheng  * RESTART.
6248275SEric Cheng  *
6258275SEric Cheng  * In the Rx side, the quiescing is done bottom up. After the Rx upcalls
6268275SEric Cheng  * from the driver are done, then the Rx SRS is quiesced and only then can
6278275SEric Cheng  * we signal the soft rings. Thus this function can't be called arbitrarily
6288275SEric Cheng  * without satisfying the prerequisites. On the Tx side, the threads from
6298275SEric Cheng  * top need to quiesced, then the Tx SRS and only then can we signal the
6308275SEric Cheng  * Tx soft rings.
6318275SEric Cheng  */
6328275SEric Cheng void
mac_soft_ring_signal(mac_soft_ring_t * softring,uint_t sr_flag)6338275SEric Cheng mac_soft_ring_signal(mac_soft_ring_t *softring, uint_t sr_flag)
6348275SEric Cheng {
6358275SEric Cheng 	mutex_enter(&softring->s_ring_lock);
6368275SEric Cheng 	softring->s_ring_state |= sr_flag;
6378275SEric Cheng 	cv_signal(&softring->s_ring_async);
6388275SEric Cheng 	mutex_exit(&softring->s_ring_lock);
6398275SEric Cheng }
6408275SEric Cheng 
6418275SEric Cheng /*
6428275SEric Cheng  * mac_tx_soft_ring_drain
6438275SEric Cheng  *
6448275SEric Cheng  * The transmit side drain routine in case the soft ring was being
6458275SEric Cheng  * used to transmit packets.
6468275SEric Cheng  */
6478275SEric Cheng static void
mac_tx_soft_ring_drain(mac_soft_ring_t * ringp)6488275SEric Cheng mac_tx_soft_ring_drain(mac_soft_ring_t *ringp)
6498275SEric Cheng {
6508275SEric Cheng 	mblk_t 			*mp;
6518275SEric Cheng 	void 			*arg1;
6528275SEric Cheng 	void 			*arg2;
6538275SEric Cheng 	mblk_t 			*tail;
6548275SEric Cheng 	uint_t			saved_pkt_count, saved_size;
6558275SEric Cheng 	mac_tx_stats_t		stats;
6568275SEric Cheng 	mac_soft_ring_set_t	*mac_srs = ringp->s_ring_set;
6578275SEric Cheng 
6588275SEric Cheng 	saved_pkt_count = saved_size = 0;
6598275SEric Cheng 	ringp->s_ring_run = curthread;
6608275SEric Cheng 	ASSERT(mutex_owned(&ringp->s_ring_lock));
6618275SEric Cheng 	ASSERT(!(ringp->s_ring_state & S_RING_PROC));
6628275SEric Cheng 
6638275SEric Cheng 	ringp->s_ring_state |= S_RING_PROC;
6648275SEric Cheng 	arg1 = ringp->s_ring_tx_arg1;
6658275SEric Cheng 	arg2 = ringp->s_ring_tx_arg2;
6668275SEric Cheng 
6678275SEric Cheng 	while (ringp->s_ring_first != NULL) {
6688275SEric Cheng 		mp = ringp->s_ring_first;
6698275SEric Cheng 		tail = ringp->s_ring_last;
6708275SEric Cheng 		saved_pkt_count = ringp->s_ring_count;
6718275SEric Cheng 		saved_size = ringp->s_ring_size;
6728275SEric Cheng 		ringp->s_ring_first = NULL;
6738275SEric Cheng 		ringp->s_ring_last = NULL;
6748275SEric Cheng 		ringp->s_ring_count = 0;
6758275SEric Cheng 		ringp->s_ring_size = 0;
6768275SEric Cheng 		mutex_exit(&ringp->s_ring_lock);
6778275SEric Cheng 
6788275SEric Cheng 		mp = mac_tx_send(arg1, arg2, mp, &stats);
6798275SEric Cheng 
6808275SEric Cheng 		mutex_enter(&ringp->s_ring_lock);
6818275SEric Cheng 		if (mp != NULL) {
6828275SEric Cheng 			/* Device out of tx desc, set block */
6838275SEric Cheng 			tail->b_next = ringp->s_ring_first;
6848275SEric Cheng 			ringp->s_ring_first = mp;
6858275SEric Cheng 			ringp->s_ring_count +=
686*11878SVenu.Iyer@Sun.COM 			    (saved_pkt_count - stats.mts_opackets);
687*11878SVenu.Iyer@Sun.COM 			ringp->s_ring_size += (saved_size - stats.mts_obytes);
6888275SEric Cheng 			if (ringp->s_ring_last == NULL)
6898275SEric Cheng 				ringp->s_ring_last = tail;
6908275SEric Cheng 
6918275SEric Cheng 			if (ringp->s_ring_tx_woken_up) {
6928275SEric Cheng 				ringp->s_ring_tx_woken_up = B_FALSE;
6938275SEric Cheng 			} else {
6948275SEric Cheng 				ringp->s_ring_state |= S_RING_BLOCK;
695*11878SVenu.Iyer@Sun.COM 				ringp->s_st_stat.mts_blockcnt++;
6968275SEric Cheng 			}
6978275SEric Cheng 
6988275SEric Cheng 			ringp->s_ring_state &= ~S_RING_PROC;
6998275SEric Cheng 			ringp->s_ring_run = NULL;
7008275SEric Cheng 			return;
7018275SEric Cheng 		} else {
7028275SEric Cheng 			ringp->s_ring_tx_woken_up = B_FALSE;
703*11878SVenu.Iyer@Sun.COM 			SRS_TX_STATS_UPDATE(mac_srs, &stats);
704*11878SVenu.Iyer@Sun.COM 			SOFTRING_TX_STATS_UPDATE(ringp, &stats);
7058275SEric Cheng 		}
7068275SEric Cheng 	}
7078275SEric Cheng 
7088275SEric Cheng 	if (ringp->s_ring_count == 0 && ringp->s_ring_state &
7098275SEric Cheng 	    (S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED)) {
7108275SEric Cheng 		mac_client_impl_t *mcip =  ringp->s_ring_mcip;
7118275SEric Cheng 		boolean_t wakeup_required = B_FALSE;
7128275SEric Cheng 
7138275SEric Cheng 		if (ringp->s_ring_state &
7148275SEric Cheng 		    (S_RING_TX_HIWAT|S_RING_WAKEUP_CLIENT)) {
7158275SEric Cheng 			wakeup_required = B_TRUE;
7168275SEric Cheng 		}
7178275SEric Cheng 		ringp->s_ring_state &=
7188275SEric Cheng 		    ~(S_RING_TX_HIWAT | S_RING_WAKEUP_CLIENT | S_RING_ENQUEUED);
7198275SEric Cheng 		mutex_exit(&ringp->s_ring_lock);
7208275SEric Cheng 		if (wakeup_required) {
721*11878SVenu.Iyer@Sun.COM 			mac_tx_invoke_callbacks(mcip, (mac_tx_cookie_t)ringp);
7228275SEric Cheng 			/*
7238275SEric Cheng 			 * If the client is not the primary MAC client, then we
7248275SEric Cheng 			 * need to send the notification to the clients upper
7258275SEric Cheng 			 * MAC, i.e. mci_upper_mip.
7268275SEric Cheng 			 */
7278275SEric Cheng 			mac_tx_notify(mcip->mci_upper_mip != NULL ?
7288275SEric Cheng 			    mcip->mci_upper_mip : mcip->mci_mip);
7298275SEric Cheng 		}
7308275SEric Cheng 		mutex_enter(&ringp->s_ring_lock);
7318275SEric Cheng 	}
7328275SEric Cheng 	ringp->s_ring_state &= ~S_RING_PROC;
7338275SEric Cheng 	ringp->s_ring_run = NULL;
7348275SEric Cheng }
735