18275SEric Cheng /* 28275SEric Cheng * CDDL HEADER START 38275SEric Cheng * 48275SEric Cheng * The contents of this file are subject to the terms of the 58275SEric Cheng * Common Development and Distribution License (the "License"). 68275SEric Cheng * You may not use this file except in compliance with the License. 78275SEric Cheng * 88275SEric Cheng * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 98275SEric Cheng * or http://www.opensolaris.org/os/licensing. 108275SEric Cheng * See the License for the specific language governing permissions 118275SEric Cheng * and limitations under the License. 128275SEric Cheng * 138275SEric Cheng * When distributing Covered Code, include this CDDL HEADER in each 148275SEric Cheng * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 158275SEric Cheng * If applicable, add the following below this CDDL HEADER, with the 168275SEric Cheng * fields enclosed by brackets "[]" replaced with your own identifying 178275SEric Cheng * information: Portions Copyright [yyyy] [name of copyright owner] 188275SEric Cheng * 198275SEric Cheng * CDDL HEADER END 208275SEric Cheng */ 218275SEric Cheng /* 228833SVenu.Iyer@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 238275SEric Cheng * Use is subject to license terms. 
248275SEric Cheng */ 258275SEric Cheng 268275SEric Cheng #include <sys/types.h> 278275SEric Cheng #include <sys/callb.h> 288275SEric Cheng #include <sys/sdt.h> 298275SEric Cheng #include <sys/strsubr.h> 308275SEric Cheng #include <sys/strsun.h> 318275SEric Cheng #include <sys/vlan.h> 328275SEric Cheng #include <inet/ipsec_impl.h> 338275SEric Cheng #include <inet/ip_impl.h> 348275SEric Cheng #include <inet/sadb.h> 358275SEric Cheng #include <inet/ipsecesp.h> 368275SEric Cheng #include <inet/ipsecah.h> 378275SEric Cheng #include <inet/ip6.h> 388275SEric Cheng 398275SEric Cheng #include <sys/mac_impl.h> 408275SEric Cheng #include <sys/mac_client_impl.h> 418275SEric Cheng #include <sys/mac_client_priv.h> 428275SEric Cheng #include <sys/mac_soft_ring.h> 438275SEric Cheng #include <sys/mac_flow_impl.h> 448275SEric Cheng 458275SEric Cheng static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *, 468275SEric Cheng uintptr_t, uint16_t, mblk_t **); 478275SEric Cheng static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *, 488275SEric Cheng uintptr_t, uint16_t, mblk_t **); 498275SEric Cheng static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *, 508275SEric Cheng uintptr_t, uint16_t, mblk_t **); 518275SEric Cheng static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *, 528275SEric Cheng uintptr_t, uint16_t, mblk_t **); 538275SEric Cheng 548275SEric Cheng typedef struct mac_tx_mode_s { 558275SEric Cheng mac_tx_srs_mode_t mac_tx_mode; 568275SEric Cheng mac_tx_func_t mac_tx_func; 578275SEric Cheng } mac_tx_mode_t; 588275SEric Cheng 598275SEric Cheng /* 608275SEric Cheng * There are five modes of operation on the Tx side. These modes get set 618275SEric Cheng * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode, 628275SEric Cheng * none of the other modes are user configurable. 
They get selected by 638275SEric Cheng * the system depending upon whether the link (or flow) has multiple Tx 648275SEric Cheng * rings or a bandwidth configured, etc. 658275SEric Cheng */ 668275SEric Cheng mac_tx_mode_t mac_tx_mode_list[] = { 678275SEric Cheng {SRS_TX_DEFAULT, mac_tx_single_ring_mode}, 688275SEric Cheng {SRS_TX_SERIALIZE, mac_tx_serializer_mode}, 698275SEric Cheng {SRS_TX_FANOUT, mac_tx_fanout_mode}, 708275SEric Cheng {SRS_TX_BW, mac_tx_bw_mode}, 718275SEric Cheng {SRS_TX_BW_FANOUT, mac_tx_bw_mode} 728275SEric Cheng }; 738275SEric Cheng 748275SEric Cheng /* 758275SEric Cheng * Soft Ring Set (SRS) - The Run time code that deals with 768275SEric Cheng * dynamic polling from the hardware, bandwidth enforcement, 778275SEric Cheng * fanout etc. 788275SEric Cheng * 798275SEric Cheng * We try to use H/W classification on NIC and assign traffic for 808275SEric Cheng * a MAC address to a particular Rx ring or ring group. There is a 818275SEric Cheng * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically 828275SEric Cheng * switches the underlying Rx ring between interrupt and 838275SEric Cheng * polling mode and enforces any specified B/W control. 848275SEric Cheng * 858275SEric Cheng * There is always a SRS created and tied to each H/W and S/W rule. 868275SEric Cheng * Whenever we create a H/W rule, we always add the the same rule to 878275SEric Cheng * S/W classifier and tie a SRS to it. 888275SEric Cheng * 898275SEric Cheng * In case a B/W control is specified, it is broken into bytes 908275SEric Cheng * per ticks and as soon as the quota for a tick is exhausted, 918275SEric Cheng * the underlying Rx ring is forced into poll mode for remainder of 928275SEric Cheng * the tick. The SRS poll thread only polls for bytes that are 938275SEric Cheng * allowed to come in the SRS. 
We typically let 4x the configured 948275SEric Cheng * B/W worth of packets to come in the SRS (to prevent unnecessary 958275SEric Cheng * drops due to bursts) but only process the specified amount. 968275SEric Cheng * 978275SEric Cheng * A MAC client (e.g. a VNIC or aggr) can have 1 or more 988275SEric Cheng * Rx rings (and corresponding SRSs) assigned to it. The SRS 998275SEric Cheng * in turn can have softrings to do protocol level fanout or 1008275SEric Cheng * softrings to do S/W based fanout or both. In case the NIC 1018275SEric Cheng * has no Rx rings, we do S/W classification to respective SRS. 1028275SEric Cheng * The S/W classification rule is always setup and ready. This 1038275SEric Cheng * allows the MAC layer to reassign Rx rings whenever needed 1048275SEric Cheng * but packets still continue to flow via the default path and 1058275SEric Cheng * getting S/W classified to correct SRS. 1068275SEric Cheng * 1078275SEric Cheng * The SRS's are used on both Tx and Rx side. They use the same 1088275SEric Cheng * data structure but the processing routines have slightly different 1098275SEric Cheng * semantics due to the fact that Rx side needs to do dynamic 1108275SEric Cheng * polling etc. 1118275SEric Cheng * 1128275SEric Cheng * Dynamic Polling Notes 1138275SEric Cheng * ===================== 1148275SEric Cheng * 1158275SEric Cheng * Each Soft ring set is capable of switching its Rx ring between 1168275SEric Cheng * interrupt and poll mode and actively 'polls' for packets in 1178275SEric Cheng * poll mode. If the SRS is implementing a B/W limit, it makes 1188275SEric Cheng * sure that only Max allowed packets are pulled in poll mode 1198275SEric Cheng * and goes to poll mode as soon as B/W limit is exceeded. As 1208275SEric Cheng * such, there are no overheads to implement B/W limits. 
1218275SEric Cheng * 1228275SEric Cheng * In poll mode, its better to keep the pipeline going where the 1238275SEric Cheng * SRS worker thread keeps processing packets and poll thread 1248275SEric Cheng * keeps bringing more packets (specially if they get to run 1258275SEric Cheng * on different CPUs). This also prevents the overheads associated 1268275SEric Cheng * by excessive signalling (on NUMA machines, this can be 1278275SEric Cheng * pretty devastating). The exception is latency optimized case 1288275SEric Cheng * where worker thread does no work and interrupt and poll thread 1298275SEric Cheng * are allowed to do their own drain. 1308275SEric Cheng * 1318275SEric Cheng * We use the following policy to control Dynamic Polling: 1328275SEric Cheng * 1) We switch to poll mode anytime the processing 1338275SEric Cheng * thread causes a backlog to build up in SRS and 1348275SEric Cheng * its associated Soft Rings (sr_poll_pkt_cnt > 0). 1358275SEric Cheng * 2) As long as the backlog stays under the low water 1368275SEric Cheng * mark (sr_lowat), we poll the H/W for more packets. 1378275SEric Cheng * 3) If the backlog (sr_poll_pkt_cnt) exceeds low 1388275SEric Cheng * water mark, we stay in poll mode but don't poll 1398275SEric Cheng * the H/W for more packets. 1408275SEric Cheng * 4) Anytime in polling mode, if we poll the H/W for 1418275SEric Cheng * packets and find nothing plus we have an existing 1428275SEric Cheng * backlog (sr_poll_pkt_cnt > 0), we stay in polling 1438275SEric Cheng * mode but don't poll the H/W for packets anymore 1448275SEric Cheng * (let the polling thread go to sleep). 1458275SEric Cheng * 5) Once the backlog is relived (packets are processed) 1468275SEric Cheng * we reenable polling (by signalling the poll thread) 1478275SEric Cheng * only when the backlog dips below sr_poll_thres. 
1488275SEric Cheng * 6) sr_hiwat is used exclusively when we are not 1498275SEric Cheng * polling capable and is used to decide when to 1508275SEric Cheng * drop packets so the SRS queue length doesn't grow 1518275SEric Cheng * infinitely. 1528275SEric Cheng * 1538275SEric Cheng * NOTE: Also see the block level comment on top of mac_soft_ring.c 1548275SEric Cheng */ 1558275SEric Cheng 1568275SEric Cheng /* 1578275SEric Cheng * mac_latency_optimize 1588275SEric Cheng * 1598275SEric Cheng * Controls whether the poll thread can process the packets inline 1608275SEric Cheng * or let the SRS worker thread do the processing. This applies if 1618275SEric Cheng * the SRS was not being processed. For latency sensitive traffic, 1628275SEric Cheng * this needs to be true to allow inline processing. For throughput 1638275SEric Cheng * under load, this should be false. 1648275SEric Cheng * 1658275SEric Cheng * This (and other similar) tunable should be rolled into a link 1668275SEric Cheng * or flow specific workload hint that can be set using dladm 1678275SEric Cheng * linkprop (instead of multiple such tunables). 1688275SEric Cheng */ 1698275SEric Cheng boolean_t mac_latency_optimize = B_TRUE; 1708275SEric Cheng 1718275SEric Cheng /* 1728275SEric Cheng * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 1738275SEric Cheng * 1748275SEric Cheng * queue a mp or chain in soft ring set and increment the 1758275SEric Cheng * local count (srs_count) for the SRS and the shared counter 1768275SEric Cheng * (srs_poll_pkt_cnt - shared between SRS and its soft rings 1778275SEric Cheng * to track the total unprocessed packets for polling to work 1788275SEric Cheng * correctly). 1798275SEric Cheng * 1808275SEric Cheng * The size (total bytes queued) counters are incremented only 1818275SEric Cheng * if we are doing B/W control. 
1828275SEric Cheng */ 1838275SEric Cheng #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1848275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 1858275SEric Cheng if ((mac_srs)->srs_last != NULL) \ 1868275SEric Cheng (mac_srs)->srs_last->b_next = (head); \ 1878275SEric Cheng else \ 1888275SEric Cheng (mac_srs)->srs_first = (head); \ 1898275SEric Cheng (mac_srs)->srs_last = (tail); \ 1908275SEric Cheng (mac_srs)->srs_count += count; \ 1918275SEric Cheng } 1928275SEric Cheng 1938275SEric Cheng #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1948275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 1958275SEric Cheng \ 1968275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 1978275SEric Cheng srs_rx->sr_poll_pkt_cnt += count; \ 1988275SEric Cheng ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 1998275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2008275SEric Cheng (mac_srs)->srs_size += (sz); \ 2018275SEric Cheng mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 2028275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2038275SEric Cheng mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 2048275SEric Cheng } \ 2058275SEric Cheng } 2068275SEric Cheng 2078275SEric Cheng #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 2088275SEric Cheng mac_srs->srs_state |= SRS_ENQUEUED; \ 2098275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 2108275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2118275SEric Cheng (mac_srs)->srs_size += (sz); \ 2128275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2138275SEric Cheng } \ 2148275SEric Cheng } 2158275SEric Cheng 2168275SEric Cheng /* 2178275SEric Cheng * Turn polling on routines 2188275SEric Cheng */ 2198275SEric Cheng #define MAC_SRS_POLLING_ON(mac_srs) { \ 2208275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2218275SEric Cheng if (((mac_srs)->srs_state & \ 2228275SEric Cheng 
(SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 2238275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2248275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2258275SEric Cheng (mac_srs)->srs_ring); \ 2268275SEric Cheng (mac_srs)->srs_rx.sr_poll_on++; \ 2278275SEric Cheng } \ 2288275SEric Cheng } 2298275SEric Cheng 2308275SEric Cheng #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 2318275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2328275SEric Cheng if (((mac_srs)->srs_state & \ 2338275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 2348275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 2358275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2368275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2378275SEric Cheng (mac_srs)->srs_ring); \ 2388275SEric Cheng (mac_srs)->srs_rx.sr_worker_poll_on++; \ 2398275SEric Cheng } \ 2408275SEric Cheng } 2418275SEric Cheng 2428275SEric Cheng /* 2438275SEric Cheng * MAC_SRS_POLL_RING 2448275SEric Cheng * 2458275SEric Cheng * Signal the SRS poll thread to poll the underlying H/W ring 2468275SEric Cheng * provided it wasn't already polling (SRS_GET_PKTS was set). 2478275SEric Cheng * 2488275SEric Cheng * Poll thread gets to run only from mac_rx_srs_drain() and only 2498275SEric Cheng * if the drain was being done by the worker thread. 
2508275SEric Cheng */ 2518275SEric Cheng #define MAC_SRS_POLL_RING(mac_srs) { \ 2528275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 2538275SEric Cheng \ 2548275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2558275SEric Cheng srs_rx->sr_poll_thr_sig++; \ 2568275SEric Cheng if (((mac_srs)->srs_state & \ 2578275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 2588275SEric Cheng (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 2598275SEric Cheng (mac_srs)->srs_state |= SRS_GET_PKTS; \ 2608275SEric Cheng cv_signal(&(mac_srs)->srs_cv); \ 2618275SEric Cheng } else { \ 2628275SEric Cheng srs_rx->sr_poll_thr_busy++; \ 2638275SEric Cheng } \ 2648275SEric Cheng } 2658275SEric Cheng 2668275SEric Cheng /* 2678275SEric Cheng * MAC_SRS_CHECK_BW_CONTROL 2688275SEric Cheng * 2698275SEric Cheng * Check to see if next tick has started so we can reset the 2708275SEric Cheng * SRS_BW_ENFORCED flag and allow more packets to come in the 2718275SEric Cheng * system. 2728275SEric Cheng */ 2738275SEric Cheng #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 2748275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2758275SEric Cheng ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 2768275SEric Cheng MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 2778275SEric Cheng if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \ 2788275SEric Cheng (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \ 2798275SEric Cheng (mac_srs)->srs_bw->mac_bw_used = 0; \ 2808275SEric Cheng if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 2818275SEric Cheng (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 2828275SEric Cheng } \ 2838275SEric Cheng } 2848275SEric Cheng 2858275SEric Cheng /* 2868275SEric Cheng * MAC_SRS_WORKER_WAKEUP 2878275SEric Cheng * 2888275SEric Cheng * Wake up the SRS worker thread to process the queue as long as 2898275SEric Cheng * no one else is processing the queue. 
If we are optimizing for 2908275SEric Cheng * latency, we wake up the worker thread immediately or else we 2918275SEric Cheng * wait mac_srs_worker_wakeup_ticks before worker thread gets 2928275SEric Cheng * woken up. 2938275SEric Cheng */ 2948275SEric Cheng int mac_srs_worker_wakeup_ticks = 0; 2958275SEric Cheng #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ 2968275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2978275SEric Cheng if (!((mac_srs)->srs_state & SRS_PROC) && \ 2988275SEric Cheng (mac_srs)->srs_tid == NULL) { \ 2999618SRajagopal.Kunhappan@Sun.COM if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \ 3008275SEric Cheng (mac_srs_worker_wakeup_ticks == 0)) \ 3018275SEric Cheng cv_signal(&(mac_srs)->srs_async); \ 3028275SEric Cheng else \ 3038275SEric Cheng (mac_srs)->srs_tid = \ 3048275SEric Cheng timeout(mac_srs_fire, (mac_srs), \ 3058275SEric Cheng mac_srs_worker_wakeup_ticks); \ 3068275SEric Cheng } \ 3078275SEric Cheng } 3088275SEric Cheng 3098275SEric Cheng #define TX_SINGLE_RING_MODE(mac_srs) \ 3108275SEric Cheng ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ 3118275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ 3128275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_BW) 3138275SEric Cheng 3148275SEric Cheng #define TX_BANDWIDTH_MODE(mac_srs) \ 3158275SEric Cheng ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ 3168275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) 3178275SEric Cheng 3188275SEric Cheng #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ 3198275SEric Cheng uint_t hash, indx; \ 3208275SEric Cheng hash = HASH_HINT(hint); \ 3218275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ 3228275SEric Cheng softring = mac_srs->srs_oth_soft_rings[indx]; \ 3238275SEric Cheng (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ 3248275SEric Cheng } 3258275SEric Cheng 3268275SEric Cheng /* 3278275SEric Cheng * MAC_TX_SRS_BLOCK 3288275SEric Cheng * 3298275SEric Cheng * Always called from 
mac_tx_srs_drain() function. SRS_TX_BLOCKED 3308275SEric Cheng * will be set only if srs_tx_woken_up is FALSE. If 3318275SEric Cheng * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived 3328275SEric Cheng * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to 3338275SEric Cheng * attempt to transmit again and not setting SRS_TX_BLOCKED does 3348275SEric Cheng * that. 3358275SEric Cheng */ 3368275SEric Cheng #define MAC_TX_SRS_BLOCK(srs, mp) { \ 3378275SEric Cheng ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ 3388275SEric Cheng if ((srs)->srs_tx.st_woken_up) { \ 3398275SEric Cheng (srs)->srs_tx.st_woken_up = B_FALSE; \ 3408275SEric Cheng } else { \ 3418275SEric Cheng ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ 3428275SEric Cheng (srs)->srs_state |= SRS_TX_BLOCKED; \ 3438275SEric Cheng (srs)->srs_tx.st_blocked_cnt++; \ 3448275SEric Cheng } \ 3458275SEric Cheng } 3468275SEric Cheng 3478275SEric Cheng /* 3488275SEric Cheng * MAC_TX_SRS_TEST_HIWAT 3498275SEric Cheng * 3508275SEric Cheng * Called before queueing a packet onto Tx SRS to test and set 3518275SEric Cheng * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. 3528275SEric Cheng */ 3538275SEric Cheng #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ 3548275SEric Cheng boolean_t enqueue = 1; \ 3558275SEric Cheng \ 3568275SEric Cheng if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ 3578275SEric Cheng /* \ 3588275SEric Cheng * flow-controlled. 
Store srs in cookie so that it \ 3598275SEric Cheng * can be returned as mac_tx_cookie_t to client \ 3608275SEric Cheng */ \ 3618275SEric Cheng (srs)->srs_state |= SRS_TX_HIWAT; \ 3628275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 3638275SEric Cheng (srs)->srs_tx.st_hiwat_cnt++; \ 3648275SEric Cheng if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ 3658275SEric Cheng /* increment freed stats */ \ 3668275SEric Cheng (srs)->srs_tx.st_drop_count += cnt; \ 3678275SEric Cheng /* \ 3688275SEric Cheng * b_prev may be set to the fanout hint \ 3698275SEric Cheng * hence can't use freemsg directly \ 3708275SEric Cheng */ \ 3718275SEric Cheng mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ 3728275SEric Cheng DTRACE_PROBE1(tx_queued_hiwat, \ 3738275SEric Cheng mac_soft_ring_set_t *, srs); \ 3748275SEric Cheng enqueue = 0; \ 3758275SEric Cheng } \ 3768275SEric Cheng } \ 3778275SEric Cheng if (enqueue) \ 3788275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ 3798275SEric Cheng } 3808275SEric Cheng 3818275SEric Cheng /* Some utility macros */ 3828275SEric Cheng #define MAC_SRS_BW_LOCK(srs) \ 3838275SEric Cheng if (!(srs->srs_type & SRST_TX)) \ 3848275SEric Cheng mutex_enter(&srs->srs_bw->mac_bw_lock); 3858275SEric Cheng 3868275SEric Cheng #define MAC_SRS_BW_UNLOCK(srs) \ 3878275SEric Cheng if (!(srs->srs_type & SRST_TX)) \ 3888275SEric Cheng mutex_exit(&srs->srs_bw->mac_bw_lock); 3898275SEric Cheng 3908275SEric Cheng #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ 3918275SEric Cheng mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ 3928275SEric Cheng /* increment freed stats */ \ 3938275SEric Cheng mac_srs->srs_tx.st_drop_count++; \ 3948275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 3958275SEric Cheng } 3968275SEric Cheng 3978275SEric Cheng #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ 3988275SEric Cheng mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ 3998275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 4008275SEric Cheng *ret_mp = mp_chain; \ 
4018275SEric Cheng } 4028275SEric Cheng 4038275SEric Cheng /* 4048275SEric Cheng * Drop the rx packet and advance to the next one in the chain. 4058275SEric Cheng */ 4068275SEric Cheng static void 4078275SEric Cheng mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp) 4088275SEric Cheng { 4098275SEric Cheng mac_srs_rx_t *srs_rx = &srs->srs_rx; 4108275SEric Cheng 4118275SEric Cheng ASSERT(mp->b_next == NULL); 4128275SEric Cheng mutex_enter(&srs->srs_lock); 4138275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1); 4148275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp)); 4158275SEric Cheng mutex_exit(&srs->srs_lock); 4168275SEric Cheng 4178275SEric Cheng srs_rx->sr_drop_count++; 4188275SEric Cheng freemsg(mp); 4198275SEric Cheng } 4208275SEric Cheng 4218275SEric Cheng /* DATAPATH RUNTIME ROUTINES */ 4228275SEric Cheng 4238275SEric Cheng /* 4248275SEric Cheng * mac_srs_fire 4258275SEric Cheng * 4268275SEric Cheng * Timer callback routine for waking up the SRS worker thread. 4278275SEric Cheng */ 4288275SEric Cheng static void 4298275SEric Cheng mac_srs_fire(void *arg) 4308275SEric Cheng { 4318275SEric Cheng mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg; 4328275SEric Cheng 4338275SEric Cheng mutex_enter(&mac_srs->srs_lock); 4348275SEric Cheng if (mac_srs->srs_tid == 0) { 4358275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4368275SEric Cheng return; 4378275SEric Cheng } 4388275SEric Cheng 4398275SEric Cheng mac_srs->srs_tid = 0; 4408275SEric Cheng if (!(mac_srs->srs_state & SRS_PROC)) 4418275SEric Cheng cv_signal(&mac_srs->srs_async); 4428275SEric Cheng 4438275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4448275SEric Cheng } 4458275SEric Cheng 4468275SEric Cheng /* 4478275SEric Cheng * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack, 4488275SEric Cheng * and it is used on the TX path. 
4498275SEric Cheng */ 4508275SEric Cheng #define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16)) 4518275SEric Cheng 4528275SEric Cheng /* 4538275SEric Cheng * hash based on the src address and the port information. 4548275SEric Cheng */ 4558275SEric Cheng #define HASH_ADDR(src, ports) \ 4568275SEric Cheng (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 4578275SEric Cheng ((ports) >> 8) ^ (ports)) 4588275SEric Cheng 4598275SEric Cheng #define COMPUTE_INDEX(key, sz) (key % sz) 4608275SEric Cheng 4618275SEric Cheng #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 4628275SEric Cheng if ((tail) != NULL) { \ 4638275SEric Cheng ASSERT((tail)->b_next == NULL); \ 4648275SEric Cheng (tail)->b_next = (mp); \ 4658275SEric Cheng } else { \ 4668275SEric Cheng ASSERT((head) == NULL); \ 4678275SEric Cheng (head) = (mp); \ 4688275SEric Cheng } \ 4698275SEric Cheng (tail) = (mp); \ 4708275SEric Cheng (cnt)++; \ 4718275SEric Cheng if ((bw_ctl)) \ 4728275SEric Cheng (sz) += (sz0); \ 4738275SEric Cheng } 4748275SEric Cheng 4758275SEric Cheng #define MAC_FANOUT_DEFAULT 0 4768275SEric Cheng #define MAC_FANOUT_RND_ROBIN 1 4778275SEric Cheng int mac_fanout_type = MAC_FANOUT_DEFAULT; 4788275SEric Cheng 4798275SEric Cheng #define MAX_SR_TYPES 3 4808275SEric Cheng /* fanout types for port based hashing */ 4818275SEric Cheng enum pkt_type { 4828275SEric Cheng V4_TCP = 0, 4838275SEric Cheng V4_UDP, 4848275SEric Cheng OTH, 4858275SEric Cheng UNDEF 4868275SEric Cheng }; 4878275SEric Cheng 4888275SEric Cheng /* 4898275SEric Cheng * In general we do port based hashing to spread traffic over different 4908275SEric Cheng * softrings. The below tunable allows to override that behavior. Setting it 4918275SEric Cheng * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior 4928275SEric Cheng * is also the applicable to ipv6 packets carrying multiple optional headers 4938275SEric Cheng * and other uncommon packet types. 
4948275SEric Cheng */ 4958275SEric Cheng boolean_t mac_src_ipv6_fanout = B_FALSE; 4968275SEric Cheng 4978275SEric Cheng /* 4988275SEric Cheng * Pair of local and remote ports in the transport header 4998275SEric Cheng */ 5008275SEric Cheng #define PORTS_SIZE 4 5018275SEric Cheng 5028275SEric Cheng /* 5038275SEric Cheng * mac_rx_srs_proto_fanout 5048275SEric Cheng * 5058275SEric Cheng * This routine delivers packets destined to an SRS into one of the 5068275SEric Cheng * protocol soft rings. 5078275SEric Cheng * 5088275SEric Cheng * Given a chain of packets we need to split it up into multiple sub chains 5098275SEric Cheng * destined into TCP, UDP or OTH soft ring. Instead of entering 5108275SEric Cheng * the soft ring one packet at a time, we want to enter it in the form of a 5118275SEric Cheng * chain otherwise we get this start/stop behaviour where the worker thread 5128275SEric Cheng * goes to sleep and then next packets comes in forcing it to wake up etc. 5138275SEric Cheng */ 5148275SEric Cheng static void 5158275SEric Cheng mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 5168275SEric Cheng { 5178275SEric Cheng struct ether_header *ehp; 5188833SVenu.Iyer@Sun.COM struct ether_vlan_header *evhp; 5198833SVenu.Iyer@Sun.COM uint32_t sap; 5208275SEric Cheng ipha_t *ipha; 5218833SVenu.Iyer@Sun.COM uint8_t *dstaddr; 5228833SVenu.Iyer@Sun.COM size_t hdrsize; 5238275SEric Cheng mblk_t *mp; 5248275SEric Cheng mblk_t *headmp[MAX_SR_TYPES]; 5258275SEric Cheng mblk_t *tailmp[MAX_SR_TYPES]; 5268275SEric Cheng int cnt[MAX_SR_TYPES]; 5278275SEric Cheng size_t sz[MAX_SR_TYPES]; 5288275SEric Cheng size_t sz1; 5298833SVenu.Iyer@Sun.COM boolean_t bw_ctl; 5308275SEric Cheng boolean_t hw_classified; 5318833SVenu.Iyer@Sun.COM boolean_t dls_bypass; 5328833SVenu.Iyer@Sun.COM boolean_t is_ether; 5338833SVenu.Iyer@Sun.COM boolean_t is_unicast; 5348833SVenu.Iyer@Sun.COM enum pkt_type type; 5358275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 
5368833SVenu.Iyer@Sun.COM 5378833SVenu.Iyer@Sun.COM is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 5388833SVenu.Iyer@Sun.COM bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 5398275SEric Cheng 5408275SEric Cheng /* 5418275SEric Cheng * If we don't have a Rx ring, S/W classification would have done 5428275SEric Cheng * its job and its a packet meant for us. If we were polling on 5438275SEric Cheng * the default ring (i.e. there was a ring assigned to this SRS), 5448275SEric Cheng * then we need to make sure that the mac address really belongs 5458275SEric Cheng * to us. 5468275SEric Cheng */ 5478275SEric Cheng hw_classified = mac_srs->srs_ring != NULL && 5488275SEric Cheng mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 5498275SEric Cheng 5508275SEric Cheng /* 5518275SEric Cheng * Special clients (eg. VLAN, non ether, etc) need DLS 5528275SEric Cheng * processing in the Rx path. SRST_DLS_BYPASS will be clear for 553*11021SEric.Cheng@Sun.COM * such SRSs. Another way of disabling bypass is to set the 554*11021SEric.Cheng@Sun.COM * MCIS_RX_BYPASS_DISABLE flag. 5558275SEric Cheng */ 556*11021SEric.Cheng@Sun.COM dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 557*11021SEric.Cheng@Sun.COM ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 5588275SEric Cheng 5598275SEric Cheng bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 5608275SEric Cheng bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 5618275SEric Cheng bzero(cnt, MAX_SR_TYPES * sizeof (int)); 5628275SEric Cheng bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 5638275SEric Cheng 5648275SEric Cheng /* 5658275SEric Cheng * We got a chain from SRS that we need to send to the soft rings. 5668275SEric Cheng * Since squeues for TCP & IPv4 sap poll their soft rings (for 5678275SEric Cheng * performance reasons), we need to separate out v4_tcp, v4_udp 5688275SEric Cheng * and the rest goes in other. 
5698275SEric Cheng */ 5708275SEric Cheng while (head != NULL) { 5718275SEric Cheng mp = head; 5728275SEric Cheng head = head->b_next; 5738275SEric Cheng mp->b_next = NULL; 5748275SEric Cheng 5758275SEric Cheng type = OTH; 5768833SVenu.Iyer@Sun.COM sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 5778833SVenu.Iyer@Sun.COM 5788833SVenu.Iyer@Sun.COM if (is_ether) { 5798833SVenu.Iyer@Sun.COM /* 5808833SVenu.Iyer@Sun.COM * At this point we can be sure the packet at least 5818833SVenu.Iyer@Sun.COM * has an ether header. 5828833SVenu.Iyer@Sun.COM */ 5838833SVenu.Iyer@Sun.COM if (sz1 < sizeof (struct ether_header)) { 5848833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 5858833SVenu.Iyer@Sun.COM continue; 5868833SVenu.Iyer@Sun.COM } 5878275SEric Cheng ehp = (struct ether_header *)mp->b_rptr; 5888275SEric Cheng 5898275SEric Cheng /* 5908833SVenu.Iyer@Sun.COM * Determine if this is a VLAN or non-VLAN packet. 5918275SEric Cheng */ 5928833SVenu.Iyer@Sun.COM if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 5938833SVenu.Iyer@Sun.COM evhp = (struct ether_vlan_header *)mp->b_rptr; 5948833SVenu.Iyer@Sun.COM sap = ntohs(evhp->ether_type); 5958833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_vlan_header); 5968275SEric Cheng /* 5978833SVenu.Iyer@Sun.COM * Check if the VID of the packet, if any, 5988833SVenu.Iyer@Sun.COM * belongs to this client. 
5998275SEric Cheng */ 6008275SEric Cheng if (!mac_client_check_flow_vid(mcip, 6018275SEric Cheng VLAN_ID(ntohs(evhp->ether_tci)))) { 6028275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 6038275SEric Cheng continue; 6048275SEric Cheng } 6058833SVenu.Iyer@Sun.COM } else { 6068833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_header); 6078275SEric Cheng } 6088833SVenu.Iyer@Sun.COM is_unicast = 6098833SVenu.Iyer@Sun.COM ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 6108833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)&ehp->ether_dhost; 6118833SVenu.Iyer@Sun.COM } else { 6128833SVenu.Iyer@Sun.COM mac_header_info_t mhi; 6138833SVenu.Iyer@Sun.COM 6148833SVenu.Iyer@Sun.COM if (mac_header_info((mac_handle_t)mcip->mci_mip, 6158833SVenu.Iyer@Sun.COM mp, &mhi) != 0) { 6168833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 6178833SVenu.Iyer@Sun.COM continue; 6188833SVenu.Iyer@Sun.COM } 6198833SVenu.Iyer@Sun.COM hdrsize = mhi.mhi_hdrsize; 6208833SVenu.Iyer@Sun.COM sap = mhi.mhi_bindsap; 6218833SVenu.Iyer@Sun.COM is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 6228833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)mhi.mhi_daddr; 6238833SVenu.Iyer@Sun.COM } 6248833SVenu.Iyer@Sun.COM 6258833SVenu.Iyer@Sun.COM if (!dls_bypass) { 6268275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 6278275SEric Cheng cnt[type], bw_ctl, sz[type], sz1, mp); 6288275SEric Cheng continue; 6298275SEric Cheng } 6308275SEric Cheng 6318833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IP) { 6328275SEric Cheng /* 6338275SEric Cheng * If we are H/W classified, but we have promisc 6348275SEric Cheng * on, then we need to check for the unicast address. 
6358275SEric Cheng */ 6368275SEric Cheng if (hw_classified && mcip->mci_promisc_list != NULL) { 6378275SEric Cheng mac_address_t *map; 6388275SEric Cheng 6398275SEric Cheng rw_enter(&mcip->mci_rw_lock, RW_READER); 6408275SEric Cheng map = mcip->mci_unicast; 6418833SVenu.Iyer@Sun.COM if (bcmp(dstaddr, map->ma_addr, 6428275SEric Cheng map->ma_len) == 0) 6438275SEric Cheng type = UNDEF; 6448275SEric Cheng rw_exit(&mcip->mci_rw_lock); 6458833SVenu.Iyer@Sun.COM } else if (is_unicast) { 6468275SEric Cheng type = UNDEF; 6478275SEric Cheng } 6488275SEric Cheng } 6498275SEric Cheng 6508275SEric Cheng /* 6518275SEric Cheng * This needs to become a contract with the driver for 6528275SEric Cheng * the fast path. 6538275SEric Cheng * 6548275SEric Cheng * In the normal case the packet will have at least the L2 6558275SEric Cheng * header and the IP + Transport header in the same mblk. 6568275SEric Cheng * This is usually the case when the NIC driver sends up 6578275SEric Cheng * the packet. This is also true when the stack generates 6588275SEric Cheng * a packet that is looped back and when the stack uses the 6598275SEric Cheng * fastpath mechanism. The normal case is optimized for 6608275SEric Cheng * performance and may bypass DLS. All other cases go through 6618275SEric Cheng * the 'OTH' type path without DLS bypass. 6628275SEric Cheng */ 6638275SEric Cheng 6648833SVenu.Iyer@Sun.COM ipha = (ipha_t *)(mp->b_rptr + hdrsize); 6658275SEric Cheng if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 6668275SEric Cheng type = OTH; 6678275SEric Cheng 6688275SEric Cheng if (type == OTH) { 6698275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 6708275SEric Cheng cnt[type], bw_ctl, sz[type], sz1, mp); 6718275SEric Cheng continue; 6728275SEric Cheng } 6738275SEric Cheng 6748275SEric Cheng ASSERT(type == UNDEF); 6758275SEric Cheng /* 6768275SEric Cheng * We look for at least 4 bytes past the IP header to get 6778275SEric Cheng * the port information. 
If we get an IP fragment, we don't 6788275SEric Cheng * have the port information, and we use just the protocol 6798275SEric Cheng * information. 6808275SEric Cheng */ 6818275SEric Cheng switch (ipha->ipha_protocol) { 6828275SEric Cheng case IPPROTO_TCP: 6838275SEric Cheng type = V4_TCP; 6848833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 6858275SEric Cheng break; 6868275SEric Cheng case IPPROTO_UDP: 6878275SEric Cheng type = V4_UDP; 6888833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 6898275SEric Cheng break; 6908275SEric Cheng default: 6918275SEric Cheng type = OTH; 6928275SEric Cheng break; 6938275SEric Cheng } 6948275SEric Cheng 6958275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 6968275SEric Cheng bw_ctl, sz[type], sz1, mp); 6978275SEric Cheng } 6988275SEric Cheng 6998275SEric Cheng for (type = V4_TCP; type < UNDEF; type++) { 7008275SEric Cheng if (headmp[type] != NULL) { 7018833SVenu.Iyer@Sun.COM mac_soft_ring_t *softring; 7028833SVenu.Iyer@Sun.COM 7038275SEric Cheng ASSERT(tailmp[type]->b_next == NULL); 7048275SEric Cheng switch (type) { 7058275SEric Cheng case V4_TCP: 7068275SEric Cheng softring = mac_srs->srs_tcp_soft_rings[0]; 7078275SEric Cheng break; 7088275SEric Cheng case V4_UDP: 7098275SEric Cheng softring = mac_srs->srs_udp_soft_rings[0]; 7108275SEric Cheng break; 7118275SEric Cheng case OTH: 7128275SEric Cheng softring = mac_srs->srs_oth_soft_rings[0]; 7138275SEric Cheng } 7148833SVenu.Iyer@Sun.COM mac_rx_soft_ring_process(mcip, softring, 7158275SEric Cheng headmp[type], tailmp[type], cnt[type], sz[type]); 7168275SEric Cheng } 7178275SEric Cheng } 7188275SEric Cheng } 7198275SEric Cheng 7208275SEric Cheng int fanout_unalligned = 0; 7218275SEric Cheng 7228275SEric Cheng /* 7238275SEric Cheng * mac_rx_srs_long_fanout 7248275SEric Cheng * 7258275SEric Cheng * The fanout routine for IPv6 7268275SEric Cheng */ 7278275SEric Cheng static int 7288275SEric Cheng mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, 
7298833SVenu.Iyer@Sun.COM uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) 7308275SEric Cheng { 7318275SEric Cheng ip6_t *ip6h; 7328275SEric Cheng uint8_t *whereptr; 7338275SEric Cheng uint_t hash; 7348275SEric Cheng uint16_t remlen; 7358275SEric Cheng uint8_t nexthdr; 7368275SEric Cheng uint16_t hdr_len; 7378275SEric Cheng 7388833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IPV6) { 7398275SEric Cheng boolean_t modifiable = B_TRUE; 7408275SEric Cheng 7418833SVenu.Iyer@Sun.COM ASSERT(MBLKL(mp) >= hdrsize); 7428833SVenu.Iyer@Sun.COM 7438833SVenu.Iyer@Sun.COM ip6h = (ip6_t *)(mp->b_rptr + hdrsize); 7448275SEric Cheng if ((unsigned char *)ip6h == mp->b_wptr) { 7458275SEric Cheng /* 7468833SVenu.Iyer@Sun.COM * The first mblk_t only includes the mac header. 7478275SEric Cheng * Note that it is safe to change the mp pointer here, 7488275SEric Cheng * as the subsequent operation does not assume mp 7498833SVenu.Iyer@Sun.COM * points to the start of the mac header. 7508275SEric Cheng */ 7518275SEric Cheng mp = mp->b_cont; 7528275SEric Cheng 7538275SEric Cheng /* 7548275SEric Cheng * Make sure ip6h holds the full ip6_t structure. 
7558275SEric Cheng */ 7568275SEric Cheng if (mp == NULL) 7578275SEric Cheng return (-1); 7588275SEric Cheng 7598275SEric Cheng if (MBLKL(mp) < IPV6_HDR_LEN) { 7608275SEric Cheng modifiable = (DB_REF(mp) == 1); 7618275SEric Cheng 7628275SEric Cheng if (modifiable && 7638275SEric Cheng !pullupmsg(mp, IPV6_HDR_LEN)) { 7648275SEric Cheng return (-1); 7658275SEric Cheng } 7668275SEric Cheng } 7678275SEric Cheng 7688275SEric Cheng ip6h = (ip6_t *)mp->b_rptr; 7698275SEric Cheng } 7708275SEric Cheng 7718275SEric Cheng if (!modifiable || !(OK_32PTR((char *)ip6h)) || 7728275SEric Cheng ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 7738275SEric Cheng /* 7748275SEric Cheng * If either ip6h is not alligned, or ip6h does not 7758275SEric Cheng * hold the complete ip6_t structure (a pullupmsg() 7768275SEric Cheng * is not an option since it would result in an 7778275SEric Cheng * unalligned ip6h), fanout to the default ring. Note 7788275SEric Cheng * that this may cause packets reordering. 7798275SEric Cheng */ 7808275SEric Cheng *indx = 0; 7818275SEric Cheng *type = OTH; 7828275SEric Cheng fanout_unalligned++; 7838275SEric Cheng return (0); 7848275SEric Cheng } 7858275SEric Cheng 7868275SEric Cheng remlen = ntohs(ip6h->ip6_plen); 7878275SEric Cheng nexthdr = ip6h->ip6_nxt; 7888275SEric Cheng 7898275SEric Cheng if (remlen < MIN_EHDR_LEN) 7908275SEric Cheng return (-1); 7918275SEric Cheng /* 7928275SEric Cheng * Do src based fanout if below tunable is set to B_TRUE or 7938275SEric Cheng * when mac_ip_hdr_length_v6() fails because of malformed 7948275SEric Cheng * packets or because mblk's need to be concatenated using 7958275SEric Cheng * pullupmsg(). 
7968275SEric Cheng */ 7978275SEric Cheng if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 7988275SEric Cheng &hdr_len, &nexthdr)) { 7998275SEric Cheng goto src_based_fanout; 8008275SEric Cheng } 8018275SEric Cheng whereptr = (uint8_t *)ip6h + hdr_len; 8028275SEric Cheng 8038275SEric Cheng /* If the transport is one of below, we do port based fanout */ 8048275SEric Cheng switch (nexthdr) { 8058275SEric Cheng case IPPROTO_TCP: 8068275SEric Cheng case IPPROTO_UDP: 8078275SEric Cheng case IPPROTO_SCTP: 8088275SEric Cheng case IPPROTO_ESP: 8098275SEric Cheng /* 8108275SEric Cheng * If the ports in the transport header is not part of 8118275SEric Cheng * the mblk, do src_based_fanout, instead of calling 8128275SEric Cheng * pullupmsg(). 8138275SEric Cheng */ 8148275SEric Cheng if (mp->b_cont != NULL && 8158275SEric Cheng whereptr + PORTS_SIZE > mp->b_wptr) { 8168275SEric Cheng goto src_based_fanout; 8178275SEric Cheng } 8188275SEric Cheng break; 8198275SEric Cheng default: 8208275SEric Cheng break; 8218275SEric Cheng } 8228275SEric Cheng 8238275SEric Cheng switch (nexthdr) { 8248275SEric Cheng case IPPROTO_TCP: 8258275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 8268275SEric Cheng *(uint32_t *)whereptr); 8278275SEric Cheng *indx = COMPUTE_INDEX(hash, 8288275SEric Cheng mac_srs->srs_tcp_ring_count); 8298275SEric Cheng *type = OTH; 8308275SEric Cheng break; 8318275SEric Cheng 8328275SEric Cheng case IPPROTO_UDP: 8338275SEric Cheng case IPPROTO_SCTP: 8348275SEric Cheng case IPPROTO_ESP: 8358275SEric Cheng if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 8368275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 8378275SEric Cheng *(uint32_t *)whereptr); 8388275SEric Cheng *indx = COMPUTE_INDEX(hash, 8398275SEric Cheng mac_srs->srs_udp_ring_count); 8408275SEric Cheng } else { 8418275SEric Cheng *indx = mac_srs->srs_ind % 8428275SEric Cheng mac_srs->srs_udp_ring_count; 8438275SEric Cheng mac_srs->srs_ind++; 8448275SEric Cheng } 8458275SEric Cheng 
*type = OTH; 8468275SEric Cheng break; 8478275SEric Cheng 8488275SEric Cheng /* For all other protocol, do source based fanout */ 8498275SEric Cheng default: 8508275SEric Cheng goto src_based_fanout; 8518275SEric Cheng } 8528275SEric Cheng } else { 8538275SEric Cheng *indx = 0; 8548275SEric Cheng *type = OTH; 8558275SEric Cheng } 8568275SEric Cheng return (0); 8578275SEric Cheng 8588275SEric Cheng src_based_fanout: 8598275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 8608275SEric Cheng *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 8618275SEric Cheng *type = OTH; 8628275SEric Cheng return (0); 8638275SEric Cheng } 8648275SEric Cheng 8658275SEric Cheng /* 8668275SEric Cheng * mac_rx_srs_fanout 8678275SEric Cheng * 8688275SEric Cheng * This routine delivers packets destined to an SRS into a soft ring member 8698275SEric Cheng * of the set. 8708275SEric Cheng * 8718275SEric Cheng * Given a chain of packets we need to split it up into multiple sub chains 8728275SEric Cheng * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 8738275SEric Cheng * the soft ring one packet at a time, we want to enter it in the form of a 8748275SEric Cheng * chain otherwise we get this start/stop behaviour where the worker thread 8758275SEric Cheng * goes to sleep and then next packets comes in forcing it to wake up etc. 8768275SEric Cheng * 8778275SEric Cheng * Note: 8788275SEric Cheng * Since we know what is the maximum fanout possible, we create a 2D array 8798275SEric Cheng * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 8808275SEric Cheng * variables so that we can enter the softrings with chain. We need the 8818275SEric Cheng * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 8828275SEric Cheng * for each packet would be expensive). 
If we ever want to have the 8838275SEric Cheng * ability to have unlimited fanout, we should probably declare a head, 8848275SEric Cheng * tail, cnt, sz with each soft ring (a data struct which contains a softring 8858275SEric Cheng * along with these members) and create an array of this uber struct so we 8868275SEric Cheng * don't have to do kmem_alloc. 8878275SEric Cheng */ 8888275SEric Cheng int fanout_oth1 = 0; 8898275SEric Cheng int fanout_oth2 = 0; 8908275SEric Cheng int fanout_oth3 = 0; 8918275SEric Cheng int fanout_oth4 = 0; 8928275SEric Cheng int fanout_oth5 = 0; 8938275SEric Cheng 8948275SEric Cheng static void 8958275SEric Cheng mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 8968275SEric Cheng { 8978275SEric Cheng struct ether_header *ehp; 8988833SVenu.Iyer@Sun.COM struct ether_vlan_header *evhp; 8998833SVenu.Iyer@Sun.COM uint32_t sap; 9008275SEric Cheng ipha_t *ipha; 9018833SVenu.Iyer@Sun.COM uint8_t *dstaddr; 9028275SEric Cheng uint_t indx; 9038833SVenu.Iyer@Sun.COM size_t ports_offset; 9048833SVenu.Iyer@Sun.COM size_t ipha_len; 9058833SVenu.Iyer@Sun.COM size_t hdrsize; 9068275SEric Cheng uint_t hash; 9078275SEric Cheng mblk_t *mp; 9088275SEric Cheng mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 9098275SEric Cheng mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 9108275SEric Cheng int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; 9118275SEric Cheng size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; 9128275SEric Cheng size_t sz1; 9138833SVenu.Iyer@Sun.COM boolean_t bw_ctl; 9148275SEric Cheng boolean_t hw_classified; 9158833SVenu.Iyer@Sun.COM boolean_t dls_bypass; 9168833SVenu.Iyer@Sun.COM boolean_t is_ether; 9178833SVenu.Iyer@Sun.COM boolean_t is_unicast; 9188275SEric Cheng int fanout_cnt; 9198833SVenu.Iyer@Sun.COM enum pkt_type type; 9208275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 9218833SVenu.Iyer@Sun.COM 9228833SVenu.Iyer@Sun.COM is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 9238833SVenu.Iyer@Sun.COM bw_ctl = ((mac_srs->srs_type 
& SRST_BW_CONTROL) != 0); 9248275SEric Cheng 9258275SEric Cheng /* 9268275SEric Cheng * If we don't have a Rx ring, S/W classification would have done 9278275SEric Cheng * its job and its a packet meant for us. If we were polling on 9288275SEric Cheng * the default ring (i.e. there was a ring assigned to this SRS), 9298275SEric Cheng * then we need to make sure that the mac address really belongs 9308275SEric Cheng * to us. 9318275SEric Cheng */ 9328275SEric Cheng hw_classified = mac_srs->srs_ring != NULL && 9338275SEric Cheng mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 9348275SEric Cheng 9358275SEric Cheng /* 9368275SEric Cheng * Special clients (eg. VLAN, non ether, etc) need DLS 9378275SEric Cheng * processing in the Rx path. SRST_DLS_BYPASS will be clear for 938*11021SEric.Cheng@Sun.COM * such SRSs. Another way of disabling bypass is to set the 939*11021SEric.Cheng@Sun.COM * MCIS_RX_BYPASS_DISABLE flag. 9408275SEric Cheng */ 941*11021SEric.Cheng@Sun.COM dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 942*11021SEric.Cheng@Sun.COM ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 9438275SEric Cheng 9448275SEric Cheng /* 9458275SEric Cheng * Since the softrings are never destroyed and we always 9468275SEric Cheng * create equal number of softrings for TCP, UDP and rest, 9478275SEric Cheng * its OK to check one of them for count and use it without 9488275SEric Cheng * any lock. In future, if soft rings get destroyed because 9498275SEric Cheng * of reduction in fanout, we will need to ensure that happens 9508275SEric Cheng * behind the SRS_PROC. 
9518275SEric Cheng */ 9528275SEric Cheng fanout_cnt = mac_srs->srs_tcp_ring_count; 9538275SEric Cheng 9548275SEric Cheng bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 9558275SEric Cheng bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 9568275SEric Cheng bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); 9578275SEric Cheng bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); 9588275SEric Cheng 9598275SEric Cheng /* 9608275SEric Cheng * We got a chain from SRS that we need to send to the soft rings. 9618275SEric Cheng * Since squeues for TCP & IPv4 sap poll their soft rings (for 9628275SEric Cheng * performance reasons), we need to separate out v4_tcp, v4_udp 9638275SEric Cheng * and the rest goes in other. 9648275SEric Cheng */ 9658275SEric Cheng while (head != NULL) { 9668275SEric Cheng mp = head; 9678275SEric Cheng head = head->b_next; 9688275SEric Cheng mp->b_next = NULL; 9698275SEric Cheng 9708275SEric Cheng type = OTH; 9718833SVenu.Iyer@Sun.COM sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 9728833SVenu.Iyer@Sun.COM 9738833SVenu.Iyer@Sun.COM if (is_ether) { 9748833SVenu.Iyer@Sun.COM /* 9758833SVenu.Iyer@Sun.COM * At this point we can be sure the packet at least 9768833SVenu.Iyer@Sun.COM * has an ether header. 9778833SVenu.Iyer@Sun.COM */ 9788833SVenu.Iyer@Sun.COM if (sz1 < sizeof (struct ether_header)) { 9798833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 9808833SVenu.Iyer@Sun.COM continue; 9818833SVenu.Iyer@Sun.COM } 9828833SVenu.Iyer@Sun.COM ehp = (struct ether_header *)mp->b_rptr; 9838833SVenu.Iyer@Sun.COM 9848833SVenu.Iyer@Sun.COM /* 9858833SVenu.Iyer@Sun.COM * Determine if this is a VLAN or non-VLAN packet. 
9868833SVenu.Iyer@Sun.COM */ 9878833SVenu.Iyer@Sun.COM if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 9888833SVenu.Iyer@Sun.COM evhp = (struct ether_vlan_header *)mp->b_rptr; 9898833SVenu.Iyer@Sun.COM sap = ntohs(evhp->ether_type); 9908833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_vlan_header); 9918275SEric Cheng /* 9928833SVenu.Iyer@Sun.COM * Check if the VID of the packet, if any, 9938833SVenu.Iyer@Sun.COM * belongs to this client. 9948275SEric Cheng */ 9958833SVenu.Iyer@Sun.COM if (!mac_client_check_flow_vid(mcip, 9968833SVenu.Iyer@Sun.COM VLAN_ID(ntohs(evhp->ether_tci)))) { 9978275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 9988275SEric Cheng continue; 9998275SEric Cheng } 10008833SVenu.Iyer@Sun.COM } else { 10018833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_header); 10028833SVenu.Iyer@Sun.COM } 10038833SVenu.Iyer@Sun.COM is_unicast = 10048833SVenu.Iyer@Sun.COM ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 10058833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)&ehp->ether_dhost; 10068833SVenu.Iyer@Sun.COM } else { 10078833SVenu.Iyer@Sun.COM mac_header_info_t mhi; 10088833SVenu.Iyer@Sun.COM 10098833SVenu.Iyer@Sun.COM if (mac_header_info((mac_handle_t)mcip->mci_mip, 10108833SVenu.Iyer@Sun.COM mp, &mhi) != 0) { 10118833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 10128833SVenu.Iyer@Sun.COM continue; 10138833SVenu.Iyer@Sun.COM } 10148833SVenu.Iyer@Sun.COM hdrsize = mhi.mhi_hdrsize; 10158833SVenu.Iyer@Sun.COM sap = mhi.mhi_bindsap; 10168833SVenu.Iyer@Sun.COM is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 10178833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)mhi.mhi_daddr; 10188833SVenu.Iyer@Sun.COM } 10198833SVenu.Iyer@Sun.COM 10208833SVenu.Iyer@Sun.COM if (!dls_bypass) { 10218833SVenu.Iyer@Sun.COM if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 10228833SVenu.Iyer@Sun.COM hdrsize, &type, &indx) == -1) { 10238833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 10248833SVenu.Iyer@Sun.COM continue; 10258275SEric Cheng } 10268275SEric Cheng 
10278275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], 10288275SEric Cheng tailmp[type][indx], cnt[type][indx], bw_ctl, 10298275SEric Cheng sz[type][indx], sz1, mp); 10308275SEric Cheng continue; 10318275SEric Cheng } 10328275SEric Cheng 10338275SEric Cheng 10348275SEric Cheng /* 10358275SEric Cheng * If we are using the default Rx ring where H/W or S/W 10368275SEric Cheng * classification has not happened, we need to verify if 10378275SEric Cheng * this unicast packet really belongs to us. 10388275SEric Cheng */ 10398833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IP) { 10408275SEric Cheng /* 10418275SEric Cheng * If we are H/W classified, but we have promisc 10428275SEric Cheng * on, then we need to check for the unicast address. 10438275SEric Cheng */ 10448275SEric Cheng if (hw_classified && mcip->mci_promisc_list != NULL) { 10458275SEric Cheng mac_address_t *map; 10468275SEric Cheng 10478275SEric Cheng rw_enter(&mcip->mci_rw_lock, RW_READER); 10488275SEric Cheng map = mcip->mci_unicast; 10498833SVenu.Iyer@Sun.COM if (bcmp(dstaddr, map->ma_addr, 10508275SEric Cheng map->ma_len) == 0) 10518275SEric Cheng type = UNDEF; 10528275SEric Cheng rw_exit(&mcip->mci_rw_lock); 10538833SVenu.Iyer@Sun.COM } else if (is_unicast) { 10548275SEric Cheng type = UNDEF; 10558275SEric Cheng } 10568275SEric Cheng } 10578275SEric Cheng 10588275SEric Cheng /* 10598275SEric Cheng * This needs to become a contract with the driver for 10608275SEric Cheng * the fast path. 
10618275SEric Cheng */ 10628275SEric Cheng 10638833SVenu.Iyer@Sun.COM ipha = (ipha_t *)(mp->b_rptr + hdrsize); 10648275SEric Cheng if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { 10658275SEric Cheng type = OTH; 10668275SEric Cheng fanout_oth1++; 10678275SEric Cheng } 10688275SEric Cheng 10698275SEric Cheng if (type != OTH) { 10708833SVenu.Iyer@Sun.COM uint16_t frag_offset_flags; 10718833SVenu.Iyer@Sun.COM 10728275SEric Cheng switch (ipha->ipha_protocol) { 10738275SEric Cheng case IPPROTO_TCP: 10748275SEric Cheng case IPPROTO_UDP: 10758275SEric Cheng case IPPROTO_SCTP: 10768275SEric Cheng case IPPROTO_ESP: 10778275SEric Cheng ipha_len = IPH_HDR_LENGTH(ipha); 10788275SEric Cheng if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > 10798275SEric Cheng mp->b_wptr) { 10808275SEric Cheng type = OTH; 10818275SEric Cheng break; 10828275SEric Cheng } 10838275SEric Cheng frag_offset_flags = 10848275SEric Cheng ntohs(ipha->ipha_fragment_offset_and_flags); 10858275SEric Cheng if ((frag_offset_flags & 10868275SEric Cheng (IPH_MF | IPH_OFFSET)) != 0) { 10878275SEric Cheng type = OTH; 10888275SEric Cheng fanout_oth3++; 10898275SEric Cheng break; 10908275SEric Cheng } 10918833SVenu.Iyer@Sun.COM ports_offset = hdrsize + ipha_len; 10928275SEric Cheng break; 10938275SEric Cheng default: 10948275SEric Cheng type = OTH; 10958275SEric Cheng fanout_oth4++; 10968275SEric Cheng break; 10978275SEric Cheng } 10988275SEric Cheng } 10998275SEric Cheng 11008275SEric Cheng if (type == OTH) { 11018833SVenu.Iyer@Sun.COM if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 11028833SVenu.Iyer@Sun.COM hdrsize, &type, &indx) == -1) { 11038275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 11048275SEric Cheng continue; 11058275SEric Cheng } 11068275SEric Cheng 11078275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], 11088275SEric Cheng tailmp[type][indx], cnt[type][indx], bw_ctl, 11098275SEric Cheng sz[type][indx], sz1, mp); 11108275SEric Cheng continue; 11118275SEric Cheng } 11128275SEric Cheng 11138275SEric 
Cheng ASSERT(type == UNDEF); 11148275SEric Cheng 11158275SEric Cheng /* 11168275SEric Cheng * XXX-Sunay: We should hold srs_lock since ring_count 11178275SEric Cheng * below can change. But if we are always called from 11188275SEric Cheng * mac_rx_srs_drain and SRS_PROC is set, then we can 11198275SEric Cheng * enforce that ring_count can't be changed i.e. 11208275SEric Cheng * to change fanout type or ring count, the calling 11218275SEric Cheng * thread needs to be behind SRS_PROC. 11228275SEric Cheng */ 11238275SEric Cheng switch (ipha->ipha_protocol) { 11248275SEric Cheng case IPPROTO_TCP: 11258275SEric Cheng /* 11268275SEric Cheng * Note that for ESP, we fanout on SPI and it is at the 11278275SEric Cheng * same offset as the 2x16-bit ports. So it is clumped 11288275SEric Cheng * along with TCP, UDP and SCTP. 11298275SEric Cheng */ 11308275SEric Cheng hash = HASH_ADDR(ipha->ipha_src, 11318275SEric Cheng *(uint32_t *)(mp->b_rptr + ports_offset)); 11328275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 11338275SEric Cheng type = V4_TCP; 11348833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 11358275SEric Cheng break; 11368275SEric Cheng case IPPROTO_UDP: 11378275SEric Cheng case IPPROTO_SCTP: 11388275SEric Cheng case IPPROTO_ESP: 11398275SEric Cheng if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 11408275SEric Cheng hash = HASH_ADDR(ipha->ipha_src, 11418275SEric Cheng *(uint32_t *)(mp->b_rptr + ports_offset)); 11428275SEric Cheng indx = COMPUTE_INDEX(hash, 11438275SEric Cheng mac_srs->srs_udp_ring_count); 11448275SEric Cheng } else { 11458275SEric Cheng indx = mac_srs->srs_ind % 11468275SEric Cheng mac_srs->srs_udp_ring_count; 11478275SEric Cheng mac_srs->srs_ind++; 11488275SEric Cheng } 11498275SEric Cheng type = V4_UDP; 11508833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 11518275SEric Cheng break; 11528833SVenu.Iyer@Sun.COM default: 11538833SVenu.Iyer@Sun.COM indx = 0; 11548833SVenu.Iyer@Sun.COM type = OTH; 11558275SEric Cheng } 11568275SEric Cheng 
11578275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 11588275SEric Cheng cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 11598275SEric Cheng } 11608275SEric Cheng 11618275SEric Cheng for (type = V4_TCP; type < UNDEF; type++) { 11628833SVenu.Iyer@Sun.COM int i; 11638833SVenu.Iyer@Sun.COM 11648275SEric Cheng for (i = 0; i < fanout_cnt; i++) { 11658275SEric Cheng if (headmp[type][i] != NULL) { 11668833SVenu.Iyer@Sun.COM mac_soft_ring_t *softring; 11678833SVenu.Iyer@Sun.COM 11688275SEric Cheng ASSERT(tailmp[type][i]->b_next == NULL); 11698275SEric Cheng switch (type) { 11708275SEric Cheng case V4_TCP: 11718275SEric Cheng softring = 11728275SEric Cheng mac_srs->srs_tcp_soft_rings[i]; 11738275SEric Cheng break; 11748275SEric Cheng case V4_UDP: 11758275SEric Cheng softring = 11768275SEric Cheng mac_srs->srs_udp_soft_rings[i]; 11778275SEric Cheng break; 11788275SEric Cheng case OTH: 11798275SEric Cheng softring = 11808275SEric Cheng mac_srs->srs_oth_soft_rings[i]; 11818275SEric Cheng break; 11828275SEric Cheng } 11838833SVenu.Iyer@Sun.COM mac_rx_soft_ring_process(mcip, 11848275SEric Cheng softring, headmp[type][i], tailmp[type][i], 11858275SEric Cheng cnt[type][i], sz[type][i]); 11868275SEric Cheng } 11878275SEric Cheng } 11888275SEric Cheng } 11898275SEric Cheng } 11908275SEric Cheng 11918275SEric Cheng #define SRS_BYTES_TO_PICKUP 150000 11928275SEric Cheng ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 11938275SEric Cheng 11948275SEric Cheng /* 11958275SEric Cheng * mac_rx_srs_poll_ring 11968275SEric Cheng * 11978275SEric Cheng * This SRS Poll thread uses this routine to poll the underlying hardware 11988275SEric Cheng * Rx ring to get a chain of packets. It can inline process that chain 11998275SEric Cheng * if mac_latency_optimize is set (default) or signal the SRS worker thread 12008275SEric Cheng * to do the remaining processing. 
12018275SEric Cheng * 12028275SEric Cheng * Since packets come in the system via interrupt or poll path, we also 12038275SEric Cheng * update the stats and deal with promiscous clients here. 12048275SEric Cheng */ 12058275SEric Cheng void 12068275SEric Cheng mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 12078275SEric Cheng { 12088275SEric Cheng kmutex_t *lock = &mac_srs->srs_lock; 12098275SEric Cheng kcondvar_t *async = &mac_srs->srs_cv; 12108275SEric Cheng mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 12118275SEric Cheng mblk_t *head, *tail, *mp; 12128275SEric Cheng callb_cpr_t cprinfo; 12138275SEric Cheng ssize_t bytes_to_pickup; 12148275SEric Cheng size_t sz; 12158275SEric Cheng int count; 12168275SEric Cheng mac_client_impl_t *smcip; 12178275SEric Cheng 12188275SEric Cheng CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 12198275SEric Cheng mutex_enter(lock); 12208275SEric Cheng 12218275SEric Cheng start: 12228275SEric Cheng for (;;) { 12238275SEric Cheng if (mac_srs->srs_state & SRS_PAUSE) 12248275SEric Cheng goto done; 12258275SEric Cheng 12268275SEric Cheng CALLB_CPR_SAFE_BEGIN(&cprinfo); 12278275SEric Cheng cv_wait(async, lock); 12288275SEric Cheng CALLB_CPR_SAFE_END(&cprinfo, lock); 12298275SEric Cheng 12308275SEric Cheng if (mac_srs->srs_state & SRS_PAUSE) 12318275SEric Cheng goto done; 12328275SEric Cheng 12338275SEric Cheng check_again: 12348275SEric Cheng if (mac_srs->srs_type & SRST_BW_CONTROL) { 12358275SEric Cheng /* 12368275SEric Cheng * We pick as many bytes as we are allowed to queue. 12378275SEric Cheng * Its possible that we will exceed the total 12388275SEric Cheng * packets queued in case this SRS is part of the 12398275SEric Cheng * Rx ring group since > 1 poll thread can be pulling 12408275SEric Cheng * upto the max allowed packets at the same time 12418275SEric Cheng * but that should be OK. 
12428275SEric Cheng */ 12438275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 12448275SEric Cheng bytes_to_pickup = 12458275SEric Cheng mac_srs->srs_bw->mac_bw_drop_threshold - 12468275SEric Cheng mac_srs->srs_bw->mac_bw_sz; 12478275SEric Cheng /* 12488275SEric Cheng * We shouldn't have been signalled if we 12498275SEric Cheng * have 0 or less bytes to pick but since 12508275SEric Cheng * some of the bytes accounting is driver 12518275SEric Cheng * dependant, we do the safety check. 12528275SEric Cheng */ 12538275SEric Cheng if (bytes_to_pickup < 0) 12548275SEric Cheng bytes_to_pickup = 0; 12558275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 12568275SEric Cheng } else { 12578275SEric Cheng /* 12588275SEric Cheng * ToDO: Need to change the polling API 12598275SEric Cheng * to add a packet count and a flag which 12608275SEric Cheng * tells the driver whether we want packets 12618275SEric Cheng * based on a count, or bytes, or all the 12628275SEric Cheng * packets queued in the driver/HW. This 12638275SEric Cheng * way, we never have to check the limits 12648275SEric Cheng * on poll path. We truly let only as many 12658275SEric Cheng * packets enter the system as we are willing 12668275SEric Cheng * to process or queue. 12678275SEric Cheng * 12688275SEric Cheng * Something along the lines of 12698275SEric Cheng * pkts_to_pickup = mac_soft_ring_max_q_cnt - 12708275SEric Cheng * mac_srs->srs_poll_pkt_cnt 12718275SEric Cheng */ 12728275SEric Cheng 12738275SEric Cheng /* 12748275SEric Cheng * Since we are not doing B/W control, pick 12758275SEric Cheng * as many packets as allowed. 
12768275SEric Cheng */ 12778275SEric Cheng bytes_to_pickup = max_bytes_to_pickup; 12788275SEric Cheng } 12798275SEric Cheng 12808275SEric Cheng /* Poll the underlying Hardware */ 12818275SEric Cheng mutex_exit(lock); 12828275SEric Cheng head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 12838275SEric Cheng mutex_enter(lock); 12848275SEric Cheng 12858275SEric Cheng ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 12868275SEric Cheng SRS_POLL_THR_OWNER); 12878275SEric Cheng 12888275SEric Cheng mp = tail = head; 12898275SEric Cheng count = 0; 12908275SEric Cheng sz = 0; 12918275SEric Cheng while (mp != NULL) { 12928275SEric Cheng tail = mp; 12938275SEric Cheng sz += msgdsize(mp); 12948275SEric Cheng mp = mp->b_next; 12958275SEric Cheng count++; 12968275SEric Cheng } 12978275SEric Cheng 12988275SEric Cheng if (head != NULL) { 12998275SEric Cheng tail->b_next = NULL; 13008275SEric Cheng smcip = mac_srs->srs_mcip; 13018275SEric Cheng 13028275SEric Cheng if ((mac_srs->srs_type & SRST_FLOW) || 13038275SEric Cheng (smcip == NULL)) { 13048275SEric Cheng FLOW_STAT_UPDATE(mac_srs->srs_flent, 13058275SEric Cheng rbytes, sz); 13068275SEric Cheng FLOW_STAT_UPDATE(mac_srs->srs_flent, 13078275SEric Cheng ipackets, count); 13088275SEric Cheng } 13098275SEric Cheng 13108275SEric Cheng /* 13118275SEric Cheng * If there are any promiscuous mode callbacks 13128275SEric Cheng * defined for this MAC client, pass them a copy 13138275SEric Cheng * if appropriate and also update the counters. 
13148275SEric Cheng */ 13158275SEric Cheng if (smcip != NULL) { 13168275SEric Cheng smcip->mci_stat_ibytes += sz; 13178275SEric Cheng smcip->mci_stat_ipackets += count; 13188275SEric Cheng 13198275SEric Cheng if (smcip->mci_mip->mi_promisc_list != NULL) { 13208275SEric Cheng mutex_exit(lock); 13218275SEric Cheng mac_promisc_dispatch(smcip->mci_mip, 13228275SEric Cheng head, NULL); 13238275SEric Cheng mutex_enter(lock); 13248275SEric Cheng } 13258275SEric Cheng } 13268275SEric Cheng if (mac_srs->srs_type & SRST_BW_CONTROL) { 13278275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 13288275SEric Cheng mac_srs->srs_bw->mac_bw_polled += sz; 13298275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 13308275SEric Cheng } 13318275SEric Cheng srs_rx->sr_poll_count += count; 13328275SEric Cheng MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 13338275SEric Cheng count, sz); 13348275SEric Cheng if (count <= 10) 13358275SEric Cheng srs_rx->sr_chain_cnt_undr10++; 13368275SEric Cheng else if (count > 10 && count <= 50) 13378275SEric Cheng srs_rx->sr_chain_cnt_10to50++; 13388275SEric Cheng else 13398275SEric Cheng srs_rx->sr_chain_cnt_over50++; 13408275SEric Cheng } 13418275SEric Cheng 13428275SEric Cheng /* 13438275SEric Cheng * We are guaranteed that SRS_PROC will be set if we 13448275SEric Cheng * are here. Also, poll thread gets to run only if 13458275SEric Cheng * the drain was being done by a worker thread although 13468275SEric Cheng * its possible that worker thread is still running 13478275SEric Cheng * and poll thread was sent down to keep the pipeline 13488275SEric Cheng * going instead of doing a complete drain and then 13498275SEric Cheng * trying to poll the NIC. 13508275SEric Cheng * 13518275SEric Cheng * So we need to check SRS_WORKER flag to make sure 13528275SEric Cheng * that the worker thread is not processing the queue 13538275SEric Cheng * in parallel to us. 
The flags and conditions are 13548275SEric Cheng * protected by the srs_lock to prevent any race. We 13558275SEric Cheng * ensure that we don't drop the srs_lock from now 13568275SEric Cheng * till the end and similarly we don't drop the srs_lock 13578275SEric Cheng * in mac_rx_srs_drain() till similar condition check 13588275SEric Cheng * are complete. The mac_rx_srs_drain() needs to ensure 13598275SEric Cheng * that SRS_WORKER flag remains set as long as its 13608275SEric Cheng * processing the queue. 13618275SEric Cheng */ 13628275SEric Cheng if (!(mac_srs->srs_state & SRS_WORKER) && 13638275SEric Cheng (mac_srs->srs_first != NULL)) { 13648275SEric Cheng /* 13658275SEric Cheng * We have packets to process and worker thread 13668833SVenu.Iyer@Sun.COM * is not running. Check to see if poll thread is 13678833SVenu.Iyer@Sun.COM * allowed to process. 13688275SEric Cheng */ 13698833SVenu.Iyer@Sun.COM if (mac_srs->srs_state & SRS_LATENCY_OPT) { 13708275SEric Cheng mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 13719209SEric Cheng if (!(mac_srs->srs_state & SRS_PAUSE) && 13729209SEric Cheng srs_rx->sr_poll_pkt_cnt <= 13738275SEric Cheng srs_rx->sr_lowat) { 13748275SEric Cheng srs_rx->sr_poll_again++; 13758275SEric Cheng goto check_again; 13768833SVenu.Iyer@Sun.COM } 13778833SVenu.Iyer@Sun.COM /* 13788833SVenu.Iyer@Sun.COM * We are already above low water mark 13798833SVenu.Iyer@Sun.COM * so stay in the polling mode but no 13808833SVenu.Iyer@Sun.COM * need to poll. 
Once we dip below 13818833SVenu.Iyer@Sun.COM * the polling threshold, the processing 13828833SVenu.Iyer@Sun.COM * thread (soft ring) will signal us 13838833SVenu.Iyer@Sun.COM * to poll again (MAC_UPDATE_SRS_COUNT) 13848833SVenu.Iyer@Sun.COM */ 13858833SVenu.Iyer@Sun.COM srs_rx->sr_poll_drain_no_poll++; 13868833SVenu.Iyer@Sun.COM mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 13878833SVenu.Iyer@Sun.COM /* 13888833SVenu.Iyer@Sun.COM * In B/W control case, its possible 13898833SVenu.Iyer@Sun.COM * that the backlog built up due to 13908833SVenu.Iyer@Sun.COM * B/W limit being reached and packets 13918833SVenu.Iyer@Sun.COM * are queued only in SRS. In this case, 13928833SVenu.Iyer@Sun.COM * we should schedule worker thread 13938833SVenu.Iyer@Sun.COM * since no one else will wake us up. 13948833SVenu.Iyer@Sun.COM */ 13958833SVenu.Iyer@Sun.COM if ((mac_srs->srs_type & SRST_BW_CONTROL) && 13968833SVenu.Iyer@Sun.COM (mac_srs->srs_tid == NULL)) { 13978833SVenu.Iyer@Sun.COM mac_srs->srs_tid = 13988833SVenu.Iyer@Sun.COM timeout(mac_srs_fire, mac_srs, 1); 13998833SVenu.Iyer@Sun.COM srs_rx->sr_poll_worker_wakeup++; 14008275SEric Cheng } 14018275SEric Cheng } else { 14028275SEric Cheng /* 14038275SEric Cheng * Wakeup the worker thread for more processing. 14048275SEric Cheng * We optimize for throughput in this case. 14058275SEric Cheng */ 14068275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 14078275SEric Cheng MAC_SRS_WORKER_WAKEUP(mac_srs); 14088275SEric Cheng srs_rx->sr_poll_sig_worker++; 14098275SEric Cheng } 14108275SEric Cheng } else if ((mac_srs->srs_first == NULL) && 14118275SEric Cheng !(mac_srs->srs_state & SRS_WORKER)) { 14128275SEric Cheng /* 14138275SEric Cheng * There is nothing queued in SRS and 14148275SEric Cheng * no worker thread running. 
Plus we 14158275SEric Cheng * didn't get anything from the H/W 14168275SEric Cheng * as well (head == NULL); 14178275SEric Cheng */ 14188275SEric Cheng ASSERT(head == NULL); 14198275SEric Cheng mac_srs->srs_state &= 14208275SEric Cheng ~(SRS_PROC|SRS_GET_PKTS); 14218275SEric Cheng 14228275SEric Cheng /* 14238275SEric Cheng * If we have a packets in soft ring, don't allow 14248275SEric Cheng * more packets to come into this SRS by keeping the 14258275SEric Cheng * interrupts off but not polling the H/W. The 14268275SEric Cheng * poll thread will get signaled as soon as 14278275SEric Cheng * srs_poll_pkt_cnt dips below poll threshold. 14288275SEric Cheng */ 14298275SEric Cheng if (srs_rx->sr_poll_pkt_cnt == 0) { 14308275SEric Cheng srs_rx->sr_poll_intr_enable++; 14318275SEric Cheng MAC_SRS_POLLING_OFF(mac_srs); 14328275SEric Cheng } else { 14338275SEric Cheng /* 14348275SEric Cheng * We know nothing is queued in SRS 14358275SEric Cheng * since we are here after checking 14368275SEric Cheng * srs_first is NULL. The backlog 14378275SEric Cheng * is entirely due to packets queued 14388275SEric Cheng * in Soft ring which will wake us up 14398275SEric Cheng * and get the interface out of polling 14408275SEric Cheng * mode once the backlog dips below 14418275SEric Cheng * sr_poll_thres. 14428275SEric Cheng */ 14438275SEric Cheng srs_rx->sr_poll_no_poll++; 14448275SEric Cheng } 14458275SEric Cheng } else { 14468275SEric Cheng /* 14478275SEric Cheng * Worker thread is already running. 14488275SEric Cheng * Nothing much to do. If the polling 14498275SEric Cheng * was enabled, worker thread will deal 14508275SEric Cheng * with that. 
14518275SEric Cheng */ 14528275SEric Cheng mac_srs->srs_state &= ~SRS_GET_PKTS; 14538275SEric Cheng srs_rx->sr_poll_goto_sleep++; 14548275SEric Cheng } 14558275SEric Cheng } 14568275SEric Cheng done: 14578275SEric Cheng mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 14588275SEric Cheng cv_signal(&mac_srs->srs_async); 14598275SEric Cheng /* 14608275SEric Cheng * If this is a temporary quiesce then wait for the restart signal 14618275SEric Cheng * from the srs worker. Then clear the flags and signal the srs worker 14628275SEric Cheng * to ensure a positive handshake and go back to start. 14638275SEric Cheng */ 14648275SEric Cheng while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 14658275SEric Cheng cv_wait(async, lock); 14668275SEric Cheng if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 14678275SEric Cheng ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 14688275SEric Cheng mac_srs->srs_state &= 14698275SEric Cheng ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 14708275SEric Cheng cv_signal(&mac_srs->srs_async); 14718275SEric Cheng goto start; 14728275SEric Cheng } else { 14738275SEric Cheng mac_srs->srs_state |= SRS_POLL_THR_EXITED; 14748275SEric Cheng cv_signal(&mac_srs->srs_async); 14758275SEric Cheng CALLB_CPR_EXIT(&cprinfo); 14768275SEric Cheng thread_exit(); 14778275SEric Cheng } 14788275SEric Cheng } 14798275SEric Cheng 14808275SEric Cheng /* 14818275SEric Cheng * mac_srs_pick_chain 14828275SEric Cheng * 14838275SEric Cheng * In Bandwidth control case, checks how many packets can be processed 14848275SEric Cheng * and return them in a sub chain. 
14858275SEric Cheng */ 14868275SEric Cheng static mblk_t * 14878275SEric Cheng mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 14888275SEric Cheng size_t *chain_sz, int *chain_cnt) 14898275SEric Cheng { 14908275SEric Cheng mblk_t *head = NULL; 14918275SEric Cheng mblk_t *tail = NULL; 14928275SEric Cheng size_t sz; 14938275SEric Cheng size_t tsz = 0; 14948275SEric Cheng int cnt = 0; 14958275SEric Cheng mblk_t *mp; 14968275SEric Cheng 14978275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 14988275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 14998275SEric Cheng if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 15008275SEric Cheng mac_srs->srs_bw->mac_bw_limit) || 15018275SEric Cheng (mac_srs->srs_bw->mac_bw_limit == 0)) { 15028275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 15038275SEric Cheng head = mac_srs->srs_first; 15048275SEric Cheng mac_srs->srs_first = NULL; 15058275SEric Cheng *chain_tail = mac_srs->srs_last; 15068275SEric Cheng mac_srs->srs_last = NULL; 15078275SEric Cheng *chain_sz = mac_srs->srs_size; 15088275SEric Cheng *chain_cnt = mac_srs->srs_count; 15098275SEric Cheng mac_srs->srs_count = 0; 15108275SEric Cheng mac_srs->srs_size = 0; 15118275SEric Cheng return (head); 15128275SEric Cheng } 15138275SEric Cheng 15148275SEric Cheng /* 15158275SEric Cheng * Can't clear the entire backlog. 
15168275SEric Cheng * Need to find how many packets to pick 15178275SEric Cheng */ 15188275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 15198275SEric Cheng while ((mp = mac_srs->srs_first) != NULL) { 15208275SEric Cheng sz = msgdsize(mp); 15218275SEric Cheng if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 15228275SEric Cheng mac_srs->srs_bw->mac_bw_limit) { 15238275SEric Cheng if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 15248275SEric Cheng mac_srs->srs_bw->mac_bw_state |= 15258275SEric Cheng SRS_BW_ENFORCED; 15268275SEric Cheng break; 15278275SEric Cheng } 15288275SEric Cheng 15298275SEric Cheng /* 15308275SEric Cheng * The _size & cnt is decremented from the softrings 15318275SEric Cheng * when they send up the packet for polling to work 15328275SEric Cheng * properly. 15338275SEric Cheng */ 15348275SEric Cheng tsz += sz; 15358275SEric Cheng cnt++; 15368275SEric Cheng mac_srs->srs_count--; 15378275SEric Cheng mac_srs->srs_size -= sz; 15388275SEric Cheng if (tail != NULL) 15398275SEric Cheng tail->b_next = mp; 15408275SEric Cheng else 15418275SEric Cheng head = mp; 15428275SEric Cheng tail = mp; 15438275SEric Cheng mac_srs->srs_first = mac_srs->srs_first->b_next; 15448275SEric Cheng } 15458275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 15468275SEric Cheng if (mac_srs->srs_first == NULL) 15478275SEric Cheng mac_srs->srs_last = NULL; 15488275SEric Cheng 15498275SEric Cheng if (tail != NULL) 15508275SEric Cheng tail->b_next = NULL; 15518275SEric Cheng *chain_tail = tail; 15528275SEric Cheng *chain_cnt = cnt; 15538275SEric Cheng *chain_sz = tsz; 15548275SEric Cheng 15558275SEric Cheng return (head); 15568275SEric Cheng } 15578275SEric Cheng 15588275SEric Cheng /* 15598275SEric Cheng * mac_rx_srs_drain 15608275SEric Cheng * 15618275SEric Cheng * The SRS drain routine. Gets to run to clear the queue. Any thread 15628275SEric Cheng * (worker, interrupt, poll) can call this based on processing model. 
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode,
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	/* Nothing queued in the SRS; nothing to drain */
	if (mac_srs->srs_first == NULL)
		goto out;

	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
		/*
		 * In the normal case, the SRS worker thread does no
		 * work and we wait for a backlog to build up before
		 * we switch into polling mode. In case we are
		 * optimizing for throughput, we use the worker thread
		 * as well. The goal is to let worker thread process
		 * the queue and poll thread to feed packets into
		 * the queue. As such, we should signal the poll
		 * thread to try and get more packets.
		 *
		 * We could have pulled this check in the POLL_RING
		 * macro itself but keeping it explicit here makes
		 * the architecture more human understandable.
		 */
		MAC_SRS_POLL_RING(mac_srs);
	}

again:
	/* Detach the entire queued chain from the SRS in one shot */
	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* Grab any pending worker-wakeup timeout id; cancelled below */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);


	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		/* srs_lock is dropped around the upcall to avoid deadlock */
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t proc;
		void *arg1;
		mac_resource_handle_t arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
	    (mac_srs->srs_first != NULL)) {
		/*
		 * More packets arrived while we were clearing the
		 * SRS. This can be possible because of one of
		 * three conditions below:
		 * 1) The driver is using multiple worker threads
		 *    to send the packets to us.
		 * 2) The driver has a race in switching
		 *    between interrupt and polling mode or
		 * 3) Packets are arriving in this SRS via the
		 *    S/W classification as well.
		 *
		 * We should switch to polling mode and see if we
		 * need to send the poll thread down. Also, signal
		 * the worker thread to process what's just arrived.
		 */
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
			srs_rx->sr_drain_poll_sig++;
			MAC_SRS_POLL_RING(mac_srs);
		}

		/*
		 * If we didn't signal the poll thread, we need
		 * to deal with the pending packets ourselves.
		 */
		if (proc_type == SRS_WORKER) {
			srs_rx->sr_drain_again++;
			goto again;
		} else {
			srs_rx->sr_drain_worker_sig++;
			cv_signal(&mac_srs->srs_async);
		}
	}

out:
	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done by the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turns polling on only for worker thread.
	 * It's not worth turning polling on for interrupt
	 * thread (since NIC will not issue another interrupt)
	 * unless a backlog builds up.
	 */
	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		srs_rx->sr_drain_keep_polling++;
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
			MAC_SRS_POLL_RING(mac_srs);
		return;
	}

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
	srs_rx->sr_drain_finish_intr++;
}

/*
 * mac_rx_srs_drain_bw
 *
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode,
 * mac_rx_srs_drain.
 * There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
 */
void
mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	size_t			sz = 0;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
again:
	/* Check if we are doing B/W control */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
		/* A new tick has started: reset this quanta's usage */
		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
		mac_srs->srs_bw->mac_bw_used = 0;
		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
		/* Still over budget within this tick; nothing to do yet */
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto done;
	}

	sz = 0;
	cnt = 0;
	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
		/*
		 * We couldn't pick up a single packet.
		 */
		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
		    (mac_srs->srs_size != 0) &&
		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
			/*
			 * Seems like configured B/W doesn't
			 * even allow processing of 1 packet
			 * per tick.
			 *
			 * XXX: raise the limit to processing
			 * at least 1 packet per tick.
			 */
			mac_srs->srs_bw->mac_bw_limit +=
			    mac_srs->srs_bw->mac_bw_limit;
			mac_srs->srs_bw->mac_bw_drop_threshold +=
			    mac_srs->srs_bw->mac_bw_drop_threshold;
			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
			    "raised B/W limit to %d since not even a "
			    "single packet can be processed per "
			    "tick %d\n", (void *)mac_srs,
			    (int)mac_srs->srs_bw->mac_bw_limit,
			    (int)msgdsize(mac_srs->srs_first));
		}
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* zero bandwidth: drop all and return to interrupt mode */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		srs_rx->sr_drop_count += cnt;
		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
		mac_srs->srs_bw->mac_bw_sz -= sz;
		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		mac_pkt_drop(NULL, NULL, head, B_FALSE);
		goto leave_poll;
	} else {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	}

	/* Grab any pending worker-wakeup timeout id; cancelled below */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);
	MAC_SRS_WORKER_POLLING_ON(mac_srs);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		/* srs_lock is dropped around the upcall to avoid deadlock */
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t proc;
		void *arg1;
		mac_resource_handle_t arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);

		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Send the poll thread to pick up any packets arrived
	 * so far. This also serves as the last check in case
	 * nothing else is queued in the SRS. The poll thread
	 * is signalled only in the case the drain was done
	 * by the worker thread and SRS_WORKER is set. The
	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. When we have nothing else to
	 * process, we can exit while leaving SRS_PROC set
	 * which gives the poll thread control to process and
	 * cleanup once it returns from the NIC.
	 *
	 * If we have nothing else to process, we need to
	 * ensure that we keep holding the srs_lock till
	 * all the checks below are done and control is
	 * handed to the poll thread if it was running.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		if (mac_srs->srs_first != NULL) {
			if (proc_type == SRS_WORKER) {
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat)
					MAC_SRS_POLL_RING(mac_srs);
				goto again;
			} else {
				cv_signal(&mac_srs->srs_async);
			}
		}
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

done:

	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		return;
	}

	/*
	 * If we can't process packets because we have exceeded
	 * B/W limit for this tick, just set the timeout
	 * and leave.
	 *
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog. As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
	    (srs_rx->sr_poll_pkt_cnt > 0))) {
		MAC_SRS_POLLING_ON(mac_srs);
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		if ((mac_srs->srs_first != NULL) &&
		    (mac_srs->srs_tid == NULL))
			mac_srs->srs_tid = timeout(mac_srs_fire,
			    mac_srs, 1);
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		return;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

leave_poll:

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
}

/*
 * mac_srs_worker
 *
 * The SRS worker routine. Drains the queue when no one else is
 * processing it.
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t 		*lock = &mac_srs->srs_lock;
	kcondvar_t 		*async = &mac_srs->srs_async;
	callb_cpr_t		cprinfo;
	boolean_t		bw_ctl_flag;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

start:
	for (;;) {
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wakeup the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting for ever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer is already
				 * scheduled or we force schedule one for
				 * later so that we can continue processing
				 * after this quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
		mac_srs_worker_quiesce(mac_srs);

	mac_srs->srs_state &= ~SRS_PROC;
	/* The macro drops the srs_lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

/*
 * mac_rx_srs_subflow_process
 *
 * Receive side routine called from interrupt path when there are
 * sub flows present on this SRS.
 */
/* ARGSUSED */
void
mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	flow_entry_t		*flent = NULL;
	flow_entry_t		*prev_flent = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*tail = NULL;
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_client_impl_t	*mcip;

	mcip = mac_srs->srs_mcip;
	ASSERT(mcip != NULL);

	/*
	 * We need to determine the SRS for every packet
	 * by walking the flow table, if we don't get any,
	 * then we proceed using the SRS we came with.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {

		/*
		 * We will increment the stats for the matching subflow
		 * when we get the bytes/pkt count for the classified packets
		 * later in mac_rx_srs_process.
		 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		tail->b_next = NULL;
		/*
		 * A null indicates, this is for the mac_srs itself.
		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}

/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_client_impl_t	*smcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);
	smcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
	}
	if (smcip != NULL) {
		smcip->mci_stat_ibytes += sz;
		smcip->mci_stat_ipackets += count;
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		/* Count the packets and bytes via interrupt */
		srs_rx->sr_intr_count += count;
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_drop_count += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				/*
				 * Walk the chain and take as many leading
				 * packets as fit under the drop threshold;
				 * the remainder (head) is dropped below.
				 */
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_drop_count += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_drop_count += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
	/* Count the packets entering via interrupt path */
	srs_rx->sr_intr_count += count;

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback or if we are not
		 * optimizing for latency, we should signal the
		 * worker thread.
		 */
		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
			/*
			 * For loopback, We need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if its a single packet in non
			 * latency optimized case (in latency optimized
			 * case, we inline process chains of any size).
			 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}

/* TX SIDE ROUTINES (RUNTIME) */

/*
 * mac_tx_srs_no_desc
 *
 * This routine is called by Tx single ring default mode
 * when Tx ring runs out of descs.
 */
mac_tx_cookie_t
mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
	boolean_t wakeup_worker = B_TRUE;
	uint32_t tx_mode = srs_tx->st_mode;
	int cnt, sz;
	mblk_t *tail;

	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
	if (flag & MAC_DROP_ON_NO_DESC) {
		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
	} else {
		if (mac_srs->srs_first != NULL)
			wakeup_worker = B_FALSE;
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If TX_QUEUED is not set, queue the
			 * packet and let mac_tx_srs_drain()
			 * set the TX_BLOCKED bit for the
			 * reasons explained above. Otherwise,
			 * return the mblks.
			 */
			if (wakeup_worker) {
				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
				    mp_chain, tail, cnt, sz);
			} else {
				MAC_TX_SET_NO_ENQUEUE(mac_srs,
				    mp_chain, ret_mp, cookie);
			}
		} else {
			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
			    tail, cnt, sz, cookie);
		}
		if (wakeup_worker)
			cv_signal(&mac_srs->srs_async);
	}
	return (cookie);
}

/*
 * mac_tx_srs_enqueue
 *
 * This routine is called when Tx SRS is operating in either serializer
 * or bandwidth mode. In serializer mode, a packet will get enqueued
 * when a thread cannot enter SRS exclusively. In bandwidth mode,
 * packets get queued if allowed byte-count limit for a tick is
 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
 * MAC_TX_NO_ENQUEUE is set is different than when operating in either
 * the default mode or fanout mode. Here packets get dropped or
 * returned back to the caller only after hi-watermark worth of data
 * is queued.
 */
static mac_tx_cookie_t
mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	int cnt, sz;
	mblk_t *tail;
	boolean_t wakeup_worker = B_TRUE;

	/*
	 * Ignore fanout hint if we don't have multiple tx rings.
	 */
	if (!TX_MULTI_RING_MODE(mac_srs))
		fanout_hint = 0;

	if (mac_srs->srs_first != NULL)
		wakeup_worker = B_FALSE;
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		} else {
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else if (flag & MAC_TX_NO_ENQUEUE) {
		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
			    ret_mp, cookie);
		} else {
			/* Save the fanout hint in b_prev before enqueueing */
			mp_chain->b_prev = (mblk_t *)fanout_hint;
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else {
		/*
		 * If you are BW_ENFORCED, just enqueue the
		 * packet. srs_worker will drain it at the
		 * prescribed rate. Before enqueueing, save
		 * the fanout hint.
		 */
		mp_chain->b_prev = (mblk_t *)fanout_hint;
		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
		    tail, cnt, sz, cookie);
	}
	if (wakeup_worker)
		cv_signal(&mac_srs->srs_async);
	return (cookie);
}

/*
 * There are five tx modes:
 *
 * 1) Default mode (SRS_TX_DEFAULT)
 * 2) Serialization mode (SRS_TX_SERIALIZE)
 * 3) Fanout mode (SRS_TX_FANOUT)
 * 4) Bandwidth mode (SRS_TX_BW)
 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
 *
 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
 * based on the number of Tx rings requested for an SRS and whether
 * bandwidth control is requested or not.
 *
 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
 * When flow-control is relieved, the srs_worker drains the queued
 * packets and informs blocked clients to restart sending packets.
 *
 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
 *
 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
 * Tx rings. Each Tx ring will have a soft ring associated with it.
 * These soft rings will be hung off the Tx SRS. Queueing if it happens
 * due to lack of Tx desc will be in individual soft ring (and not srs)
 * associated with Tx ring.
 *
 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
 * only if bw is available. Otherwise the packets will be queued in
 * SRS. If fanout to multiple Tx rings is configured, the packets will
 * be fanned out among the soft rings associated with the Tx rings.
 *
 * Four flags are used in srs_state for indicating flow control
 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
 * driver below.
 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
 * and flow-control pressure is applied back to clients. The clients expect
 * wakeup when flow-control is relieved.
 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
 * got returned back to client either due to lack of Tx descs or due to bw
 * control reasons. The clients expect a wakeup when condition is relieved.
 *
 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
 * MAC_TX_NO_ENQUEUE
 * Mac clients that do not want packets to be enqueued in the mac layer set
 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
 * behaviour of this flag is different when the Tx is running in serializer
 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
 * get dropped when Tx high watermark is reached.
 * There are some mac clients like vsw, aggr that want the mblks to be
 * returned back to clients instead of being queued in Tx SRS (or Tx soft
 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
 * In the default and Tx fanout mode, the un-transmitted mblks will be
 * returned back to the clients when the driver runs out of Tx descs.
 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
 * soft ring) so that the clients can be woken up when Tx desc become
 * available. When running in serializer or bandwidth mode,
 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
 */

/* Map a tx mode to its transmit function (table indexed by mode) */
mac_tx_func_t
mac_tx_get_func(uint32_t mode)
{
	return (mac_tx_mode_list[mode].mac_tx_func);
}

/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;

	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);

	/* Regular case with a single Tx ring */
	/*
	 * SRS_TX_BLOCKED is set when underlying NIC runs
	 * out of Tx descs and messages start getting
	 * queued. It won't get reset until
	 * tx_srs_drain() completely drains out the
	 * messages.
	 */
	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
		/* Tx descs/resources not available */
		mutex_enter(&mac_srs->srs_lock);
		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
			    flag, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		/*
		 * While we were computing mblk count, the
		 * flow control condition got relieved.
		 * Continue with the transmission.
		 */
		mutex_exit(&mac_srs->srs_lock);
	}

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	/*
	 * Multiple threads could be here sending packets.
	 * Under such conditions, it is not possible to
	 * atomically set SRS_TX_BLOCKED bit to indicate
	 * out of tx desc condition. To atomically set
	 * this, we queue the returned packet and do
	 * the setting of SRS_TX_BLOCKED in
	 * mac_tx_srs_drain().
	 */
	if (mp_chain != NULL) {
		mutex_enter(&mac_srs->srs_lock);
		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	if (is_subflow)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (NULL);
}

/*
 * mac_tx_serialize_mode
 *
 * This is an experimental mode implemented as per the request of PAE.
 * In this mode, all callers attempting to send a packet to the NIC
 * will get serialized. Only one thread at any time will access the
 * NIC to send the packet out.
 */
/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	/* Single ring, serialize below */
	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
	mutex_enter(&mac_srs->srs_lock);
	if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_state & SRS_PROC)) {
		/*
		 * In serialization mode, queue all packets until
		 * TX_HIWAT is set.
		 * If drop bit is set, drop if TX_HIWAT is set.
		 * If no_enqueue is set, still enqueue until hiwat
		 * is set and return mblks after TX_HIWAT is set.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
		    flag, NULL, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	/*
	 * No packets queued, nothing on proc and no flow
	 * control condition. Fast-path, ok. Do inline
	 * processing.
	 */
	mac_srs->srs_state |= SRS_PROC;
	mutex_exit(&mac_srs->srs_lock);

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state &= ~SRS_PROC;
	if (mp_chain != NULL) {
		/* Driver flow-controlled; queue the leftover chain */
		cookie = mac_tx_srs_enqueue(mac_srs,
		    mp_chain, flag, NULL, ret_mp);
	}
	if (mac_srs->srs_first != NULL) {
		/*
		 * We processed inline our packet and a new
		 * packet/s got queued while we were
		 * processing. Wakeup srs worker
		 */
		cv_signal(&mac_srs->srs_async);
	}
	mutex_exit(&mac_srs->srs_lock);

	if (is_subflow && cookie == NULL)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (cookie);
}

/*
 * mac_tx_fanout_mode
 *
 * In this mode, the SRS will have access to multiple Tx rings to send
 * the packet out. The fanout hint that is passed as an argument is
 * used to find an appropriate ring to fanout the traffic. Each Tx
 * ring, in turn, will have a soft ring associated with it. If a Tx
 * ring runs out of Tx desc's the returned packet will be queued in
 * the soft ring associated with that Tx ring. The srs itself will not
 * queue any packets.
27618275SEric Cheng */ 27628833SVenu.Iyer@Sun.COM 27638833SVenu.Iyer@Sun.COM #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 27648833SVenu.Iyer@Sun.COM index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 27658833SVenu.Iyer@Sun.COM softring = mac_srs->srs_oth_soft_rings[index]; \ 27668833SVenu.Iyer@Sun.COM cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 27678833SVenu.Iyer@Sun.COM DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 27688833SVenu.Iyer@Sun.COM } 27698833SVenu.Iyer@Sun.COM 27708275SEric Cheng static mac_tx_cookie_t 27718275SEric Cheng mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 27728275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 27738275SEric Cheng { 27748275SEric Cheng mac_soft_ring_t *softring; 27758833SVenu.Iyer@Sun.COM uint64_t hash; 27768833SVenu.Iyer@Sun.COM uint_t index; 27778833SVenu.Iyer@Sun.COM mac_tx_cookie_t cookie = NULL; 27788275SEric Cheng 27798275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 27808833SVenu.Iyer@Sun.COM if (fanout_hint != 0) { 27818833SVenu.Iyer@Sun.COM /* 27828833SVenu.Iyer@Sun.COM * The hint is specified by the caller, simply pass the 27838833SVenu.Iyer@Sun.COM * whole chain to the soft ring. 27848833SVenu.Iyer@Sun.COM */ 27858833SVenu.Iyer@Sun.COM hash = HASH_HINT(fanout_hint); 27868833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(mp_chain); 27878833SVenu.Iyer@Sun.COM } else { 27888833SVenu.Iyer@Sun.COM mblk_t *last_mp, *cur_mp, *sub_chain; 27898833SVenu.Iyer@Sun.COM uint64_t last_hash = 0; 27908833SVenu.Iyer@Sun.COM uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 27918833SVenu.Iyer@Sun.COM 27928833SVenu.Iyer@Sun.COM /* 27938833SVenu.Iyer@Sun.COM * Compute the hash from the contents (headers) of the 27948833SVenu.Iyer@Sun.COM * packets of the mblk chain. Split the chains into 27958833SVenu.Iyer@Sun.COM * subchains of the same conversation. 
27968833SVenu.Iyer@Sun.COM * 27978833SVenu.Iyer@Sun.COM * Since there may be more than one ring used for 27988833SVenu.Iyer@Sun.COM * sub-chains of the same call, and since the caller 27998833SVenu.Iyer@Sun.COM * does not maintain per conversation state since it 28008833SVenu.Iyer@Sun.COM * passed a zero hint, unsent subchains will be 28018833SVenu.Iyer@Sun.COM * dropped. 28028833SVenu.Iyer@Sun.COM */ 28038833SVenu.Iyer@Sun.COM 28048833SVenu.Iyer@Sun.COM flag |= MAC_DROP_ON_NO_DESC; 28058833SVenu.Iyer@Sun.COM ret_mp = NULL; 28068833SVenu.Iyer@Sun.COM 28078833SVenu.Iyer@Sun.COM ASSERT(ret_mp == NULL); 28088833SVenu.Iyer@Sun.COM 28098833SVenu.Iyer@Sun.COM sub_chain = NULL; 28108833SVenu.Iyer@Sun.COM last_mp = NULL; 28118833SVenu.Iyer@Sun.COM 28128833SVenu.Iyer@Sun.COM for (cur_mp = mp_chain; cur_mp != NULL; 28138833SVenu.Iyer@Sun.COM cur_mp = cur_mp->b_next) { 28148833SVenu.Iyer@Sun.COM hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 28158833SVenu.Iyer@Sun.COM B_TRUE); 28168833SVenu.Iyer@Sun.COM if (last_hash != 0 && hash != last_hash) { 28178833SVenu.Iyer@Sun.COM /* 28188833SVenu.Iyer@Sun.COM * Starting a different subchain, send current 28198833SVenu.Iyer@Sun.COM * chain out. 
28208833SVenu.Iyer@Sun.COM */ 28218833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 28228833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 28238833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 28248833SVenu.Iyer@Sun.COM sub_chain = NULL; 28258833SVenu.Iyer@Sun.COM } 28268833SVenu.Iyer@Sun.COM 28278833SVenu.Iyer@Sun.COM /* add packet to subchain */ 28288833SVenu.Iyer@Sun.COM if (sub_chain == NULL) 28298833SVenu.Iyer@Sun.COM sub_chain = cur_mp; 28308833SVenu.Iyer@Sun.COM last_mp = cur_mp; 28318833SVenu.Iyer@Sun.COM last_hash = hash; 28328833SVenu.Iyer@Sun.COM } 28338833SVenu.Iyer@Sun.COM 28348833SVenu.Iyer@Sun.COM if (sub_chain != NULL) { 28358833SVenu.Iyer@Sun.COM /* send last subchain */ 28368833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 28378833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 28388833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 28398833SVenu.Iyer@Sun.COM } 28408833SVenu.Iyer@Sun.COM 28418833SVenu.Iyer@Sun.COM cookie = NULL; 28428833SVenu.Iyer@Sun.COM } 28438833SVenu.Iyer@Sun.COM 28448833SVenu.Iyer@Sun.COM return (cookie); 28458275SEric Cheng } 28468275SEric Cheng 28478275SEric Cheng /* 28488275SEric Cheng * mac_tx_bw_mode 28498275SEric Cheng * 28508275SEric Cheng * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 28518275SEric Cheng * only if bw is available. Otherwise the packets will be queued in 28528275SEric Cheng * SRS. If the SRS has multiple Tx rings, then packets will get fanned 28538275SEric Cheng * out to a Tx rings. 
28548275SEric Cheng */ 28558275SEric Cheng static mac_tx_cookie_t 28568275SEric Cheng mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 28578275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 28588275SEric Cheng { 28598275SEric Cheng int cnt, sz; 28608275SEric Cheng mblk_t *tail; 28618275SEric Cheng mac_tx_cookie_t cookie = NULL; 28628275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 28638275SEric Cheng 28648275SEric Cheng ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 28658275SEric Cheng ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 28668275SEric Cheng mutex_enter(&mac_srs->srs_lock); 28678275SEric Cheng if (mac_srs->srs_bw->mac_bw_limit == 0) { 28688833SVenu.Iyer@Sun.COM /* 28698833SVenu.Iyer@Sun.COM * zero bandwidth, no traffic is sent: drop the packets, 28708833SVenu.Iyer@Sun.COM * or return the whole chain if the caller requests all 28718833SVenu.Iyer@Sun.COM * unsent packets back. 28728833SVenu.Iyer@Sun.COM */ 28738833SVenu.Iyer@Sun.COM if (flag & MAC_TX_NO_ENQUEUE) { 28748833SVenu.Iyer@Sun.COM cookie = (mac_tx_cookie_t)mac_srs; 28758833SVenu.Iyer@Sun.COM *ret_mp = mp_chain; 28768833SVenu.Iyer@Sun.COM } else { 28778833SVenu.Iyer@Sun.COM MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 28788833SVenu.Iyer@Sun.COM } 28798275SEric Cheng mutex_exit(&mac_srs->srs_lock); 28808275SEric Cheng return (cookie); 28818275SEric Cheng } else if ((mac_srs->srs_first != NULL) || 28828275SEric Cheng (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 28838275SEric Cheng cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 28848275SEric Cheng fanout_hint, ret_mp); 28858275SEric Cheng mutex_exit(&mac_srs->srs_lock); 28868275SEric Cheng return (cookie); 28878275SEric Cheng } 28888275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 28898275SEric Cheng if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 28908275SEric Cheng mac_srs->srs_bw->mac_bw_curr_time = lbolt; 28918275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 28928275SEric Cheng } 
else if (mac_srs->srs_bw->mac_bw_used > 28938275SEric Cheng mac_srs->srs_bw->mac_bw_limit) { 28948275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 28958275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 28968275SEric Cheng mp_chain, tail, cnt, sz); 28978275SEric Cheng /* 28988275SEric Cheng * Wakeup worker thread. Note that worker 28998275SEric Cheng * thread has to be woken up so that it 29008275SEric Cheng * can fire up the timer to be woken up 29018275SEric Cheng * on the next tick. Also once 29028275SEric Cheng * BW_ENFORCED is set, it can only be 29038275SEric Cheng * reset by srs_worker thread. Until then 29048275SEric Cheng * all packets will get queued up in SRS 29058275SEric Cheng * and hence this this code path won't be 29068275SEric Cheng * entered until BW_ENFORCED is reset. 29078275SEric Cheng */ 29088275SEric Cheng cv_signal(&mac_srs->srs_async); 29098275SEric Cheng mutex_exit(&mac_srs->srs_lock); 29108275SEric Cheng return (cookie); 29118275SEric Cheng } 29128275SEric Cheng 29138275SEric Cheng mac_srs->srs_bw->mac_bw_used += sz; 29148275SEric Cheng mutex_exit(&mac_srs->srs_lock); 29158275SEric Cheng 29168275SEric Cheng if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 29178275SEric Cheng mac_soft_ring_t *softring; 29188275SEric Cheng uint_t indx, hash; 29198275SEric Cheng 29208275SEric Cheng hash = HASH_HINT(fanout_hint); 29218275SEric Cheng indx = COMPUTE_INDEX(hash, 29228275SEric Cheng mac_srs->srs_oth_ring_count); 29238275SEric Cheng softring = mac_srs->srs_oth_soft_rings[indx]; 29248275SEric Cheng return (mac_tx_soft_ring_process(softring, mp_chain, flag, 29258275SEric Cheng ret_mp)); 29268275SEric Cheng } else { 29278275SEric Cheng boolean_t is_subflow; 29288275SEric Cheng mac_tx_stats_t stats; 29298275SEric Cheng 29308275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 29318275SEric Cheng 29328275SEric Cheng mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 29338275SEric Cheng mp_chain, (is_subflow ? 
&stats : NULL)); 29348275SEric Cheng 29358275SEric Cheng if (mp_chain != NULL) { 29368275SEric Cheng mutex_enter(&mac_srs->srs_lock); 29378275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 29388275SEric Cheng if (mac_srs->srs_bw->mac_bw_used > sz) 29398275SEric Cheng mac_srs->srs_bw->mac_bw_used -= sz; 29408275SEric Cheng else 29418275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 29428275SEric Cheng cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 29438275SEric Cheng fanout_hint, ret_mp); 29448275SEric Cheng mutex_exit(&mac_srs->srs_lock); 29458275SEric Cheng return (cookie); 29468275SEric Cheng } 29478275SEric Cheng if (is_subflow) 29488275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 29498275SEric Cheng 29508275SEric Cheng return (NULL); 29518275SEric Cheng } 29528275SEric Cheng } 29538275SEric Cheng 29548275SEric Cheng /* ARGSUSED */ 29558275SEric Cheng void 29568275SEric Cheng mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 29578275SEric Cheng { 29588275SEric Cheng mblk_t *head, *tail; 29598275SEric Cheng size_t sz; 29608275SEric Cheng uint32_t tx_mode; 29618275SEric Cheng uint_t saved_pkt_count; 29628275SEric Cheng boolean_t is_subflow; 29638275SEric Cheng mac_tx_stats_t stats; 29648275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 29658275SEric Cheng 29668275SEric Cheng saved_pkt_count = 0; 29678275SEric Cheng ASSERT(mutex_owned(&mac_srs->srs_lock)); 29688275SEric Cheng ASSERT(!(mac_srs->srs_state & SRS_PROC)); 29698275SEric Cheng 29708275SEric Cheng mac_srs->srs_state |= SRS_PROC; 29718275SEric Cheng 29728275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 29738275SEric Cheng tx_mode = srs_tx->st_mode; 29748275SEric Cheng if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 29758275SEric Cheng if (mac_srs->srs_first != NULL) { 29768275SEric Cheng head = mac_srs->srs_first; 29778275SEric Cheng tail = mac_srs->srs_last; 29788275SEric Cheng saved_pkt_count = 
mac_srs->srs_count; 29798275SEric Cheng mac_srs->srs_first = NULL; 29808275SEric Cheng mac_srs->srs_last = NULL; 29818275SEric Cheng mac_srs->srs_count = 0; 29828275SEric Cheng mutex_exit(&mac_srs->srs_lock); 29838275SEric Cheng 29848275SEric Cheng head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 29858275SEric Cheng head, &stats); 29868275SEric Cheng 29878275SEric Cheng mutex_enter(&mac_srs->srs_lock); 29888275SEric Cheng if (head != NULL) { 29898275SEric Cheng /* Device out of tx desc, set block */ 29908275SEric Cheng if (head->b_next == NULL) 29918275SEric Cheng VERIFY(head == tail); 29928275SEric Cheng tail->b_next = mac_srs->srs_first; 29938275SEric Cheng mac_srs->srs_first = head; 29948275SEric Cheng mac_srs->srs_count += 29958275SEric Cheng (saved_pkt_count - stats.ts_opackets); 29968275SEric Cheng if (mac_srs->srs_last == NULL) 29978275SEric Cheng mac_srs->srs_last = tail; 29988275SEric Cheng MAC_TX_SRS_BLOCK(mac_srs, head); 29998275SEric Cheng } else { 30008275SEric Cheng srs_tx->st_woken_up = B_FALSE; 30018275SEric Cheng if (is_subflow) { 30028275SEric Cheng FLOW_TX_STATS_UPDATE( 30038275SEric Cheng mac_srs->srs_flent, &stats); 30048275SEric Cheng } 30058275SEric Cheng } 30068275SEric Cheng } 30078275SEric Cheng } else if (tx_mode == SRS_TX_BW) { 30088275SEric Cheng /* 30098275SEric Cheng * We are here because the timer fired and we have some data 30108275SEric Cheng * to tranmit. 
Also mac_tx_srs_worker should have reset 30118275SEric Cheng * SRS_BW_ENFORCED flag 30128275SEric Cheng */ 30138275SEric Cheng ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 30148275SEric Cheng head = tail = mac_srs->srs_first; 30158275SEric Cheng while (mac_srs->srs_first != NULL) { 30168275SEric Cheng tail = mac_srs->srs_first; 30178275SEric Cheng tail->b_prev = NULL; 30188275SEric Cheng mac_srs->srs_first = tail->b_next; 30198275SEric Cheng if (mac_srs->srs_first == NULL) 30208275SEric Cheng mac_srs->srs_last = NULL; 30218275SEric Cheng mac_srs->srs_count--; 30228275SEric Cheng sz = msgdsize(tail); 30238275SEric Cheng mac_srs->srs_size -= sz; 30248275SEric Cheng saved_pkt_count++; 30258275SEric Cheng MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 30268275SEric Cheng 30278275SEric Cheng if (mac_srs->srs_bw->mac_bw_used < 30288275SEric Cheng mac_srs->srs_bw->mac_bw_limit) 30298275SEric Cheng continue; 30308275SEric Cheng 30318275SEric Cheng if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 30328275SEric Cheng mac_srs->srs_bw->mac_bw_curr_time = lbolt; 30338275SEric Cheng mac_srs->srs_bw->mac_bw_used = sz; 30348275SEric Cheng continue; 30358275SEric Cheng } 30368275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 30378275SEric Cheng break; 30388275SEric Cheng } 30398275SEric Cheng 30408275SEric Cheng ASSERT((head == NULL && tail == NULL) || 30418275SEric Cheng (head != NULL && tail != NULL)); 30428275SEric Cheng if (tail != NULL) { 30438275SEric Cheng tail->b_next = NULL; 30448275SEric Cheng mutex_exit(&mac_srs->srs_lock); 30458275SEric Cheng 30468275SEric Cheng head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 30478275SEric Cheng head, &stats); 30488275SEric Cheng 30498275SEric Cheng mutex_enter(&mac_srs->srs_lock); 30508275SEric Cheng if (head != NULL) { 30518275SEric Cheng uint_t size_sent; 30528275SEric Cheng 30538275SEric Cheng /* Device out of tx desc, set block */ 30548275SEric Cheng if (head->b_next == NULL) 30558275SEric Cheng VERIFY(head 
== tail); 30568275SEric Cheng tail->b_next = mac_srs->srs_first; 30578275SEric Cheng mac_srs->srs_first = head; 30588275SEric Cheng mac_srs->srs_count += 30598275SEric Cheng (saved_pkt_count - stats.ts_opackets); 30608275SEric Cheng if (mac_srs->srs_last == NULL) 30618275SEric Cheng mac_srs->srs_last = tail; 30628275SEric Cheng size_sent = sz - stats.ts_obytes; 30638275SEric Cheng mac_srs->srs_size += size_sent; 30648275SEric Cheng mac_srs->srs_bw->mac_bw_sz += size_sent; 30658275SEric Cheng if (mac_srs->srs_bw->mac_bw_used > size_sent) { 30668275SEric Cheng mac_srs->srs_bw->mac_bw_used -= 30678275SEric Cheng size_sent; 30688275SEric Cheng } else { 30698275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 30708275SEric Cheng } 30718275SEric Cheng MAC_TX_SRS_BLOCK(mac_srs, head); 30728275SEric Cheng } else { 30738275SEric Cheng srs_tx->st_woken_up = B_FALSE; 30748275SEric Cheng if (is_subflow) { 30758275SEric Cheng FLOW_TX_STATS_UPDATE( 30768275SEric Cheng mac_srs->srs_flent, &stats); 30778275SEric Cheng } 30788275SEric Cheng } 30798275SEric Cheng } 30808275SEric Cheng } else if (tx_mode == SRS_TX_BW_FANOUT) { 30818275SEric Cheng mblk_t *prev; 30828275SEric Cheng mac_soft_ring_t *softring; 30838275SEric Cheng uint64_t hint; 30848275SEric Cheng 30858275SEric Cheng /* 30868275SEric Cheng * We are here because the timer fired and we 30878275SEric Cheng * have some quota to tranmit. 
30888275SEric Cheng */ 30898275SEric Cheng prev = NULL; 30908275SEric Cheng head = tail = mac_srs->srs_first; 30918275SEric Cheng while (mac_srs->srs_first != NULL) { 30928275SEric Cheng tail = mac_srs->srs_first; 30938275SEric Cheng mac_srs->srs_first = tail->b_next; 30948275SEric Cheng if (mac_srs->srs_first == NULL) 30958275SEric Cheng mac_srs->srs_last = NULL; 30968275SEric Cheng mac_srs->srs_count--; 30978275SEric Cheng sz = msgdsize(tail); 30988275SEric Cheng mac_srs->srs_size -= sz; 30998275SEric Cheng mac_srs->srs_bw->mac_bw_used += sz; 31008275SEric Cheng if (prev == NULL) 31018275SEric Cheng hint = (ulong_t)tail->b_prev; 31028275SEric Cheng if (hint != (ulong_t)tail->b_prev) { 31038275SEric Cheng prev->b_next = NULL; 31048275SEric Cheng mutex_exit(&mac_srs->srs_lock); 31058275SEric Cheng TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 31068275SEric Cheng head = tail; 31078275SEric Cheng hint = (ulong_t)tail->b_prev; 31088275SEric Cheng mutex_enter(&mac_srs->srs_lock); 31098275SEric Cheng } 31108275SEric Cheng 31118275SEric Cheng prev = tail; 31128275SEric Cheng tail->b_prev = NULL; 31138275SEric Cheng if (mac_srs->srs_bw->mac_bw_used < 31148275SEric Cheng mac_srs->srs_bw->mac_bw_limit) 31158275SEric Cheng continue; 31168275SEric Cheng 31178275SEric Cheng if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 31188275SEric Cheng mac_srs->srs_bw->mac_bw_curr_time = lbolt; 31198275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 31208275SEric Cheng continue; 31218275SEric Cheng } 31228275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 31238275SEric Cheng break; 31248275SEric Cheng } 31258275SEric Cheng ASSERT((head == NULL && tail == NULL) || 31268275SEric Cheng (head != NULL && tail != NULL)); 31278275SEric Cheng if (tail != NULL) { 31288275SEric Cheng tail->b_next = NULL; 31298275SEric Cheng mutex_exit(&mac_srs->srs_lock); 31308275SEric Cheng TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 31318275SEric Cheng mutex_enter(&mac_srs->srs_lock); 31328275SEric Cheng 
} 31338275SEric Cheng } 31348275SEric Cheng /* 31358275SEric Cheng * SRS_TX_FANOUT case not considered here because packets 31368275SEric Cheng * won't be queued in the SRS for this case. Packets will 31378275SEric Cheng * be sent directly to soft rings underneath and if there 31388275SEric Cheng * is any queueing at all, it would be in Tx side soft 31398275SEric Cheng * rings. 31408275SEric Cheng */ 31418275SEric Cheng 31428275SEric Cheng /* 31438275SEric Cheng * When srs_count becomes 0, reset SRS_TX_HIWAT and 31448275SEric Cheng * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 31458275SEric Cheng */ 31468275SEric Cheng if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 31478275SEric Cheng (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 31488275SEric Cheng mac_tx_notify_cb_t *mtnfp; 31498275SEric Cheng mac_cb_t *mcb; 31508275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 31518275SEric Cheng boolean_t wakeup_required = B_FALSE; 31528275SEric Cheng 31538275SEric Cheng if (mac_srs->srs_state & 31548275SEric Cheng (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 31558275SEric Cheng wakeup_required = B_TRUE; 31568275SEric Cheng } 31578275SEric Cheng mac_srs->srs_state &= ~(SRS_TX_HIWAT | 31588275SEric Cheng SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 31598275SEric Cheng mutex_exit(&mac_srs->srs_lock); 31608275SEric Cheng if (wakeup_required) { 31618275SEric Cheng /* Wakeup callback registered clients */ 31628275SEric Cheng MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 31638275SEric Cheng for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 31648275SEric Cheng mcb = mcb->mcb_nextp) { 31658275SEric Cheng mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 31668275SEric Cheng mtnfp->mtnf_fn(mtnfp->mtnf_arg, 31678275SEric Cheng (mac_tx_cookie_t)mac_srs); 31688275SEric Cheng } 31698275SEric Cheng MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 31708275SEric Cheng &mcip->mci_tx_notify_cb_list); 31718275SEric Cheng /* 31728275SEric Cheng * If the client is 
not the primary MAC client, then we 31738275SEric Cheng * need to send the notification to the clients upper 31748275SEric Cheng * MAC, i.e. mci_upper_mip. 31758275SEric Cheng */ 31768275SEric Cheng mac_tx_notify(mcip->mci_upper_mip != NULL ? 31778275SEric Cheng mcip->mci_upper_mip : mcip->mci_mip); 31788275SEric Cheng } 31798275SEric Cheng mutex_enter(&mac_srs->srs_lock); 31808275SEric Cheng } 31818275SEric Cheng mac_srs->srs_state &= ~SRS_PROC; 31828275SEric Cheng } 31838275SEric Cheng 31848275SEric Cheng /* 31858275SEric Cheng * Given a packet, get the flow_entry that identifies the flow 31868275SEric Cheng * to which that packet belongs. The flow_entry will contain 31878275SEric Cheng * the transmit function to be used to send the packet. If the 31888275SEric Cheng * function returns NULL, the packet should be sent using the 31898275SEric Cheng * underlying NIC. 31908275SEric Cheng */ 31918275SEric Cheng static flow_entry_t * 31928275SEric Cheng mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 31938275SEric Cheng { 31948275SEric Cheng flow_entry_t *flent = NULL; 31958275SEric Cheng mac_client_impl_t *mcip; 31968275SEric Cheng int err; 31978275SEric Cheng 31988275SEric Cheng /* 31998275SEric Cheng * Do classification on the packet. 32008275SEric Cheng */ 32018275SEric Cheng err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 32028275SEric Cheng if (err != 0) 32038275SEric Cheng return (NULL); 32048275SEric Cheng 32058275SEric Cheng /* 32068275SEric Cheng * This flent might just be an additional one on the MAC client, 32078275SEric Cheng * i.e. for classification purposes (different fdesc), however 32088275SEric Cheng * the resources, SRS et. al., are in the mci_flent, so if 32098275SEric Cheng * this isn't the mci_flent, we need to get it. 
32108275SEric Cheng */ 32118275SEric Cheng if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 32128275SEric Cheng FLOW_REFRELE(flent); 32138275SEric Cheng flent = mcip->mci_flent; 32148275SEric Cheng FLOW_TRY_REFHOLD(flent, err); 32158275SEric Cheng if (err != 0) 32168275SEric Cheng return (NULL); 32178275SEric Cheng } 32188275SEric Cheng 32198275SEric Cheng return (flent); 32208275SEric Cheng } 32218275SEric Cheng 32228275SEric Cheng /* 32238275SEric Cheng * This macro is only meant to be used by mac_tx_send(). 32248275SEric Cheng */ 32258275SEric Cheng #define CHECK_VID_AND_ADD_TAG(mp) { \ 32268275SEric Cheng if (vid_check) { \ 32278275SEric Cheng int err = 0; \ 32288275SEric Cheng \ 32298275SEric Cheng MAC_VID_CHECK(src_mcip, (mp), err); \ 32308275SEric Cheng if (err != 0) { \ 32318275SEric Cheng freemsg((mp)); \ 32328275SEric Cheng (mp) = next; \ 32338275SEric Cheng oerrors++; \ 32348275SEric Cheng continue; \ 32358275SEric Cheng } \ 32368275SEric Cheng } \ 32378275SEric Cheng if (add_tag) { \ 32388275SEric Cheng (mp) = mac_add_vlan_tag((mp), 0, vid); \ 32398275SEric Cheng if ((mp) == NULL) { \ 32408275SEric Cheng (mp) = next; \ 32418275SEric Cheng oerrors++; \ 32428275SEric Cheng continue; \ 32438275SEric Cheng } \ 32448275SEric Cheng } \ 32458275SEric Cheng } 32468275SEric Cheng 32478275SEric Cheng mblk_t * 32488275SEric Cheng mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 32498275SEric Cheng mac_tx_stats_t *stats) 32508275SEric Cheng { 32518275SEric Cheng mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 32528275SEric Cheng mac_impl_t *mip = src_mcip->mci_mip; 32538275SEric Cheng uint_t obytes = 0, opackets = 0, oerrors = 0; 32548275SEric Cheng mblk_t *mp = NULL, *next; 32558275SEric Cheng boolean_t vid_check, add_tag; 32568275SEric Cheng uint16_t vid = 0; 32578275SEric Cheng 32588275SEric Cheng if (mip->mi_nclients > 1) { 32598275SEric Cheng vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 32608275SEric Cheng 
add_tag = MAC_TAG_NEEDED(src_mcip); 32618275SEric Cheng if (add_tag) 32628275SEric Cheng vid = mac_client_vid(mch); 32638275SEric Cheng } else { 32648275SEric Cheng ASSERT(mip->mi_nclients == 1); 32658275SEric Cheng vid_check = add_tag = B_FALSE; 32668275SEric Cheng } 32678275SEric Cheng 32688275SEric Cheng /* 32698275SEric Cheng * Fastpath: if there's only one client, and there's no 32708275SEric Cheng * multicast listeners, we simply send the packet down to the 32718275SEric Cheng * underlying NIC. 32728275SEric Cheng */ 32738275SEric Cheng if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 32748275SEric Cheng DTRACE_PROBE2(fastpath, 32758275SEric Cheng mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 32768275SEric Cheng 32778275SEric Cheng mp = mp_chain; 32788275SEric Cheng while (mp != NULL) { 32798275SEric Cheng next = mp->b_next; 32808275SEric Cheng mp->b_next = NULL; 32818275SEric Cheng opackets++; 32828275SEric Cheng obytes += (mp->b_cont == NULL ? MBLKL(mp) : 32838275SEric Cheng msgdsize(mp)); 32848275SEric Cheng 32858275SEric Cheng CHECK_VID_AND_ADD_TAG(mp); 328610491SRishi.Srivatsavai@Sun.COM MAC_TX(mip, ring, mp, 328710491SRishi.Srivatsavai@Sun.COM ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 328810491SRishi.Srivatsavai@Sun.COM 0)); 32898275SEric Cheng 32908275SEric Cheng /* 32918275SEric Cheng * If the driver is out of descriptors and does a 32928275SEric Cheng * partial send it will return a chain of unsent 32938275SEric Cheng * mblks. Adjust the accounting stats. 
32948275SEric Cheng */ 32958275SEric Cheng if (mp != NULL) { 32968275SEric Cheng opackets--; 32978275SEric Cheng obytes -= msgdsize(mp); 32988275SEric Cheng mp->b_next = next; 32998275SEric Cheng break; 33008275SEric Cheng } 33018275SEric Cheng mp = next; 33028275SEric Cheng } 33038275SEric Cheng goto done; 33048275SEric Cheng } 33058275SEric Cheng 33068275SEric Cheng /* 33078275SEric Cheng * No fastpath, we either have more than one MAC client 33088275SEric Cheng * defined on top of the same MAC, or one or more MAC 33098275SEric Cheng * client promiscuous callbacks. 33108275SEric Cheng */ 33118275SEric Cheng DTRACE_PROBE3(slowpath, mac_client_impl_t *, 33128275SEric Cheng src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 33138275SEric Cheng 33148275SEric Cheng mp = mp_chain; 33158275SEric Cheng while (mp != NULL) { 33168275SEric Cheng flow_entry_t *dst_flow_ent; 33178275SEric Cheng void *flow_cookie; 33188275SEric Cheng size_t pkt_size; 33198275SEric Cheng mblk_t *mp1; 33208275SEric Cheng 33218275SEric Cheng next = mp->b_next; 33228275SEric Cheng mp->b_next = NULL; 33238275SEric Cheng opackets++; 33248275SEric Cheng pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); 33258275SEric Cheng obytes += pkt_size; 33268275SEric Cheng CHECK_VID_AND_ADD_TAG(mp); 33278275SEric Cheng 33288275SEric Cheng /* 33298833SVenu.Iyer@Sun.COM * Check if there are promiscuous mode callbacks defined. 33308833SVenu.Iyer@Sun.COM */ 33318833SVenu.Iyer@Sun.COM if (mip->mi_promisc_list != NULL) 33328833SVenu.Iyer@Sun.COM mac_promisc_dispatch(mip, mp, src_mcip); 33338833SVenu.Iyer@Sun.COM 33348833SVenu.Iyer@Sun.COM /* 33358275SEric Cheng * Find the destination. 
33368275SEric Cheng */ 33378275SEric Cheng dst_flow_ent = mac_tx_classify(mip, mp); 33388275SEric Cheng 33398275SEric Cheng if (dst_flow_ent != NULL) { 33408275SEric Cheng size_t hdrsize; 33418275SEric Cheng int err = 0; 33428275SEric Cheng 33438275SEric Cheng if (mip->mi_info.mi_nativemedia == DL_ETHER) { 33448275SEric Cheng struct ether_vlan_header *evhp = 33458275SEric Cheng (struct ether_vlan_header *)mp->b_rptr; 33468275SEric Cheng 33478275SEric Cheng if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 33488275SEric Cheng hdrsize = sizeof (*evhp); 33498275SEric Cheng else 33508275SEric Cheng hdrsize = sizeof (struct ether_header); 33518275SEric Cheng } else { 33528275SEric Cheng mac_header_info_t mhi; 33538275SEric Cheng 33548275SEric Cheng err = mac_header_info((mac_handle_t)mip, 33558275SEric Cheng mp, &mhi); 33568275SEric Cheng if (err == 0) 33578275SEric Cheng hdrsize = mhi.mhi_hdrsize; 33588275SEric Cheng } 33598275SEric Cheng 33608275SEric Cheng /* 33618275SEric Cheng * Got a matching flow. It's either another 33628275SEric Cheng * MAC client, or a broadcast/multicast flow. 33638275SEric Cheng * Make sure the packet size is within the 33648275SEric Cheng * allowed size. If not drop the packet and 33658275SEric Cheng * move to next packet. 33668275SEric Cheng */ 33678275SEric Cheng if (err != 0 || 33688275SEric Cheng (pkt_size - hdrsize) > mip->mi_sdu_max) { 33698275SEric Cheng oerrors++; 33708275SEric Cheng DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 33718275SEric Cheng mblk_t *, mp); 33728275SEric Cheng freemsg(mp); 33738275SEric Cheng mp = next; 33748275SEric Cheng FLOW_REFRELE(dst_flow_ent); 33758275SEric Cheng continue; 33768275SEric Cheng } 33778275SEric Cheng flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 33788275SEric Cheng if (flow_cookie != NULL) { 33798275SEric Cheng /* 33808275SEric Cheng * The vnic_bcast_send function expects 33818275SEric Cheng * to receive the sender MAC client 33828275SEric Cheng * as value for arg2. 
33838275SEric Cheng */ 33848275SEric Cheng mac_bcast_send(flow_cookie, src_mcip, mp, 33858275SEric Cheng B_TRUE); 33868275SEric Cheng } else { 33878275SEric Cheng /* 33888275SEric Cheng * loopback the packet to a 33898275SEric Cheng * local MAC client. We force a context 33908275SEric Cheng * switch if both source and destination 33918275SEric Cheng * MAC clients are used by IP, i.e. bypass 33928275SEric Cheng * is set. 33938275SEric Cheng */ 33948275SEric Cheng boolean_t do_switch; 33958275SEric Cheng mac_client_impl_t *dst_mcip = 33968275SEric Cheng dst_flow_ent->fe_mcip; 33978275SEric Cheng 33988275SEric Cheng do_switch = ((src_mcip->mci_state_flags & 33998275SEric Cheng dst_mcip->mci_state_flags & 34008275SEric Cheng MCIS_CLIENT_POLL_CAPABLE) != 0); 34018275SEric Cheng 34028275SEric Cheng if ((mp1 = mac_fix_cksum(mp)) != NULL) { 34038275SEric Cheng (dst_flow_ent->fe_cb_fn)( 34048275SEric Cheng dst_flow_ent->fe_cb_arg1, 34058275SEric Cheng dst_flow_ent->fe_cb_arg2, 34068275SEric Cheng mp1, do_switch); 34078275SEric Cheng } 34088275SEric Cheng } 34098275SEric Cheng FLOW_REFRELE(dst_flow_ent); 34108275SEric Cheng } else { 34118275SEric Cheng /* 34128275SEric Cheng * Unknown destination, send via the underlying 34138275SEric Cheng * NIC. 
34148275SEric Cheng */ 341510491SRishi.Srivatsavai@Sun.COM MAC_TX(mip, ring, mp, 341610491SRishi.Srivatsavai@Sun.COM ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) != 341710491SRishi.Srivatsavai@Sun.COM 0)); 34188275SEric Cheng if (mp != NULL) { 34198275SEric Cheng /* 34208275SEric Cheng * Adjust for the last packet that 34218275SEric Cheng * could not be transmitted 34228275SEric Cheng */ 34238275SEric Cheng opackets--; 34248275SEric Cheng obytes -= pkt_size; 34258275SEric Cheng mp->b_next = next; 34268275SEric Cheng break; 34278275SEric Cheng } 34288275SEric Cheng } 34298275SEric Cheng mp = next; 34308275SEric Cheng } 34318275SEric Cheng 34328275SEric Cheng done: 34338275SEric Cheng src_mcip->mci_stat_obytes += obytes; 34348275SEric Cheng src_mcip->mci_stat_opackets += opackets; 34358275SEric Cheng src_mcip->mci_stat_oerrors += oerrors; 34368275SEric Cheng 34378275SEric Cheng if (stats != NULL) { 34388275SEric Cheng stats->ts_opackets = opackets; 34398275SEric Cheng stats->ts_obytes = obytes; 34408275SEric Cheng stats->ts_oerrors = oerrors; 34418275SEric Cheng } 34428275SEric Cheng return (mp); 34438275SEric Cheng } 34448275SEric Cheng 34458275SEric Cheng /* 34468275SEric Cheng * mac_tx_srs_ring_present 34478275SEric Cheng * 34488275SEric Cheng * Returns whether the specified ring is part of the specified SRS. 
 */
boolean_t
mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
{
	int i;
	mac_soft_ring_t *soft_ring;

	/* Single-ring case: the SRS's own Tx ring is kept in st_arg2. */
	if (srs->srs_tx.st_arg2 == tx_ring)
		return (B_TRUE);

	/* Fanout case: scan the soft rings hanging off this SRS. */
	for (i = 0; i < srs->srs_oth_ring_count; i++) {
		soft_ring = srs->srs_oth_soft_rings[i];
		if (soft_ring->s_ring_tx_arg2 == tx_ring)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * mac_tx_srs_wakeup
 *
 * Called when Tx desc become available. Wakeup the appropriate worker
 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
 * state field.
 */
void
mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
{
	int i;
	mac_soft_ring_t *sringp;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;

	mutex_enter(&mac_srs->srs_lock);
	if (TX_SINGLE_RING_MODE(mac_srs)) {
		if (srs_tx->st_arg2 == ring &&
		    mac_srs->srs_state & SRS_TX_BLOCKED) {
			mac_srs->srs_state &= ~SRS_TX_BLOCKED;
			srs_tx->st_unblocked_cnt++;
			cv_signal(&mac_srs->srs_async);
		}
		/*
		 * A wakeup can come before tx_srs_drain() could
		 * grab srs lock and set SRS_TX_BLOCKED. So
		 * always set woken_up flag when we come here.
		 */
		srs_tx->st_woken_up = B_TRUE;
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	/* If you are here, it is for FANOUT or BW_FANOUT case */
	ASSERT(TX_MULTI_RING_MODE(mac_srs));
	for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
		sringp = mac_srs->srs_oth_soft_rings[i];
		mutex_enter(&sringp->s_ring_lock);
		if (sringp->s_ring_tx_arg2 == ring) {
			if (sringp->s_ring_state & S_RING_BLOCK) {
				sringp->s_ring_state &= ~S_RING_BLOCK;
				sringp->s_ring_unblocked_cnt++;
				cv_signal(&sringp->s_ring_async);
			}
			/*
			 * Mirror of the single-ring case above: record the
			 * wakeup even if the BLOCK bit was not yet set.
			 */
			sringp->s_ring_tx_woken_up = B_TRUE;
		}
		mutex_exit(&sringp->s_ring_lock);
	}
	mutex_exit(&mac_srs->srs_lock);
}

/*
 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
 * the blocked clients again.
 */
void
mac_tx_notify(mac_impl_t *mip)
{
	i_mac_notify(mip, MAC_NOTE_TX);
}

/*
 * RX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and here for
 * a short period.
 */

#define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	/*								\
	 * Enqueue our mblk chain.					\
	 */								\
	ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock));			\
									\
	if ((ringp)->s_ring_last != NULL)				\
		(ringp)->s_ring_last->b_next = (mp);			\
	else								\
		(ringp)->s_ring_first = (mp);				\
	(ringp)->s_ring_last = (tail);					\
	(ringp)->s_ring_count += (cnt);					\
	ASSERT((ringp)->s_ring_count > 0);				\
	if ((ringp)->s_ring_type & ST_RING_BW_CTL) {			\
		(ringp)->s_ring_size += sz;				\
	}								\
}

/*
 * Default entry point to deliver a packet chain to a MAC client.
 * If the MAC client has flows, do the classification with these
 * flows as well.
 */
/* ARGSUSED */
void
mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain,
    mac_header_info_t *arg3)
{
	mac_client_impl_t *mcip = arg1;

	if (mcip->mci_nvids == 1 &&
	    !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) {
		/*
		 * If the client has exactly one VID associated with it
		 * and striping of VLAN header is not disabled,
		 * remove the VLAN tag from the packet before
		 * passing it on to the client's receive callback.
		 * Note that this needs to be done after we dispatch
		 * the packet to the promiscuous listeners of the
		 * client, since they expect to see the whole
		 * frame including the VLAN headers.
		 */
		mp_chain = mac_strip_vlan_tag_chain(mp_chain);
	}

	mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE);
}

/*
 * mac_rx_soft_ring_process
 *
 * Process a chain for a given soft ring. If the number of packets queued
 * in the SRS and its associated soft rings (including this one) is
 * very small (tracked by srs_poll_pkt_cnt), then allow the entering
 * thread (interrupt or poll thread) to do inline processing. This
 * helps keep the latency down under low load.
 *
 * The proc and arg for each mblk is already stored in the mblk in
 * appropriate places.
 */
/* ARGSUSED */
void
mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp,
    mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz)
{
	mac_direct_rx_t proc;
	void *arg1;
	mac_resource_handle_t arg2;
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(tail != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));

	mutex_enter(&ringp->s_ring_lock);
	ringp->s_ring_total_inpkt += cnt;
	if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) &&
	    !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) {
		/* If on processor or blanking on, then enqueue and return */
		if (ringp->s_ring_state & S_RING_BLANK ||
		    ringp->s_ring_state & S_RING_PROC) {
			SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
			mutex_exit(&ringp->s_ring_lock);
			return;
		}
		proc = ringp->s_ring_rx_func;
		arg1 = ringp->s_ring_rx_arg1;
		arg2 = ringp->s_ring_rx_arg2;
		/*
		 * See if anything is already queued. If we are the
		 * first packet, do inline processing else queue the
		 * packet and do the drain.
		 */
		if (ringp->s_ring_first == NULL) {
			/*
			 * Fast-path, ok to process and nothing queued.
			 */
			ringp->s_ring_run = curthread;
			ringp->s_ring_state |= (S_RING_PROC);

			mutex_exit(&ringp->s_ring_lock);

			/*
			 * We are the chain of 1 packet so
			 * go through this fast path.
			 */
			ASSERT(mp_chain->b_next == NULL);

			(*proc)(arg1, arg2, mp_chain, NULL);

			ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
			/*
			 * If we have a soft ring set which is doing
			 * bandwidth control, we need to decrement
			 * srs_size and count so it the SRS can have a
			 * accurate idea of what is the real data
			 * queued between SRS and its soft rings. We
			 * decrement the counters only when the packet
			 * gets processed by both SRS and the soft ring.
			 */
			mutex_enter(&mac_srs->srs_lock);
			MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
			MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);
			mutex_exit(&mac_srs->srs_lock);

			mutex_enter(&ringp->s_ring_lock);
			ringp->s_ring_run = NULL;
			ringp->s_ring_state &= ~S_RING_PROC;
			if (ringp->s_ring_state & S_RING_CLIENT_WAIT)
				cv_signal(&ringp->s_ring_client_cv);

			if ((ringp->s_ring_first == NULL) ||
			    (ringp->s_ring_state & S_RING_BLANK)) {
				/*
				 * We processed inline our packet and
				 * nothing new has arrived or our
				 * receiver doesn't want to receive
				 * any packets. We are done.
				 */
				mutex_exit(&ringp->s_ring_lock);
				return;
			}
		} else {
			SOFT_RING_ENQUEUE_CHAIN(ringp,
			    mp_chain, tail, cnt, sz);
		}

		/*
		 * We are here because either we couldn't do inline
		 * processing (because something was already
		 * queued), or we had a chain of more than one
		 * packet, or something else arrived after we were
		 * done with inline processing.
		 */
		ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
		ASSERT(ringp->s_ring_first != NULL);

		ringp->s_ring_drain_func(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return;
	} else {
		/* ST_RING_WORKER_ONLY case */
		SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
	}
}

/*
 * TX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and here for
 * a short period.
 */

/*
 * NOTE(review): the macro parameter is named "mp" but the body passes
 * "mp_chain" to SOFT_RING_ENQUEUE_CHAIN; this only works because every
 * caller happens to invoke the macro with a local named mp_chain.
 * Consider renaming the parameter (or using it) — confirm before changing.
 */
#define	TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {		\
	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));			\
	ringp->s_ring_state |= S_RING_ENQUEUED;				\
	SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);	\
}

/*
 * mac_tx_sring_enqueue
 *
 * When we are out of transmit descriptors and we already have a
 * queue that exceeds hiwat (or the client called us with
 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
 * soft ring pointer as the opaque cookie for the client to enable
 * flow control.
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	mac_tx_cookie_t cookie = NULL;
	boolean_t wakeup_worker = B_TRUE;

	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		/* Caller asked us to drop rather than queue. */
		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
		/* increment freed stats */
		ringp->s_ring_drops += cnt;
		cookie = (mac_tx_cookie_t)ringp;
	} else {
		/*
		 * If something is already queued, the worker thread is
		 * already active; no need to signal it again below.
		 */
		if (ringp->s_ring_first != NULL)
			wakeup_worker = B_FALSE;

		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If QUEUED is not set, queue the packet
			 * and let mac_tx_soft_ring_drain() set
			 * the TX_BLOCKED bit for the reasons
			 * explained above. Otherwise, return the
			 * mblks.
			 */
			if (wakeup_worker) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
				    mp_chain, tail, cnt, sz);
			} else {
				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
				cookie = (mac_tx_cookie_t)ringp;
				*ret_mp = mp_chain;
			}
		} else {
			boolean_t enqueue = B_TRUE;

			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
				/*
				 * flow-controlled. Store ringp in cookie
				 * so that it can be returned as
				 * mac_tx_cookie_t to client
				 */
				ringp->s_ring_state |= S_RING_TX_HIWAT;
				cookie = (mac_tx_cookie_t)ringp;
				ringp->s_ring_hiwat_cnt++;
				if (ringp->s_ring_count >
				    ringp->s_ring_tx_max_q_cnt) {
					/* increment freed stats */
					ringp->s_ring_drops += cnt;
					/*
					 * b_prev may be set to the fanout hint
					 * hence can't use freemsg directly
					 */
					mac_pkt_drop(NULL, NULL,
					    mp_chain, B_FALSE);
					DTRACE_PROBE1(tx_queued_hiwat,
					    mac_soft_ring_t *, ringp);
					enqueue = B_FALSE;
				}
			}
			if (enqueue) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
				    tail, cnt, sz);
			}
		}
		if (wakeup_worker)
			cv_signal(&ringp->s_ring_async);
	}
	return (cookie);
}


/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_tx_cookie_t cookie = NULL;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
	/*
	 * Only two modes can come here; either it can be
	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT
	 */
	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
		/* Serialization mode */

		mutex_enter(&ringp->s_ring_lock);
		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
			    flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
			/*
			 * If ring is blocked due to lack of Tx
			 * descs, just return. Worker thread
			 * will get scheduled when Tx desc's
			 * become available.
			 */
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return (cookie);
	} else {
		/* Default fanout mode */
		/*
		 * S_RING_BLOCKED is set when underlying NIC runs
		 * out of Tx descs and messages start getting
		 * queued. It won't get reset until
		 * tx_srs_drain() completely drains out the
		 * messages.
		 */
		boolean_t is_subflow;
		mac_tx_stats_t stats;

		if (ringp->s_ring_state & S_RING_ENQUEUED) {
			/* Tx descs/resources not available */
			mutex_enter(&ringp->s_ring_lock);
			/* Re-check under the lock before enqueueing. */
			if (ringp->s_ring_state & S_RING_ENQUEUED) {
				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
				    flag, ret_mp);
				mutex_exit(&ringp->s_ring_lock);
				return (cookie);
			}
			/*
			 * While we were computing mblk count, the
			 * flow control condition got relieved.
			 * Continue with the transmission.
			 */
			mutex_exit(&ringp->s_ring_lock);
		}
		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
		    ringp->s_ring_tx_arg2, mp_chain,
		    (is_subflow ? &stats : NULL));

		/*
		 * Multiple threads could be here sending packets.
		 * Under such conditions, it is not possible to
		 * atomically set S_RING_BLOCKED bit to indicate
		 * out of tx desc condition. To atomically set
		 * this, we queue the returned packet and do
		 * the setting of S_RING_BLOCKED in
		 * mac_tx_soft_ring_drain().
		 */
		if (mp_chain != NULL) {
			mutex_enter(&ringp->s_ring_lock);
			cookie =
			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		if (is_subflow) {
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
		}
		return (NULL);
	}
}