18275SEric Cheng /* 28275SEric Cheng * CDDL HEADER START 38275SEric Cheng * 48275SEric Cheng * The contents of this file are subject to the terms of the 58275SEric Cheng * Common Development and Distribution License (the "License"). 68275SEric Cheng * You may not use this file except in compliance with the License. 78275SEric Cheng * 88275SEric Cheng * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 98275SEric Cheng * or http://www.opensolaris.org/os/licensing. 108275SEric Cheng * See the License for the specific language governing permissions 118275SEric Cheng * and limitations under the License. 128275SEric Cheng * 138275SEric Cheng * When distributing Covered Code, include this CDDL HEADER in each 148275SEric Cheng * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 158275SEric Cheng * If applicable, add the following below this CDDL HEADER, with the 168275SEric Cheng * fields enclosed by brackets "[]" replaced with your own identifying 178275SEric Cheng * information: Portions Copyright [yyyy] [name of copyright owner] 188275SEric Cheng * 198275SEric Cheng * CDDL HEADER END 208275SEric Cheng */ 218275SEric Cheng /* 228833SVenu.Iyer@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 238275SEric Cheng * Use is subject to license terms. 
248275SEric Cheng */ 258275SEric Cheng 268275SEric Cheng #include <sys/types.h> 278275SEric Cheng #include <sys/callb.h> 288275SEric Cheng #include <sys/sdt.h> 298275SEric Cheng #include <sys/strsubr.h> 308275SEric Cheng #include <sys/strsun.h> 318275SEric Cheng #include <sys/vlan.h> 328275SEric Cheng #include <inet/ipsec_impl.h> 338275SEric Cheng #include <inet/ip_impl.h> 348275SEric Cheng #include <inet/sadb.h> 358275SEric Cheng #include <inet/ipsecesp.h> 368275SEric Cheng #include <inet/ipsecah.h> 378275SEric Cheng #include <inet/ip6.h> 388275SEric Cheng 398275SEric Cheng #include <sys/mac_impl.h> 408275SEric Cheng #include <sys/mac_client_impl.h> 418275SEric Cheng #include <sys/mac_client_priv.h> 428275SEric Cheng #include <sys/mac_soft_ring.h> 438275SEric Cheng #include <sys/mac_flow_impl.h> 448275SEric Cheng 458275SEric Cheng static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *, 468275SEric Cheng uintptr_t, uint16_t, mblk_t **); 478275SEric Cheng static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *, 488275SEric Cheng uintptr_t, uint16_t, mblk_t **); 498275SEric Cheng static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *, 508275SEric Cheng uintptr_t, uint16_t, mblk_t **); 518275SEric Cheng static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *, 528275SEric Cheng uintptr_t, uint16_t, mblk_t **); 538275SEric Cheng 548275SEric Cheng typedef struct mac_tx_mode_s { 558275SEric Cheng mac_tx_srs_mode_t mac_tx_mode; 568275SEric Cheng mac_tx_func_t mac_tx_func; 578275SEric Cheng } mac_tx_mode_t; 588275SEric Cheng 598275SEric Cheng /* 608275SEric Cheng * There are five modes of operation on the Tx side. These modes get set 618275SEric Cheng * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode, 628275SEric Cheng * none of the other modes are user configurable. 
They get selected by 638275SEric Cheng * the system depending upon whether the link (or flow) has multiple Tx 648275SEric Cheng * rings or a bandwidth configured, etc. 658275SEric Cheng */ 668275SEric Cheng mac_tx_mode_t mac_tx_mode_list[] = { 678275SEric Cheng {SRS_TX_DEFAULT, mac_tx_single_ring_mode}, 688275SEric Cheng {SRS_TX_SERIALIZE, mac_tx_serializer_mode}, 698275SEric Cheng {SRS_TX_FANOUT, mac_tx_fanout_mode}, 708275SEric Cheng {SRS_TX_BW, mac_tx_bw_mode}, 718275SEric Cheng {SRS_TX_BW_FANOUT, mac_tx_bw_mode} 728275SEric Cheng }; 738275SEric Cheng 748275SEric Cheng /* 758275SEric Cheng * Soft Ring Set (SRS) - The Run time code that deals with 768275SEric Cheng * dynamic polling from the hardware, bandwidth enforcement, 778275SEric Cheng * fanout etc. 788275SEric Cheng * 798275SEric Cheng * We try to use H/W classification on NIC and assign traffic for 808275SEric Cheng * a MAC address to a particular Rx ring or ring group. There is a 818275SEric Cheng * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically 828275SEric Cheng * switches the underlying Rx ring between interrupt and 838275SEric Cheng * polling mode and enforces any specified B/W control. 848275SEric Cheng * 858275SEric Cheng * There is always a SRS created and tied to each H/W and S/W rule. 868275SEric Cheng * Whenever we create a H/W rule, we always add the the same rule to 878275SEric Cheng * S/W classifier and tie a SRS to it. 888275SEric Cheng * 898275SEric Cheng * In case a B/W control is specified, it is broken into bytes 908275SEric Cheng * per ticks and as soon as the quota for a tick is exhausted, 918275SEric Cheng * the underlying Rx ring is forced into poll mode for remainder of 928275SEric Cheng * the tick. The SRS poll thread only polls for bytes that are 938275SEric Cheng * allowed to come in the SRS. 
We typically let 4x the configured 948275SEric Cheng * B/W worth of packets to come in the SRS (to prevent unnecessary 958275SEric Cheng * drops due to bursts) but only process the specified amount. 968275SEric Cheng * 978275SEric Cheng * A MAC client (e.g. a VNIC or aggr) can have 1 or more 988275SEric Cheng * Rx rings (and corresponding SRSs) assigned to it. The SRS 998275SEric Cheng * in turn can have softrings to do protocol level fanout or 1008275SEric Cheng * softrings to do S/W based fanout or both. In case the NIC 1018275SEric Cheng * has no Rx rings, we do S/W classification to respective SRS. 1028275SEric Cheng * The S/W classification rule is always setup and ready. This 1038275SEric Cheng * allows the MAC layer to reassign Rx rings whenever needed 1048275SEric Cheng * but packets still continue to flow via the default path and 1058275SEric Cheng * getting S/W classified to correct SRS. 1068275SEric Cheng * 1078275SEric Cheng * The SRS's are used on both Tx and Rx side. They use the same 1088275SEric Cheng * data structure but the processing routines have slightly different 1098275SEric Cheng * semantics due to the fact that Rx side needs to do dynamic 1108275SEric Cheng * polling etc. 1118275SEric Cheng * 1128275SEric Cheng * Dynamic Polling Notes 1138275SEric Cheng * ===================== 1148275SEric Cheng * 1158275SEric Cheng * Each Soft ring set is capable of switching its Rx ring between 1168275SEric Cheng * interrupt and poll mode and actively 'polls' for packets in 1178275SEric Cheng * poll mode. If the SRS is implementing a B/W limit, it makes 1188275SEric Cheng * sure that only Max allowed packets are pulled in poll mode 1198275SEric Cheng * and goes to poll mode as soon as B/W limit is exceeded. As 1208275SEric Cheng * such, there are no overheads to implement B/W limits. 
1218275SEric Cheng * 1228275SEric Cheng * In poll mode, its better to keep the pipeline going where the 1238275SEric Cheng * SRS worker thread keeps processing packets and poll thread 1248275SEric Cheng * keeps bringing more packets (specially if they get to run 1258275SEric Cheng * on different CPUs). This also prevents the overheads associated 1268275SEric Cheng * by excessive signalling (on NUMA machines, this can be 1278275SEric Cheng * pretty devastating). The exception is latency optimized case 1288275SEric Cheng * where worker thread does no work and interrupt and poll thread 1298275SEric Cheng * are allowed to do their own drain. 1308275SEric Cheng * 1318275SEric Cheng * We use the following policy to control Dynamic Polling: 1328275SEric Cheng * 1) We switch to poll mode anytime the processing 1338275SEric Cheng * thread causes a backlog to build up in SRS and 1348275SEric Cheng * its associated Soft Rings (sr_poll_pkt_cnt > 0). 1358275SEric Cheng * 2) As long as the backlog stays under the low water 1368275SEric Cheng * mark (sr_lowat), we poll the H/W for more packets. 1378275SEric Cheng * 3) If the backlog (sr_poll_pkt_cnt) exceeds low 1388275SEric Cheng * water mark, we stay in poll mode but don't poll 1398275SEric Cheng * the H/W for more packets. 1408275SEric Cheng * 4) Anytime in polling mode, if we poll the H/W for 1418275SEric Cheng * packets and find nothing plus we have an existing 1428275SEric Cheng * backlog (sr_poll_pkt_cnt > 0), we stay in polling 1438275SEric Cheng * mode but don't poll the H/W for packets anymore 1448275SEric Cheng * (let the polling thread go to sleep). 1458275SEric Cheng * 5) Once the backlog is relived (packets are processed) 1468275SEric Cheng * we reenable polling (by signalling the poll thread) 1478275SEric Cheng * only when the backlog dips below sr_poll_thres. 
1488275SEric Cheng * 6) sr_hiwat is used exclusively when we are not 1498275SEric Cheng * polling capable and is used to decide when to 1508275SEric Cheng * drop packets so the SRS queue length doesn't grow 1518275SEric Cheng * infinitely. 1528275SEric Cheng * 1538275SEric Cheng * NOTE: Also see the block level comment on top of mac_soft_ring.c 1548275SEric Cheng */ 1558275SEric Cheng 1568275SEric Cheng /* 1578275SEric Cheng * mac_latency_optimize 1588275SEric Cheng * 1598275SEric Cheng * Controls whether the poll thread can process the packets inline 1608275SEric Cheng * or let the SRS worker thread do the processing. This applies if 1618275SEric Cheng * the SRS was not being processed. For latency sensitive traffic, 1628275SEric Cheng * this needs to be true to allow inline processing. For throughput 1638275SEric Cheng * under load, this should be false. 1648275SEric Cheng * 1658275SEric Cheng * This (and other similar) tunable should be rolled into a link 1668275SEric Cheng * or flow specific workload hint that can be set using dladm 1678275SEric Cheng * linkprop (instead of multiple such tunables). 1688275SEric Cheng */ 1698275SEric Cheng boolean_t mac_latency_optimize = B_TRUE; 1708275SEric Cheng 1718275SEric Cheng /* 1728275SEric Cheng * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 1738275SEric Cheng * 1748275SEric Cheng * queue a mp or chain in soft ring set and increment the 1758275SEric Cheng * local count (srs_count) for the SRS and the shared counter 1768275SEric Cheng * (srs_poll_pkt_cnt - shared between SRS and its soft rings 1778275SEric Cheng * to track the total unprocessed packets for polling to work 1788275SEric Cheng * correctly). 1798275SEric Cheng * 1808275SEric Cheng * The size (total bytes queued) counters are incremented only 1818275SEric Cheng * if we are doing B/W control. 
1828275SEric Cheng */ 1838275SEric Cheng #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1848275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 1858275SEric Cheng if ((mac_srs)->srs_last != NULL) \ 1868275SEric Cheng (mac_srs)->srs_last->b_next = (head); \ 1878275SEric Cheng else \ 1888275SEric Cheng (mac_srs)->srs_first = (head); \ 1898275SEric Cheng (mac_srs)->srs_last = (tail); \ 1908275SEric Cheng (mac_srs)->srs_count += count; \ 1918275SEric Cheng } 1928275SEric Cheng 1938275SEric Cheng #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1948275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 1958275SEric Cheng \ 1968275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 1978275SEric Cheng srs_rx->sr_poll_pkt_cnt += count; \ 1988275SEric Cheng ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 1998275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2008275SEric Cheng (mac_srs)->srs_size += (sz); \ 2018275SEric Cheng mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 2028275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2038275SEric Cheng mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 2048275SEric Cheng } \ 2058275SEric Cheng } 2068275SEric Cheng 2078275SEric Cheng #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 2088275SEric Cheng mac_srs->srs_state |= SRS_ENQUEUED; \ 2098275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 2108275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2118275SEric Cheng (mac_srs)->srs_size += (sz); \ 2128275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2138275SEric Cheng } \ 2148275SEric Cheng } 2158275SEric Cheng 2168275SEric Cheng /* 2178275SEric Cheng * Turn polling on routines 2188275SEric Cheng */ 2198275SEric Cheng #define MAC_SRS_POLLING_ON(mac_srs) { \ 2208275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2218275SEric Cheng if (((mac_srs)->srs_state & \ 2228275SEric Cheng 
(SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 2238275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2248275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2258275SEric Cheng (mac_srs)->srs_ring); \ 2268275SEric Cheng (mac_srs)->srs_rx.sr_poll_on++; \ 2278275SEric Cheng } \ 2288275SEric Cheng } 2298275SEric Cheng 2308275SEric Cheng #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 2318275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2328275SEric Cheng if (((mac_srs)->srs_state & \ 2338275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 2348275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 2358275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2368275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2378275SEric Cheng (mac_srs)->srs_ring); \ 2388275SEric Cheng (mac_srs)->srs_rx.sr_worker_poll_on++; \ 2398275SEric Cheng } \ 2408275SEric Cheng } 2418275SEric Cheng 2428275SEric Cheng /* 2438275SEric Cheng * MAC_SRS_POLL_RING 2448275SEric Cheng * 2458275SEric Cheng * Signal the SRS poll thread to poll the underlying H/W ring 2468275SEric Cheng * provided it wasn't already polling (SRS_GET_PKTS was set). 2478275SEric Cheng * 2488275SEric Cheng * Poll thread gets to run only from mac_rx_srs_drain() and only 2498275SEric Cheng * if the drain was being done by the worker thread. 
2508275SEric Cheng */ 2518275SEric Cheng #define MAC_SRS_POLL_RING(mac_srs) { \ 2528275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 2538275SEric Cheng \ 2548275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2558275SEric Cheng srs_rx->sr_poll_thr_sig++; \ 2568275SEric Cheng if (((mac_srs)->srs_state & \ 2578275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 2588275SEric Cheng (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 2598275SEric Cheng (mac_srs)->srs_state |= SRS_GET_PKTS; \ 2608275SEric Cheng cv_signal(&(mac_srs)->srs_cv); \ 2618275SEric Cheng } else { \ 2628275SEric Cheng srs_rx->sr_poll_thr_busy++; \ 2638275SEric Cheng } \ 2648275SEric Cheng } 2658275SEric Cheng 2668275SEric Cheng /* 2678275SEric Cheng * MAC_SRS_CHECK_BW_CONTROL 2688275SEric Cheng * 2698275SEric Cheng * Check to see if next tick has started so we can reset the 2708275SEric Cheng * SRS_BW_ENFORCED flag and allow more packets to come in the 2718275SEric Cheng * system. 2728275SEric Cheng */ 2738275SEric Cheng #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 2748275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2758275SEric Cheng ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 2768275SEric Cheng MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 277*11066Srafael.vanoni@sun.com clock_t now = ddi_get_lbolt(); \ 278*11066Srafael.vanoni@sun.com if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \ 279*11066Srafael.vanoni@sun.com (mac_srs)->srs_bw->mac_bw_curr_time = now; \ 2808275SEric Cheng (mac_srs)->srs_bw->mac_bw_used = 0; \ 2818275SEric Cheng if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 2828275SEric Cheng (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 2838275SEric Cheng } \ 2848275SEric Cheng } 2858275SEric Cheng 2868275SEric Cheng /* 2878275SEric Cheng * MAC_SRS_WORKER_WAKEUP 2888275SEric Cheng * 2898275SEric Cheng * Wake up the SRS worker thread to process the queue as long as 2908275SEric Cheng * no one else is processing the queue. 
If we are optimizing for 2918275SEric Cheng * latency, we wake up the worker thread immediately or else we 2928275SEric Cheng * wait mac_srs_worker_wakeup_ticks before worker thread gets 2938275SEric Cheng * woken up. 2948275SEric Cheng */ 2958275SEric Cheng int mac_srs_worker_wakeup_ticks = 0; 2968275SEric Cheng #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ 2978275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2988275SEric Cheng if (!((mac_srs)->srs_state & SRS_PROC) && \ 2998275SEric Cheng (mac_srs)->srs_tid == NULL) { \ 3009618SRajagopal.Kunhappan@Sun.COM if (((mac_srs)->srs_state & SRS_LATENCY_OPT) || \ 3018275SEric Cheng (mac_srs_worker_wakeup_ticks == 0)) \ 3028275SEric Cheng cv_signal(&(mac_srs)->srs_async); \ 3038275SEric Cheng else \ 3048275SEric Cheng (mac_srs)->srs_tid = \ 3058275SEric Cheng timeout(mac_srs_fire, (mac_srs), \ 3068275SEric Cheng mac_srs_worker_wakeup_ticks); \ 3078275SEric Cheng } \ 3088275SEric Cheng } 3098275SEric Cheng 3108275SEric Cheng #define TX_SINGLE_RING_MODE(mac_srs) \ 3118275SEric Cheng ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ 3128275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ 3138275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_BW) 3148275SEric Cheng 3158275SEric Cheng #define TX_BANDWIDTH_MODE(mac_srs) \ 3168275SEric Cheng ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ 3178275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) 3188275SEric Cheng 3198275SEric Cheng #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ 3208275SEric Cheng uint_t hash, indx; \ 3218275SEric Cheng hash = HASH_HINT(hint); \ 3228275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ 3238275SEric Cheng softring = mac_srs->srs_oth_soft_rings[indx]; \ 3248275SEric Cheng (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ 3258275SEric Cheng } 3268275SEric Cheng 3278275SEric Cheng /* 3288275SEric Cheng * MAC_TX_SRS_BLOCK 3298275SEric Cheng * 3308275SEric Cheng * Always called from 
mac_tx_srs_drain() function. SRS_TX_BLOCKED 3318275SEric Cheng * will be set only if srs_tx_woken_up is FALSE. If 3328275SEric Cheng * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived 3338275SEric Cheng * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to 3348275SEric Cheng * attempt to transmit again and not setting SRS_TX_BLOCKED does 3358275SEric Cheng * that. 3368275SEric Cheng */ 3378275SEric Cheng #define MAC_TX_SRS_BLOCK(srs, mp) { \ 3388275SEric Cheng ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ 3398275SEric Cheng if ((srs)->srs_tx.st_woken_up) { \ 3408275SEric Cheng (srs)->srs_tx.st_woken_up = B_FALSE; \ 3418275SEric Cheng } else { \ 3428275SEric Cheng ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ 3438275SEric Cheng (srs)->srs_state |= SRS_TX_BLOCKED; \ 3448275SEric Cheng (srs)->srs_tx.st_blocked_cnt++; \ 3458275SEric Cheng } \ 3468275SEric Cheng } 3478275SEric Cheng 3488275SEric Cheng /* 3498275SEric Cheng * MAC_TX_SRS_TEST_HIWAT 3508275SEric Cheng * 3518275SEric Cheng * Called before queueing a packet onto Tx SRS to test and set 3528275SEric Cheng * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. 3538275SEric Cheng */ 3548275SEric Cheng #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ 3558275SEric Cheng boolean_t enqueue = 1; \ 3568275SEric Cheng \ 3578275SEric Cheng if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ 3588275SEric Cheng /* \ 3598275SEric Cheng * flow-controlled. 
Store srs in cookie so that it \ 3608275SEric Cheng * can be returned as mac_tx_cookie_t to client \ 3618275SEric Cheng */ \ 3628275SEric Cheng (srs)->srs_state |= SRS_TX_HIWAT; \ 3638275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 3648275SEric Cheng (srs)->srs_tx.st_hiwat_cnt++; \ 3658275SEric Cheng if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ 3668275SEric Cheng /* increment freed stats */ \ 3678275SEric Cheng (srs)->srs_tx.st_drop_count += cnt; \ 3688275SEric Cheng /* \ 3698275SEric Cheng * b_prev may be set to the fanout hint \ 3708275SEric Cheng * hence can't use freemsg directly \ 3718275SEric Cheng */ \ 3728275SEric Cheng mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ 3738275SEric Cheng DTRACE_PROBE1(tx_queued_hiwat, \ 3748275SEric Cheng mac_soft_ring_set_t *, srs); \ 3758275SEric Cheng enqueue = 0; \ 3768275SEric Cheng } \ 3778275SEric Cheng } \ 3788275SEric Cheng if (enqueue) \ 3798275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ 3808275SEric Cheng } 3818275SEric Cheng 3828275SEric Cheng /* Some utility macros */ 3838275SEric Cheng #define MAC_SRS_BW_LOCK(srs) \ 3848275SEric Cheng if (!(srs->srs_type & SRST_TX)) \ 3858275SEric Cheng mutex_enter(&srs->srs_bw->mac_bw_lock); 3868275SEric Cheng 3878275SEric Cheng #define MAC_SRS_BW_UNLOCK(srs) \ 3888275SEric Cheng if (!(srs->srs_type & SRST_TX)) \ 3898275SEric Cheng mutex_exit(&srs->srs_bw->mac_bw_lock); 3908275SEric Cheng 3918275SEric Cheng #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ 3928275SEric Cheng mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ 3938275SEric Cheng /* increment freed stats */ \ 3948275SEric Cheng mac_srs->srs_tx.st_drop_count++; \ 3958275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 3968275SEric Cheng } 3978275SEric Cheng 3988275SEric Cheng #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ 3998275SEric Cheng mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ 4008275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 4018275SEric Cheng *ret_mp = mp_chain; \ 
4028275SEric Cheng } 4038275SEric Cheng 4048275SEric Cheng /* 4058275SEric Cheng * Drop the rx packet and advance to the next one in the chain. 4068275SEric Cheng */ 4078275SEric Cheng static void 4088275SEric Cheng mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp) 4098275SEric Cheng { 4108275SEric Cheng mac_srs_rx_t *srs_rx = &srs->srs_rx; 4118275SEric Cheng 4128275SEric Cheng ASSERT(mp->b_next == NULL); 4138275SEric Cheng mutex_enter(&srs->srs_lock); 4148275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1); 4158275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp)); 4168275SEric Cheng mutex_exit(&srs->srs_lock); 4178275SEric Cheng 4188275SEric Cheng srs_rx->sr_drop_count++; 4198275SEric Cheng freemsg(mp); 4208275SEric Cheng } 4218275SEric Cheng 4228275SEric Cheng /* DATAPATH RUNTIME ROUTINES */ 4238275SEric Cheng 4248275SEric Cheng /* 4258275SEric Cheng * mac_srs_fire 4268275SEric Cheng * 4278275SEric Cheng * Timer callback routine for waking up the SRS worker thread. 4288275SEric Cheng */ 4298275SEric Cheng static void 4308275SEric Cheng mac_srs_fire(void *arg) 4318275SEric Cheng { 4328275SEric Cheng mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg; 4338275SEric Cheng 4348275SEric Cheng mutex_enter(&mac_srs->srs_lock); 4358275SEric Cheng if (mac_srs->srs_tid == 0) { 4368275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4378275SEric Cheng return; 4388275SEric Cheng } 4398275SEric Cheng 4408275SEric Cheng mac_srs->srs_tid = 0; 4418275SEric Cheng if (!(mac_srs->srs_state & SRS_PROC)) 4428275SEric Cheng cv_signal(&mac_srs->srs_async); 4438275SEric Cheng 4448275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4458275SEric Cheng } 4468275SEric Cheng 4478275SEric Cheng /* 4488275SEric Cheng * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack, 4498275SEric Cheng * and it is used on the TX path. 
4508275SEric Cheng */ 4518275SEric Cheng #define HASH_HINT(hint) (((hint) << 17) | ((hint) >> 16)) 4528275SEric Cheng 4538275SEric Cheng /* 4548275SEric Cheng * hash based on the src address and the port information. 4558275SEric Cheng */ 4568275SEric Cheng #define HASH_ADDR(src, ports) \ 4578275SEric Cheng (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 4588275SEric Cheng ((ports) >> 8) ^ (ports)) 4598275SEric Cheng 4608275SEric Cheng #define COMPUTE_INDEX(key, sz) (key % sz) 4618275SEric Cheng 4628275SEric Cheng #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 4638275SEric Cheng if ((tail) != NULL) { \ 4648275SEric Cheng ASSERT((tail)->b_next == NULL); \ 4658275SEric Cheng (tail)->b_next = (mp); \ 4668275SEric Cheng } else { \ 4678275SEric Cheng ASSERT((head) == NULL); \ 4688275SEric Cheng (head) = (mp); \ 4698275SEric Cheng } \ 4708275SEric Cheng (tail) = (mp); \ 4718275SEric Cheng (cnt)++; \ 4728275SEric Cheng if ((bw_ctl)) \ 4738275SEric Cheng (sz) += (sz0); \ 4748275SEric Cheng } 4758275SEric Cheng 4768275SEric Cheng #define MAC_FANOUT_DEFAULT 0 4778275SEric Cheng #define MAC_FANOUT_RND_ROBIN 1 4788275SEric Cheng int mac_fanout_type = MAC_FANOUT_DEFAULT; 4798275SEric Cheng 4808275SEric Cheng #define MAX_SR_TYPES 3 4818275SEric Cheng /* fanout types for port based hashing */ 4828275SEric Cheng enum pkt_type { 4838275SEric Cheng V4_TCP = 0, 4848275SEric Cheng V4_UDP, 4858275SEric Cheng OTH, 4868275SEric Cheng UNDEF 4878275SEric Cheng }; 4888275SEric Cheng 4898275SEric Cheng /* 4908275SEric Cheng * In general we do port based hashing to spread traffic over different 4918275SEric Cheng * softrings. The below tunable allows to override that behavior. Setting it 4928275SEric Cheng * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior 4938275SEric Cheng * is also the applicable to ipv6 packets carrying multiple optional headers 4948275SEric Cheng * and other uncommon packet types. 
4958275SEric Cheng */ 4968275SEric Cheng boolean_t mac_src_ipv6_fanout = B_FALSE; 4978275SEric Cheng 4988275SEric Cheng /* 4998275SEric Cheng * Pair of local and remote ports in the transport header 5008275SEric Cheng */ 5018275SEric Cheng #define PORTS_SIZE 4 5028275SEric Cheng 5038275SEric Cheng /* 5048275SEric Cheng * mac_rx_srs_proto_fanout 5058275SEric Cheng * 5068275SEric Cheng * This routine delivers packets destined to an SRS into one of the 5078275SEric Cheng * protocol soft rings. 5088275SEric Cheng * 5098275SEric Cheng * Given a chain of packets we need to split it up into multiple sub chains 5108275SEric Cheng * destined into TCP, UDP or OTH soft ring. Instead of entering 5118275SEric Cheng * the soft ring one packet at a time, we want to enter it in the form of a 5128275SEric Cheng * chain otherwise we get this start/stop behaviour where the worker thread 5138275SEric Cheng * goes to sleep and then next packets comes in forcing it to wake up etc. 5148275SEric Cheng */ 5158275SEric Cheng static void 5168275SEric Cheng mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 5178275SEric Cheng { 5188275SEric Cheng struct ether_header *ehp; 5198833SVenu.Iyer@Sun.COM struct ether_vlan_header *evhp; 5208833SVenu.Iyer@Sun.COM uint32_t sap; 5218275SEric Cheng ipha_t *ipha; 5228833SVenu.Iyer@Sun.COM uint8_t *dstaddr; 5238833SVenu.Iyer@Sun.COM size_t hdrsize; 5248275SEric Cheng mblk_t *mp; 5258275SEric Cheng mblk_t *headmp[MAX_SR_TYPES]; 5268275SEric Cheng mblk_t *tailmp[MAX_SR_TYPES]; 5278275SEric Cheng int cnt[MAX_SR_TYPES]; 5288275SEric Cheng size_t sz[MAX_SR_TYPES]; 5298275SEric Cheng size_t sz1; 5308833SVenu.Iyer@Sun.COM boolean_t bw_ctl; 5318275SEric Cheng boolean_t hw_classified; 5328833SVenu.Iyer@Sun.COM boolean_t dls_bypass; 5338833SVenu.Iyer@Sun.COM boolean_t is_ether; 5348833SVenu.Iyer@Sun.COM boolean_t is_unicast; 5358833SVenu.Iyer@Sun.COM enum pkt_type type; 5368275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 
5378833SVenu.Iyer@Sun.COM 5388833SVenu.Iyer@Sun.COM is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 5398833SVenu.Iyer@Sun.COM bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 5408275SEric Cheng 5418275SEric Cheng /* 5428275SEric Cheng * If we don't have a Rx ring, S/W classification would have done 5438275SEric Cheng * its job and its a packet meant for us. If we were polling on 5448275SEric Cheng * the default ring (i.e. there was a ring assigned to this SRS), 5458275SEric Cheng * then we need to make sure that the mac address really belongs 5468275SEric Cheng * to us. 5478275SEric Cheng */ 5488275SEric Cheng hw_classified = mac_srs->srs_ring != NULL && 5498275SEric Cheng mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 5508275SEric Cheng 5518275SEric Cheng /* 5528275SEric Cheng * Special clients (eg. VLAN, non ether, etc) need DLS 5538275SEric Cheng * processing in the Rx path. SRST_DLS_BYPASS will be clear for 55411021SEric.Cheng@Sun.COM * such SRSs. Another way of disabling bypass is to set the 55511021SEric.Cheng@Sun.COM * MCIS_RX_BYPASS_DISABLE flag. 5568275SEric Cheng */ 55711021SEric.Cheng@Sun.COM dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 55811021SEric.Cheng@Sun.COM ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 5598275SEric Cheng 5608275SEric Cheng bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 5618275SEric Cheng bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 5628275SEric Cheng bzero(cnt, MAX_SR_TYPES * sizeof (int)); 5638275SEric Cheng bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 5648275SEric Cheng 5658275SEric Cheng /* 5668275SEric Cheng * We got a chain from SRS that we need to send to the soft rings. 5678275SEric Cheng * Since squeues for TCP & IPv4 sap poll their soft rings (for 5688275SEric Cheng * performance reasons), we need to separate out v4_tcp, v4_udp 5698275SEric Cheng * and the rest goes in other. 
5708275SEric Cheng */ 5718275SEric Cheng while (head != NULL) { 5728275SEric Cheng mp = head; 5738275SEric Cheng head = head->b_next; 5748275SEric Cheng mp->b_next = NULL; 5758275SEric Cheng 5768275SEric Cheng type = OTH; 5778833SVenu.Iyer@Sun.COM sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 5788833SVenu.Iyer@Sun.COM 5798833SVenu.Iyer@Sun.COM if (is_ether) { 5808833SVenu.Iyer@Sun.COM /* 5818833SVenu.Iyer@Sun.COM * At this point we can be sure the packet at least 5828833SVenu.Iyer@Sun.COM * has an ether header. 5838833SVenu.Iyer@Sun.COM */ 5848833SVenu.Iyer@Sun.COM if (sz1 < sizeof (struct ether_header)) { 5858833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 5868833SVenu.Iyer@Sun.COM continue; 5878833SVenu.Iyer@Sun.COM } 5888275SEric Cheng ehp = (struct ether_header *)mp->b_rptr; 5898275SEric Cheng 5908275SEric Cheng /* 5918833SVenu.Iyer@Sun.COM * Determine if this is a VLAN or non-VLAN packet. 5928275SEric Cheng */ 5938833SVenu.Iyer@Sun.COM if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 5948833SVenu.Iyer@Sun.COM evhp = (struct ether_vlan_header *)mp->b_rptr; 5958833SVenu.Iyer@Sun.COM sap = ntohs(evhp->ether_type); 5968833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_vlan_header); 5978275SEric Cheng /* 5988833SVenu.Iyer@Sun.COM * Check if the VID of the packet, if any, 5998833SVenu.Iyer@Sun.COM * belongs to this client. 
6008275SEric Cheng */ 6018275SEric Cheng if (!mac_client_check_flow_vid(mcip, 6028275SEric Cheng VLAN_ID(ntohs(evhp->ether_tci)))) { 6038275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 6048275SEric Cheng continue; 6058275SEric Cheng } 6068833SVenu.Iyer@Sun.COM } else { 6078833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_header); 6088275SEric Cheng } 6098833SVenu.Iyer@Sun.COM is_unicast = 6108833SVenu.Iyer@Sun.COM ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 6118833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)&ehp->ether_dhost; 6128833SVenu.Iyer@Sun.COM } else { 6138833SVenu.Iyer@Sun.COM mac_header_info_t mhi; 6148833SVenu.Iyer@Sun.COM 6158833SVenu.Iyer@Sun.COM if (mac_header_info((mac_handle_t)mcip->mci_mip, 6168833SVenu.Iyer@Sun.COM mp, &mhi) != 0) { 6178833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 6188833SVenu.Iyer@Sun.COM continue; 6198833SVenu.Iyer@Sun.COM } 6208833SVenu.Iyer@Sun.COM hdrsize = mhi.mhi_hdrsize; 6218833SVenu.Iyer@Sun.COM sap = mhi.mhi_bindsap; 6228833SVenu.Iyer@Sun.COM is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 6238833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)mhi.mhi_daddr; 6248833SVenu.Iyer@Sun.COM } 6258833SVenu.Iyer@Sun.COM 6268833SVenu.Iyer@Sun.COM if (!dls_bypass) { 6278275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 6288275SEric Cheng cnt[type], bw_ctl, sz[type], sz1, mp); 6298275SEric Cheng continue; 6308275SEric Cheng } 6318275SEric Cheng 6328833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IP) { 6338275SEric Cheng /* 6348275SEric Cheng * If we are H/W classified, but we have promisc 6358275SEric Cheng * on, then we need to check for the unicast address. 
6368275SEric Cheng */ 6378275SEric Cheng if (hw_classified && mcip->mci_promisc_list != NULL) { 6388275SEric Cheng mac_address_t *map; 6398275SEric Cheng 6408275SEric Cheng rw_enter(&mcip->mci_rw_lock, RW_READER); 6418275SEric Cheng map = mcip->mci_unicast; 6428833SVenu.Iyer@Sun.COM if (bcmp(dstaddr, map->ma_addr, 6438275SEric Cheng map->ma_len) == 0) 6448275SEric Cheng type = UNDEF; 6458275SEric Cheng rw_exit(&mcip->mci_rw_lock); 6468833SVenu.Iyer@Sun.COM } else if (is_unicast) { 6478275SEric Cheng type = UNDEF; 6488275SEric Cheng } 6498275SEric Cheng } 6508275SEric Cheng 6518275SEric Cheng /* 6528275SEric Cheng * This needs to become a contract with the driver for 6538275SEric Cheng * the fast path. 6548275SEric Cheng * 6558275SEric Cheng * In the normal case the packet will have at least the L2 6568275SEric Cheng * header and the IP + Transport header in the same mblk. 6578275SEric Cheng * This is usually the case when the NIC driver sends up 6588275SEric Cheng * the packet. This is also true when the stack generates 6598275SEric Cheng * a packet that is looped back and when the stack uses the 6608275SEric Cheng * fastpath mechanism. The normal case is optimized for 6618275SEric Cheng * performance and may bypass DLS. All other cases go through 6628275SEric Cheng * the 'OTH' type path without DLS bypass. 6638275SEric Cheng */ 6648275SEric Cheng 6658833SVenu.Iyer@Sun.COM ipha = (ipha_t *)(mp->b_rptr + hdrsize); 6668275SEric Cheng if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 6678275SEric Cheng type = OTH; 6688275SEric Cheng 6698275SEric Cheng if (type == OTH) { 6708275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 6718275SEric Cheng cnt[type], bw_ctl, sz[type], sz1, mp); 6728275SEric Cheng continue; 6738275SEric Cheng } 6748275SEric Cheng 6758275SEric Cheng ASSERT(type == UNDEF); 6768275SEric Cheng /* 6778275SEric Cheng * We look for at least 4 bytes past the IP header to get 6788275SEric Cheng * the port information. 
If we get an IP fragment, we don't 6798275SEric Cheng * have the port information, and we use just the protocol 6808275SEric Cheng * information. 6818275SEric Cheng */ 6828275SEric Cheng switch (ipha->ipha_protocol) { 6838275SEric Cheng case IPPROTO_TCP: 6848275SEric Cheng type = V4_TCP; 6858833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 6868275SEric Cheng break; 6878275SEric Cheng case IPPROTO_UDP: 6888275SEric Cheng type = V4_UDP; 6898833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 6908275SEric Cheng break; 6918275SEric Cheng default: 6928275SEric Cheng type = OTH; 6938275SEric Cheng break; 6948275SEric Cheng } 6958275SEric Cheng 6968275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 6978275SEric Cheng bw_ctl, sz[type], sz1, mp); 6988275SEric Cheng } 6998275SEric Cheng 7008275SEric Cheng for (type = V4_TCP; type < UNDEF; type++) { 7018275SEric Cheng if (headmp[type] != NULL) { 7028833SVenu.Iyer@Sun.COM mac_soft_ring_t *softring; 7038833SVenu.Iyer@Sun.COM 7048275SEric Cheng ASSERT(tailmp[type]->b_next == NULL); 7058275SEric Cheng switch (type) { 7068275SEric Cheng case V4_TCP: 7078275SEric Cheng softring = mac_srs->srs_tcp_soft_rings[0]; 7088275SEric Cheng break; 7098275SEric Cheng case V4_UDP: 7108275SEric Cheng softring = mac_srs->srs_udp_soft_rings[0]; 7118275SEric Cheng break; 7128275SEric Cheng case OTH: 7138275SEric Cheng softring = mac_srs->srs_oth_soft_rings[0]; 7148275SEric Cheng } 7158833SVenu.Iyer@Sun.COM mac_rx_soft_ring_process(mcip, softring, 7168275SEric Cheng headmp[type], tailmp[type], cnt[type], sz[type]); 7178275SEric Cheng } 7188275SEric Cheng } 7198275SEric Cheng } 7208275SEric Cheng 7218275SEric Cheng int fanout_unalligned = 0; 7228275SEric Cheng 7238275SEric Cheng /* 7248275SEric Cheng * mac_rx_srs_long_fanout 7258275SEric Cheng * 7268275SEric Cheng * The fanout routine for IPv6 7278275SEric Cheng */ 7288275SEric Cheng static int 7298275SEric Cheng mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp, 
7308833SVenu.Iyer@Sun.COM uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) 7318275SEric Cheng { 7328275SEric Cheng ip6_t *ip6h; 7338275SEric Cheng uint8_t *whereptr; 7348275SEric Cheng uint_t hash; 7358275SEric Cheng uint16_t remlen; 7368275SEric Cheng uint8_t nexthdr; 7378275SEric Cheng uint16_t hdr_len; 7388275SEric Cheng 7398833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IPV6) { 7408275SEric Cheng boolean_t modifiable = B_TRUE; 7418275SEric Cheng 7428833SVenu.Iyer@Sun.COM ASSERT(MBLKL(mp) >= hdrsize); 7438833SVenu.Iyer@Sun.COM 7448833SVenu.Iyer@Sun.COM ip6h = (ip6_t *)(mp->b_rptr + hdrsize); 7458275SEric Cheng if ((unsigned char *)ip6h == mp->b_wptr) { 7468275SEric Cheng /* 7478833SVenu.Iyer@Sun.COM * The first mblk_t only includes the mac header. 7488275SEric Cheng * Note that it is safe to change the mp pointer here, 7498275SEric Cheng * as the subsequent operation does not assume mp 7508833SVenu.Iyer@Sun.COM * points to the start of the mac header. 7518275SEric Cheng */ 7528275SEric Cheng mp = mp->b_cont; 7538275SEric Cheng 7548275SEric Cheng /* 7558275SEric Cheng * Make sure ip6h holds the full ip6_t structure.
/*
 * NOTE(review): above — locate the ip6_t that follows the MAC header.
 * When the first mblk carries only the MAC header, step to b_cont; a
 * short continuation mblk is pulled up (pullupmsg) only if it has a
 * single reference (DB_REF == 1), tracked via 'modifiable'.
 */
7568275SEric Cheng */ 7578275SEric Cheng if (mp == NULL) 7588275SEric Cheng return (-1); 7598275SEric Cheng 7608275SEric Cheng if (MBLKL(mp) < IPV6_HDR_LEN) { 7618275SEric Cheng modifiable = (DB_REF(mp) == 1); 7628275SEric Cheng 7638275SEric Cheng if (modifiable && 7648275SEric Cheng !pullupmsg(mp, IPV6_HDR_LEN)) { 7658275SEric Cheng return (-1); 7668275SEric Cheng } 7678275SEric Cheng } 7688275SEric Cheng 7698275SEric Cheng ip6h = (ip6_t *)mp->b_rptr; 7708275SEric Cheng } 7718275SEric Cheng 7728275SEric Cheng if (!modifiable || !(OK_32PTR((char *)ip6h)) || 7738275SEric Cheng ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 7748275SEric Cheng /* 7758275SEric Cheng * If either ip6h is not aligned, or ip6h does not 7768275SEric Cheng * hold the complete ip6_t structure (a pullupmsg() 7778275SEric Cheng * is not an option since it would result in an 7788275SEric Cheng * unaligned ip6h), fanout to the default ring. Note 7798275SEric Cheng * that this may cause packet reordering. 7808275SEric Cheng */ 7818275SEric Cheng *indx = 0; 7828275SEric Cheng *type = OTH; 7838275SEric Cheng fanout_unalligned++; 7848275SEric Cheng return (0); 7858275SEric Cheng } 7868275SEric Cheng 7878275SEric Cheng remlen = ntohs(ip6h->ip6_plen); 7888275SEric Cheng nexthdr = ip6h->ip6_nxt; 7898275SEric Cheng 7908275SEric Cheng if (remlen < MIN_EHDR_LEN) 7918275SEric Cheng return (-1); 7928275SEric Cheng /* 7938275SEric Cheng * Do src based fanout if below tunable is set to B_TRUE or 7948275SEric Cheng * when mac_ip_hdr_length_v6() fails because of malformed 7958275SEric Cheng * packets or because mblk's need to be concatenated using 7968275SEric Cheng * pullupmsg().
/*
 * NOTE(review): above — misaligned or truncated ip6_t falls back to
 * ring 0 (*indx = 0, OTH), counted in the fanout_unalligned global;
 * returning -1 tells the caller to drop the packet.
 */
7978275SEric Cheng */ 7988275SEric Cheng if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 7998275SEric Cheng &hdr_len, &nexthdr)) { 8008275SEric Cheng goto src_based_fanout; 8018275SEric Cheng } 8028275SEric Cheng whereptr = (uint8_t *)ip6h + hdr_len; 8038275SEric Cheng 8048275SEric Cheng /* If the transport is one of below, we do port based fanout */ 8058275SEric Cheng switch (nexthdr) { 8068275SEric Cheng case IPPROTO_TCP: 8078275SEric Cheng case IPPROTO_UDP: 8088275SEric Cheng case IPPROTO_SCTP: 8098275SEric Cheng case IPPROTO_ESP: 8108275SEric Cheng /* 8118275SEric Cheng * If the ports in the transport header is not part of 8128275SEric Cheng * the mblk, do src_based_fanout, instead of calling 8138275SEric Cheng * pullupmsg(). 8148275SEric Cheng */ 8158275SEric Cheng if (mp->b_cont != NULL && 8168275SEric Cheng whereptr + PORTS_SIZE > mp->b_wptr) { 8178275SEric Cheng goto src_based_fanout; 8188275SEric Cheng } 8198275SEric Cheng break; 8208275SEric Cheng default: 8218275SEric Cheng break; 8228275SEric Cheng } 8238275SEric Cheng 8248275SEric Cheng switch (nexthdr) { 8258275SEric Cheng case IPPROTO_TCP: 8268275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 8278275SEric Cheng *(uint32_t *)whereptr); 8288275SEric Cheng *indx = COMPUTE_INDEX(hash, 8298275SEric Cheng mac_srs->srs_tcp_ring_count); 8308275SEric Cheng *type = OTH; 8318275SEric Cheng break; 8328275SEric Cheng 8338275SEric Cheng case IPPROTO_UDP: 8348275SEric Cheng case IPPROTO_SCTP: 8358275SEric Cheng case IPPROTO_ESP: 8368275SEric Cheng if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 8378275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 8388275SEric Cheng *(uint32_t *)whereptr); 8398275SEric Cheng *indx = COMPUTE_INDEX(hash, 8408275SEric Cheng mac_srs->srs_udp_ring_count); 8418275SEric Cheng } else { 8428275SEric Cheng *indx = mac_srs->srs_ind % 8438275SEric Cheng mac_srs->srs_udp_ring_count; 8448275SEric Cheng mac_srs->srs_ind++; 8458275SEric Cheng } 8468275SEric Cheng
/*
 * NOTE(review): above — port-based fanout: TCP hashes the low 32 bits
 * of the v6 source address with the 32 bits at the start of the
 * transport header (src/dst ports, or the SPI for ESP, which sits at
 * the same offset).  UDP/SCTP/ESP either hash the same way
 * (MAC_FANOUT_DEFAULT) or round-robin via srs_ind.  Anything else
 * falls through to src_based_fanout below.
 */
*type = OTH; 8478275SEric Cheng break; 8488275SEric Cheng 8498275SEric Cheng /* For all other protocol, do source based fanout */ 8508275SEric Cheng default: 8518275SEric Cheng goto src_based_fanout; 8528275SEric Cheng } 8538275SEric Cheng } else { 8548275SEric Cheng *indx = 0; 8558275SEric Cheng *type = OTH; 8568275SEric Cheng } 8578275SEric Cheng return (0); 8588275SEric Cheng 8598275SEric Cheng src_based_fanout: 8608275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 8618275SEric Cheng *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 8628275SEric Cheng *type = OTH; 8638275SEric Cheng return (0); 8648275SEric Cheng } 8658275SEric Cheng 8668275SEric Cheng /* 8678275SEric Cheng * mac_rx_srs_fanout 8688275SEric Cheng * 8698275SEric Cheng * This routine delivers packets destined to an SRS into a soft ring member 8708275SEric Cheng * of the set. 8718275SEric Cheng * 8728275SEric Cheng * Given a chain of packets we need to split it up into multiple sub chains 8738275SEric Cheng * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 8748275SEric Cheng * the soft ring one packet at a time, we want to enter it in the form of a 8758275SEric Cheng * chain otherwise we get this start/stop behaviour where the worker thread 8768275SEric Cheng * goes to sleep and then next packets comes in forcing it to wake up etc. 8778275SEric Cheng * 8788275SEric Cheng * Note: 8798275SEric Cheng * Since we know what is the maximum fanout possible, we create a 2D array 8808275SEric Cheng * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 8818275SEric Cheng * variables so that we can enter the softrings with chain. We need the 8828275SEric Cheng * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 8838275SEric Cheng * for each packet would be expensive).
If we ever want to have the 8848275SEric Cheng * ability to have unlimited fanout, we should probably declare a head, 8858275SEric Cheng * tail, cnt, sz with each soft ring (a data struct which contains a softring 8868275SEric Cheng * along with these members) and create an array of this uber struct so we 8878275SEric Cheng * don't have to do kmem_alloc. 8888275SEric Cheng */ 8898275SEric Cheng int fanout_oth1 = 0; 8908275SEric Cheng int fanout_oth2 = 0; 8918275SEric Cheng int fanout_oth3 = 0; 8928275SEric Cheng int fanout_oth4 = 0; 8938275SEric Cheng int fanout_oth5 = 0; 8948275SEric Cheng 8958275SEric Cheng static void 8968275SEric Cheng mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 8978275SEric Cheng { 8988275SEric Cheng struct ether_header *ehp; 8998833SVenu.Iyer@Sun.COM struct ether_vlan_header *evhp; 9008833SVenu.Iyer@Sun.COM uint32_t sap; 9018275SEric Cheng ipha_t *ipha; 9028833SVenu.Iyer@Sun.COM uint8_t *dstaddr; 9038275SEric Cheng uint_t indx; 9048833SVenu.Iyer@Sun.COM size_t ports_offset; 9058833SVenu.Iyer@Sun.COM size_t ipha_len; 9068833SVenu.Iyer@Sun.COM size_t hdrsize; 9078275SEric Cheng uint_t hash; 9088275SEric Cheng mblk_t *mp; 9098275SEric Cheng mblk_t *headmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 9108275SEric Cheng mblk_t *tailmp[MAX_SR_TYPES][MAX_SR_FANOUT]; 9118275SEric Cheng int cnt[MAX_SR_TYPES][MAX_SR_FANOUT]; 9128275SEric Cheng size_t sz[MAX_SR_TYPES][MAX_SR_FANOUT]; 9138275SEric Cheng size_t sz1; 9148833SVenu.Iyer@Sun.COM boolean_t bw_ctl; 9158275SEric Cheng boolean_t hw_classified; 9168833SVenu.Iyer@Sun.COM boolean_t dls_bypass; 9178833SVenu.Iyer@Sun.COM boolean_t is_ether; 9188833SVenu.Iyer@Sun.COM boolean_t is_unicast; 9198275SEric Cheng int fanout_cnt; 9208833SVenu.Iyer@Sun.COM enum pkt_type type; 9218275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 9228833SVenu.Iyer@Sun.COM 9238833SVenu.Iyer@Sun.COM is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 9248833SVenu.Iyer@Sun.COM bw_ctl = ((mac_srs->srs_type 
& SRST_BW_CONTROL) != 0); 9258275SEric Cheng 9268275SEric Cheng /* 9278275SEric Cheng * If we don't have a Rx ring, S/W classification would have done 9288275SEric Cheng * its job and it's a packet meant for us. If we were polling on 9298275SEric Cheng * the default ring (i.e. there was a ring assigned to this SRS), 9308275SEric Cheng * then we need to make sure that the mac address really belongs 9318275SEric Cheng * to us. 9328275SEric Cheng */ 9338275SEric Cheng hw_classified = mac_srs->srs_ring != NULL && 9348275SEric Cheng mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 9358275SEric Cheng 9368275SEric Cheng /* 9378275SEric Cheng * Special clients (eg. VLAN, non ether, etc) need DLS 9388275SEric Cheng * processing in the Rx path. SRST_DLS_BYPASS will be clear for 93911021SEric.Cheng@Sun.COM * such SRSs. Another way of disabling bypass is to set the 94011021SEric.Cheng@Sun.COM * MCIS_RX_BYPASS_DISABLE flag. 9418275SEric Cheng */ 94211021SEric.Cheng@Sun.COM dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 94311021SEric.Cheng@Sun.COM ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 9448275SEric Cheng 9458275SEric Cheng /* 9468275SEric Cheng * Since the softrings are never destroyed and we always 9478275SEric Cheng * create equal number of softrings for TCP, UDP and rest, 9488275SEric Cheng * it's OK to check one of them for count and use it without 9498275SEric Cheng * any lock. In future, if soft rings get destroyed because 9508275SEric Cheng * of reduction in fanout, we will need to ensure that happens 9518275SEric Cheng * behind the SRS_PROC.
/*
 * NOTE(review): above — per-call setup: hw_classified and dls_bypass
 * are computed once before the packet loop; fanout_cnt below assumes
 * equal TCP/UDP/OTH soft-ring counts, as the comment explains.
 */
9528275SEric Cheng */ 9538275SEric Cheng fanout_cnt = mac_srs->srs_tcp_ring_count; 9548275SEric Cheng 9558275SEric Cheng bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 9568275SEric Cheng bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *)); 9578275SEric Cheng bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int)); 9588275SEric Cheng bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t)); 9598275SEric Cheng 9608275SEric Cheng /* 9618275SEric Cheng * We got a chain from SRS that we need to send to the soft rings. 9628275SEric Cheng * Since squeues for TCP & IPv4 sap poll their soft rings (for 9638275SEric Cheng * performance reasons), we need to separate out v4_tcp, v4_udp 9648275SEric Cheng * and the rest goes in other. 9658275SEric Cheng */ 9668275SEric Cheng while (head != NULL) { 9678275SEric Cheng mp = head; 9688275SEric Cheng head = head->b_next; 9698275SEric Cheng mp->b_next = NULL; 9708275SEric Cheng 9718275SEric Cheng type = OTH; 9728833SVenu.Iyer@Sun.COM sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 9738833SVenu.Iyer@Sun.COM 9748833SVenu.Iyer@Sun.COM if (is_ether) { 9758833SVenu.Iyer@Sun.COM /* 9768833SVenu.Iyer@Sun.COM * At this point we can be sure the packet at least 9778833SVenu.Iyer@Sun.COM * has an ether header. 9788833SVenu.Iyer@Sun.COM */ 9798833SVenu.Iyer@Sun.COM if (sz1 < sizeof (struct ether_header)) { 9808833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 9818833SVenu.Iyer@Sun.COM continue; 9828833SVenu.Iyer@Sun.COM } 9838833SVenu.Iyer@Sun.COM ehp = (struct ether_header *)mp->b_rptr; 9848833SVenu.Iyer@Sun.COM 9858833SVenu.Iyer@Sun.COM /* 9868833SVenu.Iyer@Sun.COM * Determine if this is a VLAN or non-VLAN packet.
/*
 * NOTE(review): per-packet header parse follows.  Ether path: derive
 * sap/hdrsize from the (VLAN or plain) ether header and reject
 * foreign VIDs; non-ether path: delegate to mac_header_info().  Both
 * paths also compute is_unicast and dstaddr for the ownership check.
 */
9878833SVenu.Iyer@Sun.COM */ 9888833SVenu.Iyer@Sun.COM if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 9898833SVenu.Iyer@Sun.COM evhp = (struct ether_vlan_header *)mp->b_rptr; 9908833SVenu.Iyer@Sun.COM sap = ntohs(evhp->ether_type); 9918833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_vlan_header); 9928275SEric Cheng /* 9938833SVenu.Iyer@Sun.COM * Check if the VID of the packet, if any, 9948833SVenu.Iyer@Sun.COM * belongs to this client. 9958275SEric Cheng */ 9968833SVenu.Iyer@Sun.COM if (!mac_client_check_flow_vid(mcip, 9978833SVenu.Iyer@Sun.COM VLAN_ID(ntohs(evhp->ether_tci)))) { 9988275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 9998275SEric Cheng continue; 10008275SEric Cheng } 10018833SVenu.Iyer@Sun.COM } else { 10028833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_header); 10038833SVenu.Iyer@Sun.COM } 10048833SVenu.Iyer@Sun.COM is_unicast = 10058833SVenu.Iyer@Sun.COM ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 10068833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)&ehp->ether_dhost; 10078833SVenu.Iyer@Sun.COM } else { 10088833SVenu.Iyer@Sun.COM mac_header_info_t mhi; 10098833SVenu.Iyer@Sun.COM 10108833SVenu.Iyer@Sun.COM if (mac_header_info((mac_handle_t)mcip->mci_mip, 10118833SVenu.Iyer@Sun.COM mp, &mhi) != 0) { 10128833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 10138833SVenu.Iyer@Sun.COM continue; 10148833SVenu.Iyer@Sun.COM } 10158833SVenu.Iyer@Sun.COM hdrsize = mhi.mhi_hdrsize; 10168833SVenu.Iyer@Sun.COM sap = mhi.mhi_bindsap; 10178833SVenu.Iyer@Sun.COM is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 10188833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)mhi.mhi_daddr; 10198833SVenu.Iyer@Sun.COM } 10208833SVenu.Iyer@Sun.COM 10218833SVenu.Iyer@Sun.COM if (!dls_bypass) { 10228833SVenu.Iyer@Sun.COM if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 10238833SVenu.Iyer@Sun.COM hdrsize, &type, &indx) == -1) { 10248833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 10258833SVenu.Iyer@Sun.COM continue; 10268275SEric Cheng } 10278275SEric Cheng
/*
 * NOTE(review): !dls_bypass packets are classified by
 * mac_rx_srs_long_fanout() (which may return -1 = drop) and enqueued
 * on the chosen [type][indx] chain; the fast path below applies only
 * to bypass-eligible ETHERTYPE_IP traffic.
 */
10288275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], 10298275SEric Cheng tailmp[type][indx], cnt[type][indx], bw_ctl, 10308275SEric Cheng sz[type][indx], sz1, mp); 10318275SEric Cheng continue; 10328275SEric Cheng } 10338275SEric Cheng 10348275SEric Cheng 10358275SEric Cheng /* 10368275SEric Cheng * If we are using the default Rx ring where H/W or S/W 10378275SEric Cheng * classification has not happened, we need to verify if 10388275SEric Cheng * this unicast packet really belongs to us. 10398275SEric Cheng */ 10408833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IP) { 10418275SEric Cheng /* 10428275SEric Cheng * If we are H/W classified, but we have promisc 10438275SEric Cheng * on, then we need to check for the unicast address. 10448275SEric Cheng */ 10458275SEric Cheng if (hw_classified && mcip->mci_promisc_list != NULL) { 10468275SEric Cheng mac_address_t *map; 10478275SEric Cheng 10488275SEric Cheng rw_enter(&mcip->mci_rw_lock, RW_READER); 10498275SEric Cheng map = mcip->mci_unicast; 10508833SVenu.Iyer@Sun.COM if (bcmp(dstaddr, map->ma_addr, 10518275SEric Cheng map->ma_len) == 0) 10528275SEric Cheng type = UNDEF; 10538275SEric Cheng rw_exit(&mcip->mci_rw_lock); 10548833SVenu.Iyer@Sun.COM } else if (is_unicast) { 10558275SEric Cheng type = UNDEF; 10568275SEric Cheng } 10578275SEric Cheng } 10588275SEric Cheng 10598275SEric Cheng /* 10608275SEric Cheng * This needs to become a contract with the driver for 10618275SEric Cheng * the fast path.
/*
 * NOTE(review): above — only IPv4 (ETHERTYPE_IP) packets that pass
 * the ownership check are promoted to type == UNDEF (fast-path
 * candidates); everything else stays OTH.
 */
10628275SEric Cheng */ 10638275SEric Cheng 10648833SVenu.Iyer@Sun.COM ipha = (ipha_t *)(mp->b_rptr + hdrsize); 10658275SEric Cheng if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) { 10668275SEric Cheng type = OTH; 10678275SEric Cheng fanout_oth1++; 10688275SEric Cheng } 10698275SEric Cheng 10708275SEric Cheng if (type != OTH) { 10718833SVenu.Iyer@Sun.COM uint16_t frag_offset_flags; 10728833SVenu.Iyer@Sun.COM 10738275SEric Cheng switch (ipha->ipha_protocol) { 10748275SEric Cheng case IPPROTO_TCP: 10758275SEric Cheng case IPPROTO_UDP: 10768275SEric Cheng case IPPROTO_SCTP: 10778275SEric Cheng case IPPROTO_ESP: 10788275SEric Cheng ipha_len = IPH_HDR_LENGTH(ipha); 10798275SEric Cheng if ((uchar_t *)ipha + ipha_len + PORTS_SIZE > 10808275SEric Cheng mp->b_wptr) { 10818275SEric Cheng type = OTH; 10828275SEric Cheng break; 10838275SEric Cheng } 10848275SEric Cheng frag_offset_flags = 10858275SEric Cheng ntohs(ipha->ipha_fragment_offset_and_flags); 10868275SEric Cheng if ((frag_offset_flags & 10878275SEric Cheng (IPH_MF | IPH_OFFSET)) != 0) { 10888275SEric Cheng type = OTH; 10898275SEric Cheng fanout_oth3++; 10908275SEric Cheng break; 10918275SEric Cheng } 10928833SVenu.Iyer@Sun.COM ports_offset = hdrsize + ipha_len; 10938275SEric Cheng break; 10948275SEric Cheng default: 10958275SEric Cheng type = OTH; 10968275SEric Cheng fanout_oth4++; 10978275SEric Cheng break; 10988275SEric Cheng } 10998275SEric Cheng } 11008275SEric Cheng 11018275SEric Cheng if (type == OTH) { 11028833SVenu.Iyer@Sun.COM if (mac_rx_srs_long_fanout(mac_srs, mp, sap, 11038833SVenu.Iyer@Sun.COM hdrsize, &type, &indx) == -1) { 11048275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 11058275SEric Cheng continue; 11068275SEric Cheng } 11078275SEric Cheng 11088275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], 11098275SEric Cheng tailmp[type][indx], cnt[type][indx], bw_ctl, 11108275SEric Cheng sz[type][indx], sz1, mp); 11118275SEric Cheng continue; 11128275SEric Cheng } 11138275SEric Cheng 11148275SEric
/*
 * NOTE(review): above — fast-path eligibility: the L2+L3+ports must
 * be contiguous in the first mblk and the datagram unfragmented
 * (IPH_MF|IPH_OFFSET clear); any failure demotes to OTH and goes
 * through the slow (long) fanout.  fanout_oth1/3/4 are debug
 * counters for the demotion reasons.
 */
Cheng ASSERT(type == UNDEF); 11158275SEric Cheng 11168275SEric Cheng /* 11178275SEric Cheng * XXX-Sunay: We should hold srs_lock since ring_count 11188275SEric Cheng * below can change. But if we are always called from 11198275SEric Cheng * mac_rx_srs_drain and SRS_PROC is set, then we can 11208275SEric Cheng * enforce that ring_count can't be changed i.e. 11218275SEric Cheng * to change fanout type or ring count, the calling 11228275SEric Cheng * thread needs to be behind SRS_PROC. 11238275SEric Cheng */ 11248275SEric Cheng switch (ipha->ipha_protocol) { 11258275SEric Cheng case IPPROTO_TCP: 11268275SEric Cheng /* 11278275SEric Cheng * Note that for ESP, we fanout on SPI and it is at the 11288275SEric Cheng * same offset as the 2x16-bit ports. So it is clumped 11298275SEric Cheng * along with TCP, UDP and SCTP. 11308275SEric Cheng */ 11318275SEric Cheng hash = HASH_ADDR(ipha->ipha_src, 11328275SEric Cheng *(uint32_t *)(mp->b_rptr + ports_offset)); 11338275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 11348275SEric Cheng type = V4_TCP; 11358833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 11368275SEric Cheng break; 11378275SEric Cheng case IPPROTO_UDP: 11388275SEric Cheng case IPPROTO_SCTP: 11398275SEric Cheng case IPPROTO_ESP: 11408275SEric Cheng if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 11418275SEric Cheng hash = HASH_ADDR(ipha->ipha_src, 11428275SEric Cheng *(uint32_t *)(mp->b_rptr + ports_offset)); 11438275SEric Cheng indx = COMPUTE_INDEX(hash, 11448275SEric Cheng mac_srs->srs_udp_ring_count); 11458275SEric Cheng } else { 11468275SEric Cheng indx = mac_srs->srs_ind % 11478275SEric Cheng mac_srs->srs_udp_ring_count; 11488275SEric Cheng mac_srs->srs_ind++; 11498275SEric Cheng } 11508275SEric Cheng type = V4_UDP; 11518833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 11528275SEric Cheng break; 11538833SVenu.Iyer@Sun.COM default: 11548833SVenu.Iyer@Sun.COM indx = 0; 11558833SVenu.Iyer@Sun.COM type = OTH; 11568275SEric Cheng } 11578275SEric Cheng
11588275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 11598275SEric Cheng cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 11608275SEric Cheng } 11618275SEric Cheng 11628275SEric Cheng for (type = V4_TCP; type < UNDEF; type++) { 11638833SVenu.Iyer@Sun.COM int i; 11648833SVenu.Iyer@Sun.COM 11658275SEric Cheng for (i = 0; i < fanout_cnt; i++) { 11668275SEric Cheng if (headmp[type][i] != NULL) { 11678833SVenu.Iyer@Sun.COM mac_soft_ring_t *softring; 11688833SVenu.Iyer@Sun.COM 11698275SEric Cheng ASSERT(tailmp[type][i]->b_next == NULL); 11708275SEric Cheng switch (type) { 11718275SEric Cheng case V4_TCP: 11728275SEric Cheng softring = 11738275SEric Cheng mac_srs->srs_tcp_soft_rings[i]; 11748275SEric Cheng break; 11758275SEric Cheng case V4_UDP: 11768275SEric Cheng softring = 11778275SEric Cheng mac_srs->srs_udp_soft_rings[i]; 11788275SEric Cheng break; 11798275SEric Cheng case OTH: 11808275SEric Cheng softring = 11818275SEric Cheng mac_srs->srs_oth_soft_rings[i]; 11828275SEric Cheng break; 11838275SEric Cheng } 11848833SVenu.Iyer@Sun.COM mac_rx_soft_ring_process(mcip, 11858275SEric Cheng softring, headmp[type][i], tailmp[type][i], 11868275SEric Cheng cnt[type][i], sz[type][i]); 11878275SEric Cheng } 11888275SEric Cheng } 11898275SEric Cheng } 11908275SEric Cheng } 11918275SEric Cheng 11928275SEric Cheng #define SRS_BYTES_TO_PICKUP 150000 11938275SEric Cheng ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 11948275SEric Cheng 11958275SEric Cheng /* 11968275SEric Cheng * mac_rx_srs_poll_ring 11978275SEric Cheng * 11988275SEric Cheng * This SRS Poll thread uses this routine to poll the underlying hardware 11998275SEric Cheng * Rx ring to get a chain of packets. It can inline process that chain 12008275SEric Cheng * if mac_latency_optimize is set (default) or signal the SRS worker thread 12018275SEric Cheng * to do the remaining processing. 
12028275SEric Cheng * 12038275SEric Cheng * Since packets come in the system via interrupt or poll path, we also 12048275SEric Cheng * update the stats and deal with promiscous clients here. 12058275SEric Cheng */ 12068275SEric Cheng void 12078275SEric Cheng mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 12088275SEric Cheng { 12098275SEric Cheng kmutex_t *lock = &mac_srs->srs_lock; 12108275SEric Cheng kcondvar_t *async = &mac_srs->srs_cv; 12118275SEric Cheng mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 12128275SEric Cheng mblk_t *head, *tail, *mp; 12138275SEric Cheng callb_cpr_t cprinfo; 12148275SEric Cheng ssize_t bytes_to_pickup; 12158275SEric Cheng size_t sz; 12168275SEric Cheng int count; 12178275SEric Cheng mac_client_impl_t *smcip; 12188275SEric Cheng 12198275SEric Cheng CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 12208275SEric Cheng mutex_enter(lock); 12218275SEric Cheng 12228275SEric Cheng start: 12238275SEric Cheng for (;;) { 12248275SEric Cheng if (mac_srs->srs_state & SRS_PAUSE) 12258275SEric Cheng goto done; 12268275SEric Cheng 12278275SEric Cheng CALLB_CPR_SAFE_BEGIN(&cprinfo); 12288275SEric Cheng cv_wait(async, lock); 12298275SEric Cheng CALLB_CPR_SAFE_END(&cprinfo, lock); 12308275SEric Cheng 12318275SEric Cheng if (mac_srs->srs_state & SRS_PAUSE) 12328275SEric Cheng goto done; 12338275SEric Cheng 12348275SEric Cheng check_again: 12358275SEric Cheng if (mac_srs->srs_type & SRST_BW_CONTROL) { 12368275SEric Cheng /* 12378275SEric Cheng * We pick as many bytes as we are allowed to queue. 12388275SEric Cheng * Its possible that we will exceed the total 12398275SEric Cheng * packets queued in case this SRS is part of the 12408275SEric Cheng * Rx ring group since > 1 poll thread can be pulling 12418275SEric Cheng * upto the max allowed packets at the same time 12428275SEric Cheng * but that should be OK. 
12438275SEric Cheng */ 12448275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 12458275SEric Cheng bytes_to_pickup = 12468275SEric Cheng mac_srs->srs_bw->mac_bw_drop_threshold - 12478275SEric Cheng mac_srs->srs_bw->mac_bw_sz; 12488275SEric Cheng /* 12498275SEric Cheng * We shouldn't have been signalled if we 12508275SEric Cheng * have 0 or less bytes to pick but since 12518275SEric Cheng * some of the bytes accounting is driver 12528275SEric Cheng * dependant, we do the safety check. 12538275SEric Cheng */ 12548275SEric Cheng if (bytes_to_pickup < 0) 12558275SEric Cheng bytes_to_pickup = 0; 12568275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 12578275SEric Cheng } else { 12588275SEric Cheng /* 12598275SEric Cheng * ToDO: Need to change the polling API 12608275SEric Cheng * to add a packet count and a flag which 12618275SEric Cheng * tells the driver whether we want packets 12628275SEric Cheng * based on a count, or bytes, or all the 12638275SEric Cheng * packets queued in the driver/HW. This 12648275SEric Cheng * way, we never have to check the limits 12658275SEric Cheng * on poll path. We truly let only as many 12668275SEric Cheng * packets enter the system as we are willing 12678275SEric Cheng * to process or queue. 12688275SEric Cheng * 12698275SEric Cheng * Something along the lines of 12708275SEric Cheng * pkts_to_pickup = mac_soft_ring_max_q_cnt - 12718275SEric Cheng * mac_srs->srs_poll_pkt_cnt 12728275SEric Cheng */ 12738275SEric Cheng 12748275SEric Cheng /* 12758275SEric Cheng * Since we are not doing B/W control, pick 12768275SEric Cheng * as many packets as allowed. 
12778275SEric Cheng */ 12788275SEric Cheng bytes_to_pickup = max_bytes_to_pickup; 12798275SEric Cheng } 12808275SEric Cheng 12818275SEric Cheng /* Poll the underlying Hardware */ 12828275SEric Cheng mutex_exit(lock); 12838275SEric Cheng head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 12848275SEric Cheng mutex_enter(lock); 12858275SEric Cheng 12868275SEric Cheng ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 12878275SEric Cheng SRS_POLL_THR_OWNER); 12888275SEric Cheng 12898275SEric Cheng mp = tail = head; 12908275SEric Cheng count = 0; 12918275SEric Cheng sz = 0; 12928275SEric Cheng while (mp != NULL) { 12938275SEric Cheng tail = mp; 12948275SEric Cheng sz += msgdsize(mp); 12958275SEric Cheng mp = mp->b_next; 12968275SEric Cheng count++; 12978275SEric Cheng } 12988275SEric Cheng 12998275SEric Cheng if (head != NULL) { 13008275SEric Cheng tail->b_next = NULL; 13018275SEric Cheng smcip = mac_srs->srs_mcip; 13028275SEric Cheng 13038275SEric Cheng if ((mac_srs->srs_type & SRST_FLOW) || 13048275SEric Cheng (smcip == NULL)) { 13058275SEric Cheng FLOW_STAT_UPDATE(mac_srs->srs_flent, 13068275SEric Cheng rbytes, sz); 13078275SEric Cheng FLOW_STAT_UPDATE(mac_srs->srs_flent, 13088275SEric Cheng ipackets, count); 13098275SEric Cheng } 13108275SEric Cheng 13118275SEric Cheng /* 13128275SEric Cheng * If there are any promiscuous mode callbacks 13138275SEric Cheng * defined for this MAC client, pass them a copy 13148275SEric Cheng * if appropriate and also update the counters. 
13158275SEric Cheng */ 13168275SEric Cheng if (smcip != NULL) { 13178275SEric Cheng smcip->mci_stat_ibytes += sz; 13188275SEric Cheng smcip->mci_stat_ipackets += count; 13198275SEric Cheng 13208275SEric Cheng if (smcip->mci_mip->mi_promisc_list != NULL) { 13218275SEric Cheng mutex_exit(lock); 13228275SEric Cheng mac_promisc_dispatch(smcip->mci_mip, 13238275SEric Cheng head, NULL); 13248275SEric Cheng mutex_enter(lock); 13258275SEric Cheng } 13268275SEric Cheng } 13278275SEric Cheng if (mac_srs->srs_type & SRST_BW_CONTROL) { 13288275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 13298275SEric Cheng mac_srs->srs_bw->mac_bw_polled += sz; 13308275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 13318275SEric Cheng } 13328275SEric Cheng srs_rx->sr_poll_count += count; 13338275SEric Cheng MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 13348275SEric Cheng count, sz); 13358275SEric Cheng if (count <= 10) 13368275SEric Cheng srs_rx->sr_chain_cnt_undr10++; 13378275SEric Cheng else if (count > 10 && count <= 50) 13388275SEric Cheng srs_rx->sr_chain_cnt_10to50++; 13398275SEric Cheng else 13408275SEric Cheng srs_rx->sr_chain_cnt_over50++; 13418275SEric Cheng } 13428275SEric Cheng 13438275SEric Cheng /* 13448275SEric Cheng * We are guaranteed that SRS_PROC will be set if we 13458275SEric Cheng * are here. Also, poll thread gets to run only if 13468275SEric Cheng * the drain was being done by a worker thread although 13478275SEric Cheng * its possible that worker thread is still running 13488275SEric Cheng * and poll thread was sent down to keep the pipeline 13498275SEric Cheng * going instead of doing a complete drain and then 13508275SEric Cheng * trying to poll the NIC. 13518275SEric Cheng * 13528275SEric Cheng * So we need to check SRS_WORKER flag to make sure 13538275SEric Cheng * that the worker thread is not processing the queue 13548275SEric Cheng * in parallel to us. 
The flags and conditions are 13558275SEric Cheng * protected by the srs_lock to prevent any race. We 13568275SEric Cheng * ensure that we don't drop the srs_lock from now 13578275SEric Cheng * till the end and similarly we don't drop the srs_lock 13588275SEric Cheng * in mac_rx_srs_drain() till similar condition check 13598275SEric Cheng * are complete. The mac_rx_srs_drain() needs to ensure 13608275SEric Cheng * that SRS_WORKER flag remains set as long as its 13618275SEric Cheng * processing the queue. 13628275SEric Cheng */ 13638275SEric Cheng if (!(mac_srs->srs_state & SRS_WORKER) && 13648275SEric Cheng (mac_srs->srs_first != NULL)) { 13658275SEric Cheng /* 13668275SEric Cheng * We have packets to process and worker thread 13678833SVenu.Iyer@Sun.COM * is not running. Check to see if poll thread is 13688833SVenu.Iyer@Sun.COM * allowed to process. 13698275SEric Cheng */ 13708833SVenu.Iyer@Sun.COM if (mac_srs->srs_state & SRS_LATENCY_OPT) { 13718275SEric Cheng mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 13729209SEric Cheng if (!(mac_srs->srs_state & SRS_PAUSE) && 13739209SEric Cheng srs_rx->sr_poll_pkt_cnt <= 13748275SEric Cheng srs_rx->sr_lowat) { 13758275SEric Cheng srs_rx->sr_poll_again++; 13768275SEric Cheng goto check_again; 13778833SVenu.Iyer@Sun.COM } 13788833SVenu.Iyer@Sun.COM /* 13798833SVenu.Iyer@Sun.COM * We are already above low water mark 13808833SVenu.Iyer@Sun.COM * so stay in the polling mode but no 13818833SVenu.Iyer@Sun.COM * need to poll. 
Once we dip below 13828833SVenu.Iyer@Sun.COM * the polling threshold, the processing 13838833SVenu.Iyer@Sun.COM * thread (soft ring) will signal us 13848833SVenu.Iyer@Sun.COM * to poll again (MAC_UPDATE_SRS_COUNT) 13858833SVenu.Iyer@Sun.COM */ 13868833SVenu.Iyer@Sun.COM srs_rx->sr_poll_drain_no_poll++; 13878833SVenu.Iyer@Sun.COM mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 13888833SVenu.Iyer@Sun.COM /* 13898833SVenu.Iyer@Sun.COM * In B/W control case, its possible 13908833SVenu.Iyer@Sun.COM * that the backlog built up due to 13918833SVenu.Iyer@Sun.COM * B/W limit being reached and packets 13928833SVenu.Iyer@Sun.COM * are queued only in SRS. In this case, 13938833SVenu.Iyer@Sun.COM * we should schedule worker thread 13948833SVenu.Iyer@Sun.COM * since no one else will wake us up. 13958833SVenu.Iyer@Sun.COM */ 13968833SVenu.Iyer@Sun.COM if ((mac_srs->srs_type & SRST_BW_CONTROL) && 13978833SVenu.Iyer@Sun.COM (mac_srs->srs_tid == NULL)) { 13988833SVenu.Iyer@Sun.COM mac_srs->srs_tid = 13998833SVenu.Iyer@Sun.COM timeout(mac_srs_fire, mac_srs, 1); 14008833SVenu.Iyer@Sun.COM srs_rx->sr_poll_worker_wakeup++; 14018275SEric Cheng } 14028275SEric Cheng } else { 14038275SEric Cheng /* 14048275SEric Cheng * Wakeup the worker thread for more processing. 14058275SEric Cheng * We optimize for throughput in this case. 14068275SEric Cheng */ 14078275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 14088275SEric Cheng MAC_SRS_WORKER_WAKEUP(mac_srs); 14098275SEric Cheng srs_rx->sr_poll_sig_worker++; 14108275SEric Cheng } 14118275SEric Cheng } else if ((mac_srs->srs_first == NULL) && 14128275SEric Cheng !(mac_srs->srs_state & SRS_WORKER)) { 14138275SEric Cheng /* 14148275SEric Cheng * There is nothing queued in SRS and 14158275SEric Cheng * no worker thread running. 
Plus we 14168275SEric Cheng * didn't get anything from the H/W 14178275SEric Cheng * as well (head == NULL); 14188275SEric Cheng */ 14198275SEric Cheng ASSERT(head == NULL); 14208275SEric Cheng mac_srs->srs_state &= 14218275SEric Cheng ~(SRS_PROC|SRS_GET_PKTS); 14228275SEric Cheng 14238275SEric Cheng /* 14248275SEric Cheng * If we have a packets in soft ring, don't allow 14258275SEric Cheng * more packets to come into this SRS by keeping the 14268275SEric Cheng * interrupts off but not polling the H/W. The 14278275SEric Cheng * poll thread will get signaled as soon as 14288275SEric Cheng * srs_poll_pkt_cnt dips below poll threshold. 14298275SEric Cheng */ 14308275SEric Cheng if (srs_rx->sr_poll_pkt_cnt == 0) { 14318275SEric Cheng srs_rx->sr_poll_intr_enable++; 14328275SEric Cheng MAC_SRS_POLLING_OFF(mac_srs); 14338275SEric Cheng } else { 14348275SEric Cheng /* 14358275SEric Cheng * We know nothing is queued in SRS 14368275SEric Cheng * since we are here after checking 14378275SEric Cheng * srs_first is NULL. The backlog 14388275SEric Cheng * is entirely due to packets queued 14398275SEric Cheng * in Soft ring which will wake us up 14408275SEric Cheng * and get the interface out of polling 14418275SEric Cheng * mode once the backlog dips below 14428275SEric Cheng * sr_poll_thres. 14438275SEric Cheng */ 14448275SEric Cheng srs_rx->sr_poll_no_poll++; 14458275SEric Cheng } 14468275SEric Cheng } else { 14478275SEric Cheng /* 14488275SEric Cheng * Worker thread is already running. 14498275SEric Cheng * Nothing much to do. If the polling 14508275SEric Cheng * was enabled, worker thread will deal 14518275SEric Cheng * with that. 
14528275SEric Cheng */ 14538275SEric Cheng mac_srs->srs_state &= ~SRS_GET_PKTS; 14548275SEric Cheng srs_rx->sr_poll_goto_sleep++; 14558275SEric Cheng } 14568275SEric Cheng } 14578275SEric Cheng done: 14588275SEric Cheng mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 14598275SEric Cheng cv_signal(&mac_srs->srs_async); 14608275SEric Cheng /* 14618275SEric Cheng * If this is a temporary quiesce then wait for the restart signal 14628275SEric Cheng * from the srs worker. Then clear the flags and signal the srs worker 14638275SEric Cheng * to ensure a positive handshake and go back to start. 14648275SEric Cheng */ 14658275SEric Cheng while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 14668275SEric Cheng cv_wait(async, lock); 14678275SEric Cheng if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 14688275SEric Cheng ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 14698275SEric Cheng mac_srs->srs_state &= 14708275SEric Cheng ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 14718275SEric Cheng cv_signal(&mac_srs->srs_async); 14728275SEric Cheng goto start; 14738275SEric Cheng } else { 14748275SEric Cheng mac_srs->srs_state |= SRS_POLL_THR_EXITED; 14758275SEric Cheng cv_signal(&mac_srs->srs_async); 14768275SEric Cheng CALLB_CPR_EXIT(&cprinfo); 14778275SEric Cheng thread_exit(); 14788275SEric Cheng } 14798275SEric Cheng } 14808275SEric Cheng 14818275SEric Cheng /* 14828275SEric Cheng * mac_srs_pick_chain 14838275SEric Cheng * 14848275SEric Cheng * In Bandwidth control case, checks how many packets can be processed 14858275SEric Cheng * and return them in a sub chain. 
14868275SEric Cheng */ 14878275SEric Cheng static mblk_t * 14888275SEric Cheng mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 14898275SEric Cheng size_t *chain_sz, int *chain_cnt) 14908275SEric Cheng { 14918275SEric Cheng mblk_t *head = NULL; 14928275SEric Cheng mblk_t *tail = NULL; 14938275SEric Cheng size_t sz; 14948275SEric Cheng size_t tsz = 0; 14958275SEric Cheng int cnt = 0; 14968275SEric Cheng mblk_t *mp; 14978275SEric Cheng 14988275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 14998275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 15008275SEric Cheng if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 15018275SEric Cheng mac_srs->srs_bw->mac_bw_limit) || 15028275SEric Cheng (mac_srs->srs_bw->mac_bw_limit == 0)) { 15038275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 15048275SEric Cheng head = mac_srs->srs_first; 15058275SEric Cheng mac_srs->srs_first = NULL; 15068275SEric Cheng *chain_tail = mac_srs->srs_last; 15078275SEric Cheng mac_srs->srs_last = NULL; 15088275SEric Cheng *chain_sz = mac_srs->srs_size; 15098275SEric Cheng *chain_cnt = mac_srs->srs_count; 15108275SEric Cheng mac_srs->srs_count = 0; 15118275SEric Cheng mac_srs->srs_size = 0; 15128275SEric Cheng return (head); 15138275SEric Cheng } 15148275SEric Cheng 15158275SEric Cheng /* 15168275SEric Cheng * Can't clear the entire backlog. 
15178275SEric Cheng * Need to find how many packets to pick 15188275SEric Cheng */ 15198275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 15208275SEric Cheng while ((mp = mac_srs->srs_first) != NULL) { 15218275SEric Cheng sz = msgdsize(mp); 15228275SEric Cheng if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 15238275SEric Cheng mac_srs->srs_bw->mac_bw_limit) { 15248275SEric Cheng if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 15258275SEric Cheng mac_srs->srs_bw->mac_bw_state |= 15268275SEric Cheng SRS_BW_ENFORCED; 15278275SEric Cheng break; 15288275SEric Cheng } 15298275SEric Cheng 15308275SEric Cheng /* 15318275SEric Cheng * The _size & cnt is decremented from the softrings 15328275SEric Cheng * when they send up the packet for polling to work 15338275SEric Cheng * properly. 15348275SEric Cheng */ 15358275SEric Cheng tsz += sz; 15368275SEric Cheng cnt++; 15378275SEric Cheng mac_srs->srs_count--; 15388275SEric Cheng mac_srs->srs_size -= sz; 15398275SEric Cheng if (tail != NULL) 15408275SEric Cheng tail->b_next = mp; 15418275SEric Cheng else 15428275SEric Cheng head = mp; 15438275SEric Cheng tail = mp; 15448275SEric Cheng mac_srs->srs_first = mac_srs->srs_first->b_next; 15458275SEric Cheng } 15468275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 15478275SEric Cheng if (mac_srs->srs_first == NULL) 15488275SEric Cheng mac_srs->srs_last = NULL; 15498275SEric Cheng 15508275SEric Cheng if (tail != NULL) 15518275SEric Cheng tail->b_next = NULL; 15528275SEric Cheng *chain_tail = tail; 15538275SEric Cheng *chain_cnt = cnt; 15548275SEric Cheng *chain_sz = tsz; 15558275SEric Cheng 15568275SEric Cheng return (head); 15578275SEric Cheng } 15588275SEric Cheng 15598275SEric Cheng /* 15608275SEric Cheng * mac_rx_srs_drain 15618275SEric Cheng * 15628275SEric Cheng * The SRS drain routine. Gets to run to clear the queue. Any thread 15638275SEric Cheng * (worker, interrupt, poll) can call this based on processing model. 
15648275SEric Cheng * The first thing we do is disable interrupts if possible and then 15658275SEric Cheng * drain the queue. we also try to poll the underlying hardware if 15668275SEric Cheng * there is a dedicated hardware Rx ring assigned to this SRS. 15678275SEric Cheng * 15688275SEric Cheng * There is a equivalent drain routine in bandwidth control mode 15698275SEric Cheng * mac_rx_srs_drain_bw. There is some code duplication between the two 15708275SEric Cheng * routines but they are highly performance sensitive and are easier 15718275SEric Cheng * to read/debug if they stay separate. Any code changes here might 15728275SEric Cheng * also apply to mac_rx_srs_drain_bw as well. 15738275SEric Cheng */ 15748275SEric Cheng void 15758275SEric Cheng mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 15768275SEric Cheng { 15778275SEric Cheng mblk_t *head; 15788275SEric Cheng mblk_t *tail; 15798275SEric Cheng timeout_id_t tid; 15808275SEric Cheng int cnt = 0; 15818275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 15828275SEric Cheng mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 15838275SEric Cheng 15848275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 15858275SEric Cheng ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL)); 15868833SVenu.Iyer@Sun.COM 15878275SEric Cheng /* If we are blanked i.e. 
can't do upcalls, then we are done */ 15888275SEric Cheng if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 15898275SEric Cheng ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 15908275SEric Cheng (mac_srs->srs_state & SRS_PAUSE)); 15918275SEric Cheng goto out; 15928275SEric Cheng } 15938275SEric Cheng 15948275SEric Cheng if (mac_srs->srs_first == NULL) 15958275SEric Cheng goto out; 15968275SEric Cheng 15978833SVenu.Iyer@Sun.COM if (!(mac_srs->srs_state & SRS_LATENCY_OPT) && 15988833SVenu.Iyer@Sun.COM (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) { 15998833SVenu.Iyer@Sun.COM /* 16008833SVenu.Iyer@Sun.COM * In the normal case, the SRS worker thread does no 16018833SVenu.Iyer@Sun.COM * work and we wait for a backlog to build up before 16028833SVenu.Iyer@Sun.COM * we switch into polling mode. In case we are 16038833SVenu.Iyer@Sun.COM * optimizing for throughput, we use the worker thread 16048833SVenu.Iyer@Sun.COM * as well. The goal is to let worker thread process 16058833SVenu.Iyer@Sun.COM * the queue and poll thread to feed packets into 16068833SVenu.Iyer@Sun.COM * the queue. As such, we should signal the poll 16078833SVenu.Iyer@Sun.COM * thread to try and get more packets. 16088833SVenu.Iyer@Sun.COM * 16098833SVenu.Iyer@Sun.COM * We could have pulled this check in the POLL_RING 16108833SVenu.Iyer@Sun.COM * macro itself but keeping it explicit here makes 16118833SVenu.Iyer@Sun.COM * the architecture more human understandable. 
16128833SVenu.Iyer@Sun.COM */ 16138833SVenu.Iyer@Sun.COM MAC_SRS_POLL_RING(mac_srs); 16148833SVenu.Iyer@Sun.COM } 16158833SVenu.Iyer@Sun.COM 16168833SVenu.Iyer@Sun.COM again: 16178275SEric Cheng head = mac_srs->srs_first; 16188275SEric Cheng mac_srs->srs_first = NULL; 16198275SEric Cheng tail = mac_srs->srs_last; 16208275SEric Cheng mac_srs->srs_last = NULL; 16218275SEric Cheng cnt = mac_srs->srs_count; 16228275SEric Cheng mac_srs->srs_count = 0; 16238275SEric Cheng 16248275SEric Cheng ASSERT(head != NULL); 16258275SEric Cheng ASSERT(tail != NULL); 16268275SEric Cheng 16278275SEric Cheng if ((tid = mac_srs->srs_tid) != 0) 16288275SEric Cheng mac_srs->srs_tid = 0; 16298275SEric Cheng 16308275SEric Cheng mac_srs->srs_state |= (SRS_PROC|proc_type); 16318275SEric Cheng 16328833SVenu.Iyer@Sun.COM 16338275SEric Cheng /* 16348275SEric Cheng * mcip is NULL for broadcast and multicast flows. The promisc 16358275SEric Cheng * callbacks for broadcast and multicast packets are delivered from 16368275SEric Cheng * mac_rx() and we don't need to worry about that case in this path 16378275SEric Cheng */ 16388275SEric Cheng if (mcip != NULL && mcip->mci_promisc_list != NULL) { 16398275SEric Cheng mutex_exit(&mac_srs->srs_lock); 16408275SEric Cheng mac_promisc_client_dispatch(mcip, head); 16418275SEric Cheng mutex_enter(&mac_srs->srs_lock); 16428275SEric Cheng } 16438275SEric Cheng 16448275SEric Cheng /* 16458275SEric Cheng * Check if SRS itself is doing the processing 16468275SEric Cheng * This direct path does not apply when subflows are present. In this 16478275SEric Cheng * case, packets need to be dispatched to a soft ring according to the 16488275SEric Cheng * flow's bandwidth and other resources contraints. 
16498275SEric Cheng */ 16508275SEric Cheng if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 16518275SEric Cheng mac_direct_rx_t proc; 16528275SEric Cheng void *arg1; 16538275SEric Cheng mac_resource_handle_t arg2; 16548275SEric Cheng 16558275SEric Cheng /* 16568275SEric Cheng * This is the case when a Rx is directly 16578275SEric Cheng * assigned and we have a fully classified 16588275SEric Cheng * protocol chain. We can deal with it in 16598275SEric Cheng * one shot. 16608275SEric Cheng */ 16618275SEric Cheng proc = srs_rx->sr_func; 16628275SEric Cheng arg1 = srs_rx->sr_arg1; 16638275SEric Cheng arg2 = srs_rx->sr_arg2; 16648275SEric Cheng 16658275SEric Cheng mac_srs->srs_state |= SRS_CLIENT_PROC; 16668275SEric Cheng mutex_exit(&mac_srs->srs_lock); 16678275SEric Cheng if (tid != 0) { 16688275SEric Cheng (void) untimeout(tid); 16698275SEric Cheng tid = 0; 16708275SEric Cheng } 16718275SEric Cheng 16728275SEric Cheng proc(arg1, arg2, head, NULL); 16738275SEric Cheng /* 16748275SEric Cheng * Decrement the size and count here itelf 16758275SEric Cheng * since the packet has been processed. 16768275SEric Cheng */ 16778275SEric Cheng mutex_enter(&mac_srs->srs_lock); 16788275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 16798275SEric Cheng if (mac_srs->srs_state & SRS_CLIENT_WAIT) 16808275SEric Cheng cv_signal(&mac_srs->srs_client_cv); 16818275SEric Cheng mac_srs->srs_state &= ~SRS_CLIENT_PROC; 16828275SEric Cheng } else { 16838275SEric Cheng /* Some kind of softrings based fanout is required */ 16848275SEric Cheng mutex_exit(&mac_srs->srs_lock); 16858275SEric Cheng if (tid != 0) { 16868275SEric Cheng (void) untimeout(tid); 16878275SEric Cheng tid = 0; 16888275SEric Cheng } 16898275SEric Cheng 16908275SEric Cheng /* 16918275SEric Cheng * Since the fanout routines can deal with chains, 16928275SEric Cheng * shoot the entire chain up. 
16938275SEric Cheng */ 16948275SEric Cheng if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 16958275SEric Cheng mac_rx_srs_fanout(mac_srs, head); 16968275SEric Cheng else 16978275SEric Cheng mac_rx_srs_proto_fanout(mac_srs, head); 16988275SEric Cheng mutex_enter(&mac_srs->srs_lock); 16998275SEric Cheng } 17008275SEric Cheng 17019820SEric Cheng if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) && 17029820SEric Cheng (mac_srs->srs_first != NULL)) { 17038833SVenu.Iyer@Sun.COM /* 17049820SEric Cheng * More packets arrived while we were clearing the 17059820SEric Cheng * SRS. This can be possible because of one of 17069820SEric Cheng * three conditions below: 17079820SEric Cheng * 1) The driver is using multiple worker threads 17089820SEric Cheng * to send the packets to us. 17099820SEric Cheng * 2) The driver has a race in switching 17109820SEric Cheng * between interrupt and polling mode or 17119820SEric Cheng * 3) Packets are arriving in this SRS via the 17129820SEric Cheng * S/W classification as well. 17139820SEric Cheng * 17149820SEric Cheng * We should switch to polling mode and see if we 17159820SEric Cheng * need to send the poll thread down. Also, signal 17169820SEric Cheng * the worker thread to process whats just arrived. 17178833SVenu.Iyer@Sun.COM */ 17189820SEric Cheng MAC_SRS_POLLING_ON(mac_srs); 17198833SVenu.Iyer@Sun.COM if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) { 17208833SVenu.Iyer@Sun.COM srs_rx->sr_drain_poll_sig++; 17218833SVenu.Iyer@Sun.COM MAC_SRS_POLL_RING(mac_srs); 17228833SVenu.Iyer@Sun.COM } 17239820SEric Cheng 17249820SEric Cheng /* 17259820SEric Cheng * If we didn't signal the poll thread, we need 17269820SEric Cheng * to deal with the pending packets ourselves. 
17279820SEric Cheng */ 17289820SEric Cheng if (proc_type == SRS_WORKER) { 17298275SEric Cheng srs_rx->sr_drain_again++; 17308275SEric Cheng goto again; 17319820SEric Cheng } else { 17329820SEric Cheng srs_rx->sr_drain_worker_sig++; 17339820SEric Cheng cv_signal(&mac_srs->srs_async); 17348275SEric Cheng } 17358275SEric Cheng } 17368275SEric Cheng 17378275SEric Cheng out: 17388275SEric Cheng if (mac_srs->srs_state & SRS_GET_PKTS) { 17398275SEric Cheng /* 17408275SEric Cheng * Poll thread is already running. Leave the 17418275SEric Cheng * SRS_RPOC set and hand over the control to 17428275SEric Cheng * poll thread. 17438275SEric Cheng */ 17448275SEric Cheng mac_srs->srs_state &= ~proc_type; 17458275SEric Cheng srs_rx->sr_drain_poll_running++; 17468275SEric Cheng return; 17478275SEric Cheng } 17488275SEric Cheng 17498275SEric Cheng /* 17508275SEric Cheng * Even if there are no packets queued in SRS, we 17518275SEric Cheng * need to make sure that the shared counter is 17528275SEric Cheng * clear and any associated softrings have cleared 17538275SEric Cheng * all the backlog. Otherwise, leave the interface 17548275SEric Cheng * in polling mode and the poll thread will get 17558275SEric Cheng * signalled once the count goes down to zero. 17568275SEric Cheng * 17578275SEric Cheng * If someone is already draining the queue (SRS_PROC is 17588275SEric Cheng * set) when the srs_poll_pkt_cnt goes down to zero, 17598275SEric Cheng * then it means that drain is already running and we 17608275SEric Cheng * will turn off polling at that time if there is 17618275SEric Cheng * no backlog. 17628275SEric Cheng * 17638275SEric Cheng * As long as there are packets queued either 17648275SEric Cheng * in soft ring set or its soft rings, we will leave 17658275SEric Cheng * the interface in polling mode (even if the drain 17668275SEric Cheng * was done being the interrupt thread). 
We signal 17678275SEric Cheng * the poll thread as well if we have dipped below 17688275SEric Cheng * low water mark. 17698275SEric Cheng * 17708275SEric Cheng * NOTE: We can't use the MAC_SRS_POLLING_ON macro 17718275SEric Cheng * since that turn polling on only for worker thread. 17728275SEric Cheng * Its not worth turning polling on for interrupt 17738275SEric Cheng * thread (since NIC will not issue another interrupt) 17748275SEric Cheng * unless a backlog builds up. 17758275SEric Cheng */ 17768275SEric Cheng if ((srs_rx->sr_poll_pkt_cnt > 0) && 17778275SEric Cheng (mac_srs->srs_state & SRS_POLLING_CAPAB)) { 17788275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|proc_type); 17798275SEric Cheng srs_rx->sr_drain_keep_polling++; 17808275SEric Cheng MAC_SRS_POLLING_ON(mac_srs); 17818275SEric Cheng if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) 17828275SEric Cheng MAC_SRS_POLL_RING(mac_srs); 17838275SEric Cheng return; 17848275SEric Cheng } 17858275SEric Cheng 17868275SEric Cheng /* Nothing else to do. Get out of poll mode */ 17878275SEric Cheng MAC_SRS_POLLING_OFF(mac_srs); 17888275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|proc_type); 17898275SEric Cheng srs_rx->sr_drain_finish_intr++; 17908275SEric Cheng } 17918275SEric Cheng 17928275SEric Cheng /* 17938275SEric Cheng * mac_rx_srs_drain_bw 17948275SEric Cheng * 17958275SEric Cheng * The SRS BW drain routine. Gets to run to clear the queue. Any thread 17968275SEric Cheng * (worker, interrupt, poll) can call this based on processing model. 17978275SEric Cheng * The first thing we do is disable interrupts if possible and then 17988275SEric Cheng * drain the queue. we also try to poll the underlying hardware if 17998275SEric Cheng * there is a dedicated hardware Rx ring assigned to this SRS. 18008275SEric Cheng * 18018275SEric Cheng * There is a equivalent drain routine in non bandwidth control mode 18028275SEric Cheng * mac_rx_srs_drain. 
There is some code duplication between the two 18038275SEric Cheng * routines but they are highly performance sensitive and are easier 18048275SEric Cheng * to read/debug if they stay separate. Any code changes here might 18058275SEric Cheng * also apply to mac_rx_srs_drain as well. 18068275SEric Cheng */ 18078275SEric Cheng void 18088275SEric Cheng mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 18098275SEric Cheng { 18108275SEric Cheng mblk_t *head; 18118275SEric Cheng mblk_t *tail; 18128275SEric Cheng timeout_id_t tid; 18138275SEric Cheng size_t sz = 0; 18148275SEric Cheng int cnt = 0; 18158275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 18168275SEric Cheng mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 1817*11066Srafael.vanoni@sun.com clock_t now; 18188275SEric Cheng 18198275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 18208275SEric Cheng ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 18218275SEric Cheng again: 18228275SEric Cheng /* Check if we are doing B/W control */ 18238275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 1824*11066Srafael.vanoni@sun.com now = ddi_get_lbolt(); 1825*11066Srafael.vanoni@sun.com if (mac_srs->srs_bw->mac_bw_curr_time != now) { 1826*11066Srafael.vanoni@sun.com mac_srs->srs_bw->mac_bw_curr_time = now; 18278275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 18288275SEric Cheng if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) 18298275SEric Cheng mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; 18308275SEric Cheng } else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) { 18318275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 18328275SEric Cheng goto done; 18338275SEric Cheng } else if (mac_srs->srs_bw->mac_bw_used > 18348275SEric Cheng mac_srs->srs_bw->mac_bw_limit) { 18358275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 18368275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 18378275SEric Cheng goto done; 18388275SEric Cheng } 18398275SEric Cheng 
mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 18408275SEric Cheng 18418275SEric Cheng /* If we are blanked i.e. can't do upcalls, then we are done */ 18428275SEric Cheng if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) { 18438275SEric Cheng ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) || 18448275SEric Cheng (mac_srs->srs_state & SRS_PAUSE)); 18458275SEric Cheng goto done; 18468275SEric Cheng } 18478275SEric Cheng 18488275SEric Cheng sz = 0; 18498275SEric Cheng cnt = 0; 18508275SEric Cheng if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) { 18518275SEric Cheng /* 18528275SEric Cheng * We couldn't pick up a single packet. 18538275SEric Cheng */ 18548275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 18558275SEric Cheng if ((mac_srs->srs_bw->mac_bw_used == 0) && 18568275SEric Cheng (mac_srs->srs_size != 0) && 18578275SEric Cheng !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 18588275SEric Cheng /* 18598275SEric Cheng * Seems like configured B/W doesn't 18608275SEric Cheng * even allow processing of 1 packet 18618275SEric Cheng * per tick. 18628275SEric Cheng * 18638275SEric Cheng * XXX: raise the limit to processing 18648275SEric Cheng * at least 1 packet per tick. 
18658275SEric Cheng */ 18668275SEric Cheng mac_srs->srs_bw->mac_bw_limit += 18678275SEric Cheng mac_srs->srs_bw->mac_bw_limit; 18688275SEric Cheng mac_srs->srs_bw->mac_bw_drop_threshold += 18698275SEric Cheng mac_srs->srs_bw->mac_bw_drop_threshold; 18708275SEric Cheng cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) " 18718275SEric Cheng "raised B/W limit to %d since not even a " 18728275SEric Cheng "single packet can be processed per " 18738275SEric Cheng "tick %d\n", (void *)mac_srs, 18748275SEric Cheng (int)mac_srs->srs_bw->mac_bw_limit, 18758275SEric Cheng (int)msgdsize(mac_srs->srs_first)); 18768275SEric Cheng } 18778275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 18788275SEric Cheng goto done; 18798275SEric Cheng } 18808275SEric Cheng 18818275SEric Cheng ASSERT(head != NULL); 18828275SEric Cheng ASSERT(tail != NULL); 18838275SEric Cheng 18848275SEric Cheng /* zero bandwidth: drop all and return to interrupt mode */ 18858275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 18868275SEric Cheng if (mac_srs->srs_bw->mac_bw_limit == 0) { 18878275SEric Cheng srs_rx->sr_drop_count += cnt; 18888275SEric Cheng ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz); 18898275SEric Cheng mac_srs->srs_bw->mac_bw_sz -= sz; 18908275SEric Cheng mac_srs->srs_bw->mac_bw_drop_bytes += sz; 18918275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 18928275SEric Cheng mac_pkt_drop(NULL, NULL, head, B_FALSE); 18938275SEric Cheng goto leave_poll; 18948275SEric Cheng } else { 18958275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 18968275SEric Cheng } 18978275SEric Cheng 18988275SEric Cheng if ((tid = mac_srs->srs_tid) != 0) 18998275SEric Cheng mac_srs->srs_tid = 0; 19008275SEric Cheng 19018275SEric Cheng mac_srs->srs_state |= (SRS_PROC|proc_type); 19028275SEric Cheng MAC_SRS_WORKER_POLLING_ON(mac_srs); 19038275SEric Cheng 19048275SEric Cheng /* 19058275SEric Cheng * mcip is NULL for broadcast and multicast flows. 
The promisc 19068275SEric Cheng * callbacks for broadcast and multicast packets are delivered from 19078275SEric Cheng * mac_rx() and we don't need to worry about that case in this path 19088275SEric Cheng */ 19098275SEric Cheng if (mcip != NULL && mcip->mci_promisc_list != NULL) { 19108275SEric Cheng mutex_exit(&mac_srs->srs_lock); 19118275SEric Cheng mac_promisc_client_dispatch(mcip, head); 19128275SEric Cheng mutex_enter(&mac_srs->srs_lock); 19138275SEric Cheng } 19148275SEric Cheng 19158275SEric Cheng /* 19168275SEric Cheng * Check if SRS itself is doing the processing 19178275SEric Cheng * This direct path does not apply when subflows are present. In this 19188275SEric Cheng * case, packets need to be dispatched to a soft ring according to the 19198275SEric Cheng * flow's bandwidth and other resources contraints. 19208275SEric Cheng */ 19218275SEric Cheng if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) { 19228275SEric Cheng mac_direct_rx_t proc; 19238275SEric Cheng void *arg1; 19248275SEric Cheng mac_resource_handle_t arg2; 19258275SEric Cheng 19268275SEric Cheng /* 19278275SEric Cheng * This is the case when a Rx is directly 19288275SEric Cheng * assigned and we have a fully classified 19298275SEric Cheng * protocol chain. We can deal with it in 19308275SEric Cheng * one shot. 19318275SEric Cheng */ 19328275SEric Cheng proc = srs_rx->sr_func; 19338275SEric Cheng arg1 = srs_rx->sr_arg1; 19348275SEric Cheng arg2 = srs_rx->sr_arg2; 19358275SEric Cheng 19368275SEric Cheng mac_srs->srs_state |= SRS_CLIENT_PROC; 19378275SEric Cheng mutex_exit(&mac_srs->srs_lock); 19388275SEric Cheng if (tid != 0) { 19398275SEric Cheng (void) untimeout(tid); 19408275SEric Cheng tid = 0; 19418275SEric Cheng } 19428275SEric Cheng 19438275SEric Cheng proc(arg1, arg2, head, NULL); 19448275SEric Cheng /* 19458275SEric Cheng * Decrement the size and count here itelf 19468275SEric Cheng * since the packet has been processed. 
19478275SEric Cheng */ 19488275SEric Cheng mutex_enter(&mac_srs->srs_lock); 19498275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 19508275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 19518275SEric Cheng 19528275SEric Cheng if (mac_srs->srs_state & SRS_CLIENT_WAIT) 19538275SEric Cheng cv_signal(&mac_srs->srs_client_cv); 19548275SEric Cheng mac_srs->srs_state &= ~SRS_CLIENT_PROC; 19558275SEric Cheng } else { 19568275SEric Cheng /* Some kind of softrings based fanout is required */ 19578275SEric Cheng mutex_exit(&mac_srs->srs_lock); 19588275SEric Cheng if (tid != 0) { 19598275SEric Cheng (void) untimeout(tid); 19608275SEric Cheng tid = 0; 19618275SEric Cheng } 19628275SEric Cheng 19638275SEric Cheng /* 19648275SEric Cheng * Since the fanout routines can deal with chains, 19658275SEric Cheng * shoot the entire chain up. 19668275SEric Cheng */ 19678275SEric Cheng if (mac_srs->srs_type & SRST_FANOUT_SRC_IP) 19688275SEric Cheng mac_rx_srs_fanout(mac_srs, head); 19698275SEric Cheng else 19708275SEric Cheng mac_rx_srs_proto_fanout(mac_srs, head); 19718275SEric Cheng mutex_enter(&mac_srs->srs_lock); 19728275SEric Cheng } 19738275SEric Cheng 19748275SEric Cheng /* 19758275SEric Cheng * Send the poll thread to pick up any packets arrived 19768275SEric Cheng * so far. This also serves as the last check in case 19778275SEric Cheng * nothing else is queued in the SRS. The poll thread 19788275SEric Cheng * is signalled only in the case the drain was done 19798275SEric Cheng * by the worker thread and SRS_WORKER is set. The 19808275SEric Cheng * worker thread can run in parallel as long as the 19818275SEric Cheng * SRS_WORKER flag is set. We we have nothing else to 19828275SEric Cheng * process, we can exit while leaving SRS_PROC set 19838275SEric Cheng * which gives the poll thread control to process and 19848275SEric Cheng * cleanup once it returns from the NIC. 
19858275SEric Cheng * 19868275SEric Cheng * If we have nothing else to process, we need to 19878275SEric Cheng * ensure that we keep holding the srs_lock till 19888275SEric Cheng * all the checks below are done and control is 19898275SEric Cheng * handed to the poll thread if it was running. 19908275SEric Cheng */ 19918275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 19928275SEric Cheng if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 19938275SEric Cheng if (mac_srs->srs_first != NULL) { 19948275SEric Cheng if (proc_type == SRS_WORKER) { 19958275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 19968275SEric Cheng if (srs_rx->sr_poll_pkt_cnt <= 19978275SEric Cheng srs_rx->sr_lowat) 19988275SEric Cheng MAC_SRS_POLL_RING(mac_srs); 19998275SEric Cheng goto again; 20008275SEric Cheng } else { 20018275SEric Cheng cv_signal(&mac_srs->srs_async); 20028275SEric Cheng } 20038275SEric Cheng } 20048275SEric Cheng } 20058275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 20068275SEric Cheng 20078275SEric Cheng done: 20088275SEric Cheng 20098275SEric Cheng if (mac_srs->srs_state & SRS_GET_PKTS) { 20108275SEric Cheng /* 20118275SEric Cheng * Poll thread is already running. Leave the 20128275SEric Cheng * SRS_RPOC set and hand over the control to 20138275SEric Cheng * poll thread. 20148275SEric Cheng */ 20158275SEric Cheng mac_srs->srs_state &= ~proc_type; 20168275SEric Cheng return; 20178275SEric Cheng } 20188275SEric Cheng 20198275SEric Cheng /* 20208275SEric Cheng * If we can't process packets because we have exceeded 20218275SEric Cheng * B/W limit for this tick, just set the timeout 20228275SEric Cheng * and leave. 20238275SEric Cheng * 20248275SEric Cheng * Even if there are no packets queued in SRS, we 20258275SEric Cheng * need to make sure that the shared counter is 20268275SEric Cheng * clear and any associated softrings have cleared 20278275SEric Cheng * all the backlog. 
Otherwise, leave the interface 20288275SEric Cheng * in polling mode and the poll thread will get 20298275SEric Cheng * signalled once the count goes down to zero. 20308275SEric Cheng * 20318275SEric Cheng * If someone is already draining the queue (SRS_PROC is 20328275SEric Cheng * set) when the srs_poll_pkt_cnt goes down to zero, 20338275SEric Cheng * then it means that drain is already running and we 20348275SEric Cheng * will turn off polling at that time if there is 20358275SEric Cheng * no backlog. As long as there are packets queued either 20368275SEric Cheng * is soft ring set or its soft rings, we will leave 20378275SEric Cheng * the interface in polling mode. 20388275SEric Cheng */ 20398275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 20408275SEric Cheng if ((mac_srs->srs_state & SRS_POLLING_CAPAB) && 20418275SEric Cheng ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) || 20428275SEric Cheng (srs_rx->sr_poll_pkt_cnt > 0))) { 20438275SEric Cheng MAC_SRS_POLLING_ON(mac_srs); 20448275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|proc_type); 20458275SEric Cheng if ((mac_srs->srs_first != NULL) && 20468275SEric Cheng (mac_srs->srs_tid == NULL)) 20478275SEric Cheng mac_srs->srs_tid = timeout(mac_srs_fire, 20488275SEric Cheng mac_srs, 1); 20498275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 20508275SEric Cheng return; 20518275SEric Cheng } 20528275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 20538275SEric Cheng 20548275SEric Cheng leave_poll: 20558275SEric Cheng 20568275SEric Cheng /* Nothing else to do. Get out of poll mode */ 20578275SEric Cheng MAC_SRS_POLLING_OFF(mac_srs); 20588275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|proc_type); 20598275SEric Cheng } 20608275SEric Cheng 20618275SEric Cheng /* 20628275SEric Cheng * mac_srs_worker 20638275SEric Cheng * 20648275SEric Cheng * The SRS worker routine. Drains the queue when no one else is 20658275SEric Cheng * processing it. 
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_async;
	callb_cpr_t		cprinfo;
	boolean_t		bw_ctl_flag;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

start:
	for (;;) {
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wakeup the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting for ever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer is already
				 * scheduled or we force schedule one for
				 * later so that we can continue processing
				 * after this quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			/* Re-evaluate SRS state after every wakeup. */
			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
		mac_srs_worker_quiesce(mac_srs);

	mac_srs->srs_state &= ~SRS_PROC;
	/* The macro drops the srs_lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

/*
 * mac_rx_srs_subflow_process
 *
 * Receive side routine called from interrupt path when there are
 * sub flows present on this SRS. Splits the incoming chain into
 * maximal sub-chains that classify to the same flow entry and hands
 * each sub-chain to that flow's callback (or to this SRS itself when
 * no flow matches).
 */
/* ARGSUSED */
void
mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	flow_entry_t		*flent = NULL;
	flow_entry_t		*prev_flent = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*tail = NULL;
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_client_impl_t	*mcip;

	mcip = mac_srs->srs_mcip;
	ASSERT(mcip != NULL);

	/*
	 * We need to determine the SRS for every packet
	 * by walking the flow table, if we don't get any,
	 * then we proceed using the SRS we came with.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {

		/*
		 * We will increment the stats for the matching subflow
		 * when we get the bytes/pkt count for the classified packets
		 * later in mac_rx_srs_process.
		 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		/*
		 * First packet, or same flow as the previous packet:
		 * keep extending the current sub-chain.
		 */
		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		/* Flow changed: terminate and dispatch the sub-chain. */
		tail->b_next = NULL;
		/*
		 * A null indicates, this is for the mac_srs itself.
		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}

/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_client_impl_t	*smcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);
	smcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
	}
	if (smcip != NULL) {
		smcip->mci_stat_ibytes += sz;
		smcip->mci_stat_ipackets += count;
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		/* Count the packets and bytes via interrupt */
		srs_rx->sr_intr_count += count;
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_drop_count += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				/* Whole chain fits under the threshold. */
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				/*
				 * Walk the chain and keep only the leading
				 * packets that fit under the drop threshold;
				 * everything from 'head' onward is dropped.
				 */
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_drop_count += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_drop_count += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
	/* Count the packets entering via interrupt path */
	srs_rx->sr_intr_count += count;

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback or if we are not
		 * optimizing for latency, we should signal the
		 * worker thread.
		 */
		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
			/*
			 * For loopback, we need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if it's a single packet in non
			 * latency optimized case (in latency optimized
			 * case, we inline process chains of any size).
			 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}

/* TX SIDE ROUTINES (RUNTIME) */

/*
 * mac_tx_srs_no_desc
 *
 * This routine is called by Tx single ring default mode
 * when Tx ring runs out of descs. Depending on 'flag' the
 * chain is dropped, returned to the caller, or queued on the
 * SRS; returns a non-NULL cookie when flow control applies.
 */
mac_tx_cookie_t
mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
	boolean_t wakeup_worker = B_TRUE;
	uint32_t tx_mode = srs_tx->st_mode;
	int cnt, sz;
	mblk_t *tail;

	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
	if (flag & MAC_DROP_ON_NO_DESC) {
		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
	} else {
		/* Only wake the worker if the queue was empty. */
		if (mac_srs->srs_first != NULL)
			wakeup_worker = B_FALSE;
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If TX_QUEUED is not set, queue the
			 * packet and let mac_tx_srs_drain()
			 * set the TX_BLOCKED bit for the
			 * reasons explained above. Otherwise,
			 * return the mblks.
			 */
			if (wakeup_worker) {
				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
				    mp_chain, tail, cnt, sz);
			} else {
				MAC_TX_SET_NO_ENQUEUE(mac_srs,
				    mp_chain, ret_mp, cookie);
			}
		} else {
			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
			    tail, cnt, sz, cookie);
		}
		if (wakeup_worker)
			cv_signal(&mac_srs->srs_async);
	}
	return (cookie);
}

/*
 * mac_tx_srs_enqueue
 *
 * This routine is called when Tx SRS is operating in either serializer
 * or bandwidth mode. In serializer mode, a packet will get enqueued
 * when a thread cannot enter SRS exclusively. In bandwidth mode,
 * packets get queued if allowed byte-count limit for a tick is
 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
 * MAC_TX_NO_ENQUEUE is set is different than when operating in either
 * the default mode or fanout mode. Here packets get dropped or
 * returned back to the caller only after hi-watermark worth of data
 * is queued.
 */
static mac_tx_cookie_t
mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	int cnt, sz;
	mblk_t *tail;
	boolean_t wakeup_worker = B_TRUE;

	/*
	 * Ignore fanout hint if we don't have multiple tx rings.
	 */
	if (!TX_MULTI_RING_MODE(mac_srs))
		fanout_hint = 0;

	if (mac_srs->srs_first != NULL)
		wakeup_worker = B_FALSE;
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		/* Drop only once hi-watermark worth of data is queued. */
		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		} else {
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else if (flag & MAC_TX_NO_ENQUEUE) {
		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
			    ret_mp, cookie);
		} else {
			/* The fanout hint rides along in b_prev. */
			mp_chain->b_prev = (mblk_t *)fanout_hint;
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else {
		/*
		 * If you are BW_ENFORCED, just enqueue the
		 * packet. srs_worker will drain it at the
		 * prescribed rate. Before enqueueing, save
		 * the fanout hint.
		 */
		mp_chain->b_prev = (mblk_t *)fanout_hint;
		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
		    tail, cnt, sz, cookie);
	}
	if (wakeup_worker)
		cv_signal(&mac_srs->srs_async);
	return (cookie);
}

/*
 * There are five tx modes:
 *
 * 1) Default mode (SRS_TX_DEFAULT)
 * 2) Serialization mode (SRS_TX_SERIALIZE)
 * 3) Fanout mode (SRS_TX_FANOUT)
 * 4) Bandwidth mode (SRS_TX_BW)
 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
 *
 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
 * based on the number of Tx rings requested for an SRS and whether
 * bandwidth control is requested or not.
 *
 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
 * When flow-control is relieved, the srs_worker drains the queued
 * packets and informs blocked clients to restart sending packets.
 *
 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
 *
 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
 * Tx rings. Each Tx ring will have a soft ring associated with it.
 * These soft rings will be hung off the Tx SRS. Queueing if it happens
 * due to lack of Tx desc will be in individual soft ring (and not srs)
 * associated with Tx ring.
 *
 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
 * only if bw is available. Otherwise the packets will be queued in
 * SRS. If fanout to multiple Tx rings is configured, the packets will
 * be fanned out among the soft rings associated with the Tx rings.
 *
 * Three flags are used in srs_state for indicating flow control
 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
 * driver below.
 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
 * and flow-control pressure is applied back to clients. The clients expect
 * wakeup when flow-control is relieved.
 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
 * got returned back to client either due to lack of Tx descs or due to bw
 * control reasons. The clients expect a wakeup when condition is relieved.
 *
 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
 * MAC_TX_NO_ENQUEUE
 * Mac clients that do not want packets to be enqueued in the mac layer set
 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
 * behaviour of this flag is different when the Tx is running in serializer
 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
 * get dropped when Tx high watermark is reached.
 * There are some mac clients like vsw, aggr that want the mblks to be
 * returned back to clients instead of being queued in Tx SRS (or Tx soft
 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
 * In the default and Tx fanout mode, the un-transmitted mblks will be
 * returned back to the clients when the driver runs out of Tx descs.
 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
 * soft ring) so that the clients can be woken up when Tx desc become
 * available. When running in serializer or bandwidth mode,
 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
26128275SEric Cheng */ 26138275SEric Cheng 26148275SEric Cheng mac_tx_func_t 26158275SEric Cheng mac_tx_get_func(uint32_t mode) 26168275SEric Cheng { 26178275SEric Cheng return (mac_tx_mode_list[mode].mac_tx_func); 26188275SEric Cheng } 26198275SEric Cheng 26208275SEric Cheng /* ARGSUSED */ 26218275SEric Cheng static mac_tx_cookie_t 26228275SEric Cheng mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 26238275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 26248275SEric Cheng { 26258275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 26268275SEric Cheng boolean_t is_subflow; 26278275SEric Cheng mac_tx_stats_t stats; 26288275SEric Cheng mac_tx_cookie_t cookie = NULL; 26298275SEric Cheng 26308275SEric Cheng ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 26318275SEric Cheng 26328275SEric Cheng /* Regular case with a single Tx ring */ 26338275SEric Cheng /* 26348275SEric Cheng * SRS_TX_BLOCKED is set when underlying NIC runs 26358275SEric Cheng * out of Tx descs and messages start getting 26368275SEric Cheng * queued. It won't get reset until 26378275SEric Cheng * tx_srs_drain() completely drains out the 26388275SEric Cheng * messages. 26398275SEric Cheng */ 26408275SEric Cheng if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 26418275SEric Cheng /* Tx descs/resources not available */ 26428275SEric Cheng mutex_enter(&mac_srs->srs_lock); 26438275SEric Cheng if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 26448275SEric Cheng cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 26458275SEric Cheng flag, ret_mp); 26468275SEric Cheng mutex_exit(&mac_srs->srs_lock); 26478275SEric Cheng return (cookie); 26488275SEric Cheng } 26498275SEric Cheng /* 26508275SEric Cheng * While we were computing mblk count, the 26518275SEric Cheng * flow control condition got relieved. 26528275SEric Cheng * Continue with the transmission. 
26538275SEric Cheng */ 26548275SEric Cheng mutex_exit(&mac_srs->srs_lock); 26558275SEric Cheng } 26568275SEric Cheng 26578275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 26588275SEric Cheng 26598275SEric Cheng mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 26608275SEric Cheng mp_chain, (is_subflow ? &stats : NULL)); 26618275SEric Cheng 26628275SEric Cheng /* 26638275SEric Cheng * Multiple threads could be here sending packets. 26648275SEric Cheng * Under such conditions, it is not possible to 26658275SEric Cheng * automically set SRS_TX_BLOCKED bit to indicate 26668275SEric Cheng * out of tx desc condition. To atomically set 26678275SEric Cheng * this, we queue the returned packet and do 26688275SEric Cheng * the setting of SRS_TX_BLOCKED in 26698275SEric Cheng * mac_tx_srs_drain(). 26708275SEric Cheng */ 26718275SEric Cheng if (mp_chain != NULL) { 26728275SEric Cheng mutex_enter(&mac_srs->srs_lock); 26738275SEric Cheng cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 26748275SEric Cheng mutex_exit(&mac_srs->srs_lock); 26758275SEric Cheng return (cookie); 26768275SEric Cheng } 26778275SEric Cheng 26788275SEric Cheng if (is_subflow) 26798275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 26808275SEric Cheng 26818275SEric Cheng return (NULL); 26828275SEric Cheng } 26838275SEric Cheng 26848275SEric Cheng /* 26858275SEric Cheng * mac_tx_serialize_mode 26868275SEric Cheng * 26878275SEric Cheng * This is an experimental mode implemented as per the request of PAE. 26888275SEric Cheng * In this mode, all callers attempting to send a packet to the NIC 26898275SEric Cheng * will get serialized. Only one thread at any time will access the 26908275SEric Cheng * NIC to send the packet out. 
 */
/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	/* Single ring, serialize below */
	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
	mutex_enter(&mac_srs->srs_lock);
	/*
	 * If packets are already queued, or another thread is inside the
	 * send path (SRS_PROC), we must queue behind them to preserve
	 * ordering and the one-sender-at-a-time invariant.
	 */
	if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_state & SRS_PROC)) {
		/*
		 * In serialization mode, queue all packets until
		 * TX_HIWAT is set.
		 * If drop bit is set, drop if TX_HIWAT is set.
		 * If no_enqueue is set, still enqueue until hiwat
		 * is set and return mblks after TX_HIWAT is set.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
		    flag, NULL, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	/*
	 * No packets queued, nothing on proc and no flow
	 * control condition. Fast-path, ok. Do inline
	 * processing.
	 */
	mac_srs->srs_state |= SRS_PROC;
	mutex_exit(&mac_srs->srs_lock);

	/* Only collect per-flow Tx stats when this SRS backs a subflow. */
	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	/* Send inline; lock is dropped across the (potentially slow) send. */
	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state &= ~SRS_PROC;
	if (mp_chain != NULL) {
		/* Partial send: queue the unsent remainder. */
		cookie = mac_tx_srs_enqueue(mac_srs,
		    mp_chain, flag, NULL, ret_mp);
	}
	if (mac_srs->srs_first != NULL) {
		/*
		 * We processed inline our packet and a new
		 * packet/s got queued while we were
		 * processing. Wakeup srs worker
		 */
		cv_signal(&mac_srs->srs_async);
	}
	mutex_exit(&mac_srs->srs_lock);

	/* cookie == NULL means everything went out; stats are valid. */
	if (is_subflow && cookie == NULL)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (cookie);
}

/*
 * mac_tx_fanout_mode
 *
 * In this mode, the SRS will have access to multiple Tx rings to send
 * the packet out. The fanout hint that is passed as an argument is
 * used to find an appropriate ring to fanout the traffic. Each Tx
 * ring, in turn, will have a soft ring associated with it. If a Tx
 * ring runs out of Tx desc's the returned packet will be queued in
 * the soft ring associated with that Tx ring. The srs itself will not
 * queue any packets.
27648275SEric Cheng */ 27658833SVenu.Iyer@Sun.COM 27668833SVenu.Iyer@Sun.COM #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 27678833SVenu.Iyer@Sun.COM index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 27688833SVenu.Iyer@Sun.COM softring = mac_srs->srs_oth_soft_rings[index]; \ 27698833SVenu.Iyer@Sun.COM cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 27708833SVenu.Iyer@Sun.COM DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 27718833SVenu.Iyer@Sun.COM } 27728833SVenu.Iyer@Sun.COM 27738275SEric Cheng static mac_tx_cookie_t 27748275SEric Cheng mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 27758275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 27768275SEric Cheng { 27778275SEric Cheng mac_soft_ring_t *softring; 27788833SVenu.Iyer@Sun.COM uint64_t hash; 27798833SVenu.Iyer@Sun.COM uint_t index; 27808833SVenu.Iyer@Sun.COM mac_tx_cookie_t cookie = NULL; 27818275SEric Cheng 27828275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 27838833SVenu.Iyer@Sun.COM if (fanout_hint != 0) { 27848833SVenu.Iyer@Sun.COM /* 27858833SVenu.Iyer@Sun.COM * The hint is specified by the caller, simply pass the 27868833SVenu.Iyer@Sun.COM * whole chain to the soft ring. 27878833SVenu.Iyer@Sun.COM */ 27888833SVenu.Iyer@Sun.COM hash = HASH_HINT(fanout_hint); 27898833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(mp_chain); 27908833SVenu.Iyer@Sun.COM } else { 27918833SVenu.Iyer@Sun.COM mblk_t *last_mp, *cur_mp, *sub_chain; 27928833SVenu.Iyer@Sun.COM uint64_t last_hash = 0; 27938833SVenu.Iyer@Sun.COM uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 27948833SVenu.Iyer@Sun.COM 27958833SVenu.Iyer@Sun.COM /* 27968833SVenu.Iyer@Sun.COM * Compute the hash from the contents (headers) of the 27978833SVenu.Iyer@Sun.COM * packets of the mblk chain. Split the chains into 27988833SVenu.Iyer@Sun.COM * subchains of the same conversation. 
27998833SVenu.Iyer@Sun.COM * 28008833SVenu.Iyer@Sun.COM * Since there may be more than one ring used for 28018833SVenu.Iyer@Sun.COM * sub-chains of the same call, and since the caller 28028833SVenu.Iyer@Sun.COM * does not maintain per conversation state since it 28038833SVenu.Iyer@Sun.COM * passed a zero hint, unsent subchains will be 28048833SVenu.Iyer@Sun.COM * dropped. 28058833SVenu.Iyer@Sun.COM */ 28068833SVenu.Iyer@Sun.COM 28078833SVenu.Iyer@Sun.COM flag |= MAC_DROP_ON_NO_DESC; 28088833SVenu.Iyer@Sun.COM ret_mp = NULL; 28098833SVenu.Iyer@Sun.COM 28108833SVenu.Iyer@Sun.COM ASSERT(ret_mp == NULL); 28118833SVenu.Iyer@Sun.COM 28128833SVenu.Iyer@Sun.COM sub_chain = NULL; 28138833SVenu.Iyer@Sun.COM last_mp = NULL; 28148833SVenu.Iyer@Sun.COM 28158833SVenu.Iyer@Sun.COM for (cur_mp = mp_chain; cur_mp != NULL; 28168833SVenu.Iyer@Sun.COM cur_mp = cur_mp->b_next) { 28178833SVenu.Iyer@Sun.COM hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 28188833SVenu.Iyer@Sun.COM B_TRUE); 28198833SVenu.Iyer@Sun.COM if (last_hash != 0 && hash != last_hash) { 28208833SVenu.Iyer@Sun.COM /* 28218833SVenu.Iyer@Sun.COM * Starting a different subchain, send current 28228833SVenu.Iyer@Sun.COM * chain out. 
28238833SVenu.Iyer@Sun.COM */ 28248833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 28258833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 28268833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 28278833SVenu.Iyer@Sun.COM sub_chain = NULL; 28288833SVenu.Iyer@Sun.COM } 28298833SVenu.Iyer@Sun.COM 28308833SVenu.Iyer@Sun.COM /* add packet to subchain */ 28318833SVenu.Iyer@Sun.COM if (sub_chain == NULL) 28328833SVenu.Iyer@Sun.COM sub_chain = cur_mp; 28338833SVenu.Iyer@Sun.COM last_mp = cur_mp; 28348833SVenu.Iyer@Sun.COM last_hash = hash; 28358833SVenu.Iyer@Sun.COM } 28368833SVenu.Iyer@Sun.COM 28378833SVenu.Iyer@Sun.COM if (sub_chain != NULL) { 28388833SVenu.Iyer@Sun.COM /* send last subchain */ 28398833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 28408833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 28418833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 28428833SVenu.Iyer@Sun.COM } 28438833SVenu.Iyer@Sun.COM 28448833SVenu.Iyer@Sun.COM cookie = NULL; 28458833SVenu.Iyer@Sun.COM } 28468833SVenu.Iyer@Sun.COM 28478833SVenu.Iyer@Sun.COM return (cookie); 28488275SEric Cheng } 28498275SEric Cheng 28508275SEric Cheng /* 28518275SEric Cheng * mac_tx_bw_mode 28528275SEric Cheng * 28538275SEric Cheng * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 28548275SEric Cheng * only if bw is available. Otherwise the packets will be queued in 28558275SEric Cheng * SRS. If the SRS has multiple Tx rings, then packets will get fanned 28568275SEric Cheng * out to a Tx rings. 
 */
static mac_tx_cookie_t
mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	int			cnt, sz;
	mblk_t			*tail;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	clock_t			now;

	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		/*
		 * zero bandwidth, no traffic is sent: drop the packets,
		 * or return the whole chain if the caller requests all
		 * unsent packets back.
		 */
		if (flag & MAC_TX_NO_ENQUEUE) {
			cookie = (mac_tx_cookie_t)mac_srs;
			*ret_mp = mp_chain;
		} else {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		}
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	} else if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		/*
		 * Packets already queued, or the bandwidth limit is being
		 * enforced: queue behind them to preserve ordering.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
		    fanout_hint, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	/*
	 * Bandwidth usage is accounted per lbolt tick: a new tick resets
	 * the running byte count.
	 */
	now = ddi_get_lbolt();
	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
		mac_srs->srs_bw->mac_bw_curr_time = now;
		mac_srs->srs_bw->mac_bw_used = 0;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		/* Over quota for this tick: enforce and queue the chain. */
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
		    mp_chain, tail, cnt, sz);
		/*
		 * Wakeup worker thread. Note that worker
		 * thread has to be woken up so that it
		 * can fire up the timer to be woken up
		 * on the next tick. Also once
		 * BW_ENFORCED is set, it can only be
		 * reset by srs_worker thread. Until then
		 * all packets will get queued up in SRS
		 * and hence this code path won't be
		 * entered until BW_ENFORCED is reset.
		 */
		cv_signal(&mac_srs->srs_async);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	mac_srs->srs_bw->mac_bw_used += sz;
	mutex_exit(&mac_srs->srs_lock);

	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
		/* Multiple Tx rings: fan out to a soft ring by hint hash. */
		mac_soft_ring_t *softring;
		uint_t indx, hash;

		hash = HASH_HINT(fanout_hint);
		indx = COMPUTE_INDEX(hash,
		    mac_srs->srs_oth_ring_count);
		softring = mac_srs->srs_oth_soft_rings[indx];
		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
		    ret_mp));
	} else {
		boolean_t		is_subflow;
		mac_tx_stats_t		stats;

		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
		    mp_chain, (is_subflow ? &stats : NULL));

		if (mp_chain != NULL) {
			/*
			 * Partial send: give the unsent bytes' bandwidth
			 * back before queueing the remainder.
			 */
			mutex_enter(&mac_srs->srs_lock);
			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
			if (mac_srs->srs_bw->mac_bw_used > sz)
				mac_srs->srs_bw->mac_bw_used -= sz;
			else
				mac_srs->srs_bw->mac_bw_used = 0;
			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
			    fanout_hint, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		if (is_subflow)
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

		return (NULL);
	}
}

/*
 * mac_tx_srs_drain
 *
 * Drain packets queued on the SRS, dispatching per the SRS Tx mode.
 * Called (and returns) with srs_lock held; the lock is dropped across
 * the actual sends.  proc_type is unused (ARGSUSED).
 */
/* ARGSUSED */
void
mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head, *tail;
	size_t			sz;
	uint32_t		tx_mode;
	uint_t			saved_pkt_count;
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	clock_t			now;

	saved_pkt_count = 0;
	ASSERT(mutex_owned(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_state & SRS_PROC));

	mac_srs->srs_state |= SRS_PROC;

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
	tx_mode = srs_tx->st_mode;
	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
		if (mac_srs->srs_first != NULL) {
			/* Detach the whole queue, then send with lock dropped. */
			head = mac_srs->srs_first;
			tail = mac_srs->srs_last;
			saved_pkt_count = mac_srs->srs_count;
			mac_srs->srs_first = NULL;
			mac_srs->srs_last = NULL;
			mac_srs->srs_count = 0;
			mutex_exit(&mac_srs->srs_lock);

			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
			    head, &stats);

			mutex_enter(&mac_srs->srs_lock);
			if (head != NULL) {
				/* Device out of tx desc, set block */
				if (head->b_next == NULL)
					VERIFY(head == tail);
				/* Re-prepend the unsent remainder. */
				tail->b_next = mac_srs->srs_first;
				mac_srs->srs_first = head;
				mac_srs->srs_count +=
				    (saved_pkt_count - stats.ts_opackets);
				if (mac_srs->srs_last == NULL)
					mac_srs->srs_last = tail;
				MAC_TX_SRS_BLOCK(mac_srs, head);
			} else {
				srs_tx->st_woken_up = B_FALSE;
				if (is_subflow) {
					FLOW_TX_STATS_UPDATE(
					    mac_srs->srs_flent, &stats);
				}
			}
		}
	} else if (tx_mode == SRS_TX_BW) {
		/*
		 * We are here because the timer fired and we have some data
		 * to transmit. Also mac_tx_srs_worker should have reset
		 * SRS_BW_ENFORCED flag
		 */
		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
		/*
		 * Pull packets off the queue one at a time until the
		 * bandwidth quota for this tick is consumed; [head, tail]
		 * accumulates the subchain that will be sent.
		 */
		head = tail = mac_srs->srs_first;
		while (mac_srs->srs_first != NULL) {
			tail = mac_srs->srs_first;
			tail->b_prev = NULL;
			mac_srs->srs_first = tail->b_next;
			if (mac_srs->srs_first == NULL)
				mac_srs->srs_last = NULL;
			mac_srs->srs_count--;
			sz = msgdsize(tail);
			mac_srs->srs_size -= sz;
			saved_pkt_count++;
			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);

			if (mac_srs->srs_bw->mac_bw_used <
			    mac_srs->srs_bw->mac_bw_limit)
				continue;

			/* Quota hit: a new tick resets usage, else enforce. */
			now = ddi_get_lbolt();
			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
				mac_srs->srs_bw->mac_bw_curr_time = now;
				mac_srs->srs_bw->mac_bw_used = sz;
				continue;
			}
			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
			break;
		}

		ASSERT((head == NULL && tail == NULL) ||
		    (head != NULL && tail != NULL));
		if (tail != NULL) {
			tail->b_next = NULL;
			mutex_exit(&mac_srs->srs_lock);

			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
			    head, &stats);

			mutex_enter(&mac_srs->srs_lock);
			if (head != NULL) {
				uint_t size_sent;

				/* Device out of tx desc, set block */
				if (head->b_next == NULL)
					VERIFY(head == tail);
				/* Re-prepend remainder, undo its accounting. */
				tail->b_next = mac_srs->srs_first;
				mac_srs->srs_first = head;
				mac_srs->srs_count +=
				    (saved_pkt_count - stats.ts_opackets);
				if (mac_srs->srs_last == NULL)
					mac_srs->srs_last = tail;
				size_sent = sz - stats.ts_obytes;
				mac_srs->srs_size += size_sent;
				mac_srs->srs_bw->mac_bw_sz += size_sent;
				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
					mac_srs->srs_bw->mac_bw_used -=
					    size_sent;
				} else {
					mac_srs->srs_bw->mac_bw_used = 0;
				}
				MAC_TX_SRS_BLOCK(mac_srs, head);
			} else {
				srs_tx->st_woken_up = B_FALSE;
				if (is_subflow) {
					FLOW_TX_STATS_UPDATE(
					    mac_srs->srs_flent, &stats);
				}
			}
		}
	} else if (tx_mode == SRS_TX_BW_FANOUT) {
		mblk_t *prev;
		mac_soft_ring_t *softring;
		uint64_t hint;

		/*
		 * We are here because the timer fired and we
		 * have some quota to transmit.
		 */
		/*
		 * The fanout hint was stashed in b_prev at enqueue time
		 * (presumably by mac_tx_srs_enqueue — confirm); packets
		 * with the same hint are batched to the same soft ring.
		 */
		prev = NULL;
		head = tail = mac_srs->srs_first;
		while (mac_srs->srs_first != NULL) {
			tail = mac_srs->srs_first;
			mac_srs->srs_first = tail->b_next;
			if (mac_srs->srs_first == NULL)
				mac_srs->srs_last = NULL;
			mac_srs->srs_count--;
			sz = msgdsize(tail);
			mac_srs->srs_size -= sz;
			mac_srs->srs_bw->mac_bw_used += sz;
			if (prev == NULL)
				hint = (ulong_t)tail->b_prev;
			if (hint != (ulong_t)tail->b_prev) {
				/* Hint changed: flush the batch so far. */
				prev->b_next = NULL;
				mutex_exit(&mac_srs->srs_lock);
				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
				head = tail;
				hint = (ulong_t)tail->b_prev;
				mutex_enter(&mac_srs->srs_lock);
			}

			prev = tail;
			tail->b_prev = NULL;
			if (mac_srs->srs_bw->mac_bw_used <
			    mac_srs->srs_bw->mac_bw_limit)
				continue;

			/* Quota hit: a new tick resets usage, else enforce. */
			now = ddi_get_lbolt();
			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
				mac_srs->srs_bw->mac_bw_curr_time = now;
				mac_srs->srs_bw->mac_bw_used = 0;
				continue;
			}
			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
			break;
		}
		ASSERT((head == NULL && tail == NULL) ||
		    (head != NULL && tail != NULL));
		if (tail != NULL) {
			/* Flush the final batch. */
			tail->b_next = NULL;
			mutex_exit(&mac_srs->srs_lock);
			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
			mutex_enter(&mac_srs->srs_lock);
		}
	}
	/*
	 * SRS_TX_FANOUT case not considered here because packets
	 * won't be queued in the SRS for this case. Packets will
	 * be sent directly to soft rings underneath and if there
	 * is any queueing at all, it would be in Tx side soft
	 * rings.
	 */

	/*
	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
	 */
	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
		mac_tx_notify_cb_t *mtnfp;
		mac_cb_t *mcb;
		mac_client_impl_t *mcip = mac_srs->srs_mcip;
		boolean_t wakeup_required = B_FALSE;

		if (mac_srs->srs_state &
		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
			wakeup_required = B_TRUE;
		}
		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
		mutex_exit(&mac_srs->srs_lock);
		if (wakeup_required) {
			/* Wakeup callback registered clients */
			MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
			for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
				mtnfp->mtnf_fn(mtnfp->mtnf_arg,
				    (mac_tx_cookie_t)mac_srs);
			}
			MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
			    &mcip->mci_tx_notify_cb_list);
			/*
			 * If the client is not the primary MAC client, then we
			 * need to send the notification to the clients upper
			 * MAC, i.e. mci_upper_mip.
			 */
			mac_tx_notify(mcip->mci_upper_mip != NULL ?
			    mcip->mci_upper_mip : mcip->mci_mip);
		}
		mutex_enter(&mac_srs->srs_lock);
	}
	mac_srs->srs_state &= ~SRS_PROC;
}

/*
 * Given a packet, get the flow_entry that identifies the flow
 * to which that packet belongs. The flow_entry will contain
 * the transmit function to be used to send the packet. If the
 * function returns NULL, the packet should be sent using the
 * underlying NIC.
 */
static flow_entry_t *
mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
{
	flow_entry_t		*flent = NULL;
	mac_client_impl_t	*mcip;
	int	err;

	/*
	 * Do classification on the packet.
	 */
	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
	if (err != 0)
		return (NULL);

	/*
	 * This flent might just be an additional one on the MAC client,
	 * i.e. for classification purposes (different fdesc), however
	 * the resources, SRS et. al., are in the mci_flent, so if
	 * this isn't the mci_flent, we need to get it.
	 */
	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
		/* Swap the refhold over to the client's primary flent. */
		FLOW_REFRELE(flent);
		flent = mcip->mci_flent;
		FLOW_TRY_REFHOLD(flent, err);
		if (err != 0)
			return (NULL);
	}

	return (flent);
}

/*
 * This macro is only meant to be used by mac_tx_send().
 * Drops the packet (counting an error) on a failed VID check, or
 * replaces mp with a VLAN-tagged copy when add_tag is set; on any
 * failure it advances mp to next and continues the enclosing loop.
 */
#define	CHECK_VID_AND_ADD_TAG(mp) {			\
	if (vid_check) {				\
		int err = 0;				\
							\
		MAC_VID_CHECK(src_mcip, (mp), err);	\
		if (err != 0) {				\
			freemsg((mp));			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
	if (add_tag) {					\
		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
		if ((mp) == NULL) {			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
}

/*
 * mac_tx_send
 *
 * Transmit a chain of packets for the given MAC client, looping back
 * packets classified to local destinations and sending the rest down
 * the NIC (via MAC_TX).  Updates the client's Tx byte/packet/error
 * counters, and fills *stats when stats is non-NULL.  Returns the
 * unsent remainder of the chain (NULL if everything was sent).
 */
mblk_t *
mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
    mac_tx_stats_t *stats)
{
	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = src_mcip->mci_mip;
	uint_t obytes = 0, opackets = 0, oerrors = 0;
	mblk_t *mp = NULL, *next;
	boolean_t vid_check, add_tag;
	uint16_t vid = 0;

	/* VID checking/tagging is only relevant with multiple clients. */
	if (mip->mi_nclients > 1) {
		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
		add_tag = MAC_TAG_NEEDED(src_mcip);
		if (add_tag)
			vid = mac_client_vid(mch);
	} else {
		ASSERT(mip->mi_nclients == 1);
		vid_check = add_tag = B_FALSE;
	}

	/*
	 * Fastpath: if there's only one client, and there's no
	 * multicast listeners, we simply send the packet down to the
	 * underlying NIC.
	 */
	if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) {
		DTRACE_PROBE2(fastpath,
		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);

		mp = mp_chain;
		while (mp != NULL) {
			next = mp->b_next;
			mp->b_next = NULL;
			opackets++;
			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
			    msgdsize(mp));

			CHECK_VID_AND_ADD_TAG(mp);
			MAC_TX(mip, ring, mp,
			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
			    0));

			/*
			 * If the driver is out of descriptors and does a
			 * partial send it will return a chain of unsent
			 * mblks. Adjust the accounting stats.
			 */
			if (mp != NULL) {
				opackets--;
				obytes -= msgdsize(mp);
				mp->b_next = next;
				break;
			}
			mp = next;
		}
		goto done;
	}

	/*
	 * No fastpath, we either have more than one MAC client
	 * defined on top of the same MAC, or one or more MAC
	 * client promiscuous callbacks.
	 */
	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);

	mp = mp_chain;
	while (mp != NULL) {
		flow_entry_t *dst_flow_ent;
		void *flow_cookie;
		size_t	pkt_size;
		mblk_t *mp1;

		next = mp->b_next;
		mp->b_next = NULL;
		opackets++;
		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
		obytes += pkt_size;
		CHECK_VID_AND_ADD_TAG(mp);

		/*
		 * Check if there are promiscuous mode callbacks defined.
		 */
		if (mip->mi_promisc_list != NULL)
			mac_promisc_dispatch(mip, mp, src_mcip);

		/*
		 * Find the destination.
		 */
		dst_flow_ent = mac_tx_classify(mip, mp);

		if (dst_flow_ent != NULL) {
			size_t	hdrsize;
			int	err = 0;

			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
				struct ether_vlan_header *evhp =
				    (struct ether_vlan_header *)mp->b_rptr;

				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
					hdrsize = sizeof (*evhp);
				else
					hdrsize = sizeof (struct ether_header);
			} else {
				mac_header_info_t	mhi;

				err = mac_header_info((mac_handle_t)mip,
				    mp, &mhi);
				if (err == 0)
					hdrsize = mhi.mhi_hdrsize;
			}

			/*
			 * Got a matching flow. It's either another
			 * MAC client, or a broadcast/multicast flow.
			 * Make sure the packet size is within the
			 * allowed size. If not drop the packet and
			 * move to next packet.
			 */
			if (err != 0 ||
			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
				oerrors++;
				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
				    mblk_t *, mp);
				freemsg(mp);
				mp = next;
				FLOW_REFRELE(dst_flow_ent);
				continue;
			}
			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
			if (flow_cookie != NULL) {
				/*
				 * The vnic_bcast_send function expects
				 * to receive the sender MAC client
				 * as value for arg2.
				 */
				mac_bcast_send(flow_cookie, src_mcip, mp,
				    B_TRUE);
			} else {
				/*
				 * loopback the packet to a
				 * local MAC client. We force a context
				 * switch if both source and destination
				 * MAC clients are used by IP, i.e. bypass
				 * is set.
				 */
				boolean_t do_switch;
				mac_client_impl_t *dst_mcip =
				    dst_flow_ent->fe_mcip;

				do_switch = ((src_mcip->mci_state_flags &
				    dst_mcip->mci_state_flags &
				    MCIS_CLIENT_POLL_CAPABLE) != 0);

				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
					(dst_flow_ent->fe_cb_fn)(
					    dst_flow_ent->fe_cb_arg1,
					    dst_flow_ent->fe_cb_arg2,
					    mp1, do_switch);
				}
			}
			FLOW_REFRELE(dst_flow_ent);
		} else {
			/*
			 * Unknown destination, send via the underlying
			 * NIC.
			 */
			MAC_TX(mip, ring, mp,
			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
			    0));
			if (mp != NULL) {
				/*
				 * Adjust for the last packet that
				 * could not be transmitted
				 */
				opackets--;
				obytes -= pkt_size;
				mp->b_next = next;
				break;
			}
		}
		mp = next;
	}

done:
	src_mcip->mci_stat_obytes += obytes;
	src_mcip->mci_stat_opackets += opackets;
	src_mcip->mci_stat_oerrors += oerrors;

	if (stats != NULL) {
		stats->ts_opackets = opackets;
		stats->ts_obytes = obytes;
		stats->ts_oerrors = oerrors;
	}
	return (mp);
}

/*
 * mac_tx_srs_ring_present
 *
 * Returns whether the specified ring is part of the specified SRS.
34578275SEric Cheng */ 34588275SEric Cheng boolean_t 34598275SEric Cheng mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 34608275SEric Cheng { 34618275SEric Cheng int i; 34628275SEric Cheng mac_soft_ring_t *soft_ring; 34638275SEric Cheng 34648275SEric Cheng if (srs->srs_tx.st_arg2 == tx_ring) 34658275SEric Cheng return (B_TRUE); 34668275SEric Cheng 34678275SEric Cheng for (i = 0; i < srs->srs_oth_ring_count; i++) { 34688275SEric Cheng soft_ring = srs->srs_oth_soft_rings[i]; 34698275SEric Cheng if (soft_ring->s_ring_tx_arg2 == tx_ring) 34708275SEric Cheng return (B_TRUE); 34718275SEric Cheng } 34728275SEric Cheng 34738275SEric Cheng return (B_FALSE); 34748275SEric Cheng } 34758275SEric Cheng 34768275SEric Cheng /* 34778275SEric Cheng * mac_tx_srs_wakeup 34788275SEric Cheng * 34798275SEric Cheng * Called when Tx desc become available. Wakeup the appropriate worker 34808275SEric Cheng * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 34818275SEric Cheng * state field. 34828275SEric Cheng */ 34838275SEric Cheng void 34848275SEric Cheng mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 34858275SEric Cheng { 34868275SEric Cheng int i; 34878275SEric Cheng mac_soft_ring_t *sringp; 34888275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 34898275SEric Cheng 34908275SEric Cheng mutex_enter(&mac_srs->srs_lock); 34918275SEric Cheng if (TX_SINGLE_RING_MODE(mac_srs)) { 34928275SEric Cheng if (srs_tx->st_arg2 == ring && 34938275SEric Cheng mac_srs->srs_state & SRS_TX_BLOCKED) { 34948275SEric Cheng mac_srs->srs_state &= ~SRS_TX_BLOCKED; 34958275SEric Cheng srs_tx->st_unblocked_cnt++; 34968275SEric Cheng cv_signal(&mac_srs->srs_async); 34978275SEric Cheng } 34988275SEric Cheng /* 34998275SEric Cheng * A wakeup can come before tx_srs_drain() could 35008275SEric Cheng * grab srs lock and set SRS_TX_BLOCKED. So 35018275SEric Cheng * always set woken_up flag when we come here. 
35028275SEric Cheng */ 35038275SEric Cheng srs_tx->st_woken_up = B_TRUE; 35048275SEric Cheng mutex_exit(&mac_srs->srs_lock); 35058275SEric Cheng return; 35068275SEric Cheng } 35078275SEric Cheng 35088275SEric Cheng /* If you are here, it is for FANOUT or BW_FANOUT case */ 35098275SEric Cheng ASSERT(TX_MULTI_RING_MODE(mac_srs)); 35108275SEric Cheng for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 35118275SEric Cheng sringp = mac_srs->srs_oth_soft_rings[i]; 35128275SEric Cheng mutex_enter(&sringp->s_ring_lock); 35138275SEric Cheng if (sringp->s_ring_tx_arg2 == ring) { 35148275SEric Cheng if (sringp->s_ring_state & S_RING_BLOCK) { 35158275SEric Cheng sringp->s_ring_state &= ~S_RING_BLOCK; 35168275SEric Cheng sringp->s_ring_unblocked_cnt++; 35178275SEric Cheng cv_signal(&sringp->s_ring_async); 35188275SEric Cheng } 35198275SEric Cheng sringp->s_ring_tx_woken_up = B_TRUE; 35208275SEric Cheng } 35218275SEric Cheng mutex_exit(&sringp->s_ring_lock); 35228275SEric Cheng } 35238275SEric Cheng mutex_exit(&mac_srs->srs_lock); 35248275SEric Cheng } 35258275SEric Cheng 35268275SEric Cheng /* 35278275SEric Cheng * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 35288275SEric Cheng * the blocked clients again. 35298275SEric Cheng */ 35308275SEric Cheng void 35318275SEric Cheng mac_tx_notify(mac_impl_t *mip) 35328275SEric Cheng { 35338275SEric Cheng i_mac_notify(mip, MAC_NOTE_TX); 35348275SEric Cheng } 35358275SEric Cheng 35368275SEric Cheng /* 35378275SEric Cheng * RX SOFTRING RELATED FUNCTIONS 35388275SEric Cheng * 35398275SEric Cheng * These functions really belong in mac_soft_ring.c and here for 35408275SEric Cheng * a short period. 35418275SEric Cheng */ 35428275SEric Cheng 35438275SEric Cheng #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 35448275SEric Cheng /* \ 35458275SEric Cheng * Enqueue our mblk chain. 
\ 35468275SEric Cheng */ \ 35478275SEric Cheng ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 35488275SEric Cheng \ 35498275SEric Cheng if ((ringp)->s_ring_last != NULL) \ 35508275SEric Cheng (ringp)->s_ring_last->b_next = (mp); \ 35518275SEric Cheng else \ 35528275SEric Cheng (ringp)->s_ring_first = (mp); \ 35538275SEric Cheng (ringp)->s_ring_last = (tail); \ 35548275SEric Cheng (ringp)->s_ring_count += (cnt); \ 35558275SEric Cheng ASSERT((ringp)->s_ring_count > 0); \ 35568275SEric Cheng if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 35578275SEric Cheng (ringp)->s_ring_size += sz; \ 35588275SEric Cheng } \ 35598275SEric Cheng } 35608275SEric Cheng 35618275SEric Cheng /* 35628275SEric Cheng * Default entry point to deliver a packet chain to a MAC client. 35638275SEric Cheng * If the MAC client has flows, do the classification with these 35648275SEric Cheng * flows as well. 35658275SEric Cheng */ 35668275SEric Cheng /* ARGSUSED */ 35678275SEric Cheng void 35688275SEric Cheng mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 35698275SEric Cheng mac_header_info_t *arg3) 35708275SEric Cheng { 35718275SEric Cheng mac_client_impl_t *mcip = arg1; 35728275SEric Cheng 35738275SEric Cheng if (mcip->mci_nvids == 1 && 35749109SVenu.Iyer@Sun.COM !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) { 35758275SEric Cheng /* 35768275SEric Cheng * If the client has exactly one VID associated with it 35778275SEric Cheng * and striping of VLAN header is not disabled, 35788275SEric Cheng * remove the VLAN tag from the packet before 35798275SEric Cheng * passing it on to the client's receive callback. 35808275SEric Cheng * Note that this needs to be done after we dispatch 35818275SEric Cheng * the packet to the promiscuous listeners of the 35828275SEric Cheng * client, since they expect to see the whole 35838275SEric Cheng * frame including the VLAN headers. 
35848275SEric Cheng */ 35858275SEric Cheng mp_chain = mac_strip_vlan_tag_chain(mp_chain); 35868275SEric Cheng } 35878275SEric Cheng 35888275SEric Cheng mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 35898275SEric Cheng } 35908275SEric Cheng 35918275SEric Cheng /* 35928275SEric Cheng * mac_rx_soft_ring_process 35938275SEric Cheng * 35948275SEric Cheng * process a chain for a given soft ring. The number of packets queued 35958275SEric Cheng * in the SRS and its associated soft rings (including this one) is 35968275SEric Cheng * very small (tracked by srs_poll_pkt_cnt), then allow the entering 35978275SEric Cheng * thread (interrupt or poll thread) to do inline processing. This 35988275SEric Cheng * helps keep the latency down under low load. 35998275SEric Cheng * 36008275SEric Cheng * The proc and arg for each mblk is already stored in the mblk in 36018275SEric Cheng * appropriate places. 36028275SEric Cheng */ 36038275SEric Cheng /* ARGSUSED */ 36048275SEric Cheng void 36058275SEric Cheng mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 36068275SEric Cheng mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 36078275SEric Cheng { 36088275SEric Cheng mac_direct_rx_t proc; 36098275SEric Cheng void *arg1; 36108275SEric Cheng mac_resource_handle_t arg2; 36118275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 36128275SEric Cheng 36138275SEric Cheng ASSERT(ringp != NULL); 36148275SEric Cheng ASSERT(mp_chain != NULL); 36158275SEric Cheng ASSERT(tail != NULL); 36168275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 36178275SEric Cheng 36188275SEric Cheng mutex_enter(&ringp->s_ring_lock); 36198275SEric Cheng ringp->s_ring_total_inpkt += cnt; 36208833SVenu.Iyer@Sun.COM if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 36218833SVenu.Iyer@Sun.COM !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { 36228275SEric Cheng /* If on processor or blanking on, then enqueue and return */ 36238275SEric Cheng if (ringp->s_ring_state & 
S_RING_BLANK || 36248275SEric Cheng ringp->s_ring_state & S_RING_PROC) { 36258275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 36268275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36278275SEric Cheng return; 36288275SEric Cheng } 36298275SEric Cheng proc = ringp->s_ring_rx_func; 36308275SEric Cheng arg1 = ringp->s_ring_rx_arg1; 36318275SEric Cheng arg2 = ringp->s_ring_rx_arg2; 36328275SEric Cheng /* 36338275SEric Cheng * See if anything is already queued. If we are the 36348275SEric Cheng * first packet, do inline processing else queue the 36358275SEric Cheng * packet and do the drain. 36368275SEric Cheng */ 36378275SEric Cheng if (ringp->s_ring_first == NULL) { 36388275SEric Cheng /* 36398275SEric Cheng * Fast-path, ok to process and nothing queued. 36408275SEric Cheng */ 36418275SEric Cheng ringp->s_ring_run = curthread; 36428275SEric Cheng ringp->s_ring_state |= (S_RING_PROC); 36438275SEric Cheng 36448275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36458275SEric Cheng 36468275SEric Cheng /* 36478275SEric Cheng * We are the chain of 1 packet so 36488275SEric Cheng * go through this fast path. 36498275SEric Cheng */ 36508275SEric Cheng ASSERT(mp_chain->b_next == NULL); 36518275SEric Cheng 36528275SEric Cheng (*proc)(arg1, arg2, mp_chain, NULL); 36538275SEric Cheng 36548275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 36558275SEric Cheng /* 36568275SEric Cheng * If we have a soft ring set which is doing 36578275SEric Cheng * bandwidth control, we need to decrement 36588275SEric Cheng * srs_size and count so it the SRS can have a 36598275SEric Cheng * accurate idea of what is the real data 36608275SEric Cheng * queued between SRS and its soft rings. We 36618275SEric Cheng * decrement the counters only when the packet 36628275SEric Cheng * gets processed by both SRS and the soft ring. 
36638275SEric Cheng */ 36648275SEric Cheng mutex_enter(&mac_srs->srs_lock); 36658275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 36668275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 36678275SEric Cheng mutex_exit(&mac_srs->srs_lock); 36688275SEric Cheng 36698275SEric Cheng mutex_enter(&ringp->s_ring_lock); 36708275SEric Cheng ringp->s_ring_run = NULL; 36718275SEric Cheng ringp->s_ring_state &= ~S_RING_PROC; 36728275SEric Cheng if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 36738275SEric Cheng cv_signal(&ringp->s_ring_client_cv); 36748275SEric Cheng 36758275SEric Cheng if ((ringp->s_ring_first == NULL) || 36768275SEric Cheng (ringp->s_ring_state & S_RING_BLANK)) { 36778275SEric Cheng /* 36788275SEric Cheng * We processed inline our packet and 36798275SEric Cheng * nothing new has arrived or our 36808275SEric Cheng * receiver doesn't want to receive 36818275SEric Cheng * any packets. We are done. 36828275SEric Cheng */ 36838275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36848275SEric Cheng return; 36858275SEric Cheng } 36868275SEric Cheng } else { 36878275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, 36888275SEric Cheng mp_chain, tail, cnt, sz); 36898275SEric Cheng } 36908275SEric Cheng 36918275SEric Cheng /* 36928275SEric Cheng * We are here because either we couldn't do inline 36938275SEric Cheng * processing (because something was already 36948275SEric Cheng * queued), or we had a chain of more than one 36958275SEric Cheng * packet, or something else arrived after we were 36968275SEric Cheng * done with inline processing. 
36978275SEric Cheng */ 36988275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 36998275SEric Cheng ASSERT(ringp->s_ring_first != NULL); 37008275SEric Cheng 37018275SEric Cheng ringp->s_ring_drain_func(ringp); 37028275SEric Cheng mutex_exit(&ringp->s_ring_lock); 37038275SEric Cheng return; 37048275SEric Cheng } else { 37058275SEric Cheng /* ST_RING_WORKER_ONLY case */ 37068275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 37078275SEric Cheng mac_soft_ring_worker_wakeup(ringp); 37088275SEric Cheng mutex_exit(&ringp->s_ring_lock); 37098275SEric Cheng } 37108275SEric Cheng } 37118275SEric Cheng 37128275SEric Cheng /* 37138275SEric Cheng * TX SOFTRING RELATED FUNCTIONS 37148275SEric Cheng * 37158275SEric Cheng * These functions really belong in mac_soft_ring.c and here for 37168275SEric Cheng * a short period. 37178275SEric Cheng */ 37188275SEric Cheng 37198275SEric Cheng #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 37208275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 37218275SEric Cheng ringp->s_ring_state |= S_RING_ENQUEUED; \ 37228275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 37238275SEric Cheng } 37248275SEric Cheng 37258275SEric Cheng /* 37268275SEric Cheng * mac_tx_sring_queued 37278275SEric Cheng * 37288275SEric Cheng * When we are out of transmit descriptors and we already have a 37298275SEric Cheng * queue that exceeds hiwat (or the client called us with 37308275SEric Cheng * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 37318275SEric Cheng * soft ring pointer as the opaque cookie for the client enable 37328275SEric Cheng * flow control. 
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	mac_tx_cookie_t cookie = NULL;
	boolean_t wakeup_worker = B_TRUE;

	ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		/* Client doesn't want queueing: free the whole chain. */
		mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
		/* increment freed stats */
		ringp->s_ring_drops += cnt;
		cookie = (mac_tx_cookie_t)ringp;
	} else {
		/* Only wake the worker if the queue was previously empty. */
		if (ringp->s_ring_first != NULL)
			wakeup_worker = B_FALSE;

		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If QUEUED is not set, queue the packet
			 * and let mac_tx_soft_ring_drain() set
			 * the TX_BLOCKED bit for the reasons
			 * explained above. Otherwise, return the
			 * mblks.
			 */
			if (wakeup_worker) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
				    mp_chain, tail, cnt, sz);
			} else {
				/*
				 * Packets already queued: hand the chain
				 * back to the caller and arrange to wake
				 * the client when the ring drains.
				 */
				ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
				cookie = (mac_tx_cookie_t)ringp;
				*ret_mp = mp_chain;
			}
		} else {
			boolean_t enqueue = B_TRUE;

			if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
				/*
				 * flow-controlled. Store ringp in cookie
				 * so that it can be returned as
				 * mac_tx_cookie_t to client
				 */
				ringp->s_ring_state |= S_RING_TX_HIWAT;
				cookie = (mac_tx_cookie_t)ringp;
				ringp->s_ring_hiwat_cnt++;
				if (ringp->s_ring_count >
				    ringp->s_ring_tx_max_q_cnt) {
					/* increment freed stats */
					ringp->s_ring_drops += cnt;
					/*
					 * b_prev may be set to the fanout hint
					 * hence can't use freemsg directly
					 */
					mac_pkt_drop(NULL, NULL,
					    mp_chain, B_FALSE);
					DTRACE_PROBE1(tx_queued_hiwat,
					    mac_soft_ring_t *, ringp);
					enqueue = B_FALSE;
				}
			}
			if (enqueue) {
				TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
				    tail, cnt, sz);
			}
		}
		if (wakeup_worker)
			cv_signal(&ringp->s_ring_async);
	}
	return (cookie);
}


/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
	int cnt;
	size_t sz;
	mblk_t *tail;
	mac_tx_cookie_t cookie = NULL;

	ASSERT(ringp != NULL);
	ASSERT(mp_chain != NULL);
	ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
	/*
	 * Only two modes can come here; either it can be
	 * SRS_TX_BW_FANOUT or SRS_TX_FANOUT
	 */
	ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
	    mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

	if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
		/* Serialization mode */

		mutex_enter(&ringp->s_ring_lock);
		if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
			cookie = mac_tx_sring_enqueue(ringp, mp_chain,
			    flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
		if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
			/*
			 * If ring is blocked due to lack of Tx
			 * descs, just return. Worker thread
			 * will get scheduled when Tx descs
			 * become available.
			 */
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		mac_soft_ring_worker_wakeup(ringp);
		mutex_exit(&ringp->s_ring_lock);
		return (cookie);
	} else {
		/* Default fanout mode */
		/*
		 * S_RING_BLOCKED is set when underlying NIC runs
		 * out of Tx descs and messages start getting
		 * queued. It won't get reset until
		 * tx_srs_drain() completely drains out the
		 * messages.
		 */
		boolean_t is_subflow;
		mac_tx_stats_t stats;

		if (ringp->s_ring_state & S_RING_ENQUEUED) {
			/* Tx descs/resources not available */
			mutex_enter(&ringp->s_ring_lock);
			/* Re-check the flag now that the lock is held. */
			if (ringp->s_ring_state & S_RING_ENQUEUED) {
				cookie = mac_tx_sring_enqueue(ringp, mp_chain,
				    flag, ret_mp);
				mutex_exit(&ringp->s_ring_lock);
				return (cookie);
			}
			/*
			 * While we were computing mblk count, the
			 * flow control condition got relieved.
			 * Continue with the transmission.
			 */
			mutex_exit(&ringp->s_ring_lock);
		}
		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
		    ringp->s_ring_tx_arg2, mp_chain,
		    (is_subflow ? &stats : NULL));

		/*
		 * Multiple threads could be here sending packets.
		 * Under such conditions, it is not possible to
		 * atomically set S_RING_BLOCKED bit to indicate
		 * out of tx desc condition. To atomically set
		 * this, we queue the returned packet and do
		 * the setting of S_RING_BLOCKED in
		 * mac_tx_soft_ring_drain().
		 */
		if (mp_chain != NULL) {
			mutex_enter(&ringp->s_ring_lock);
			cookie =
			    mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
			mutex_exit(&ringp->s_ring_lock);
			return (cookie);
		}
		if (is_subflow) {
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
		}
		return (NULL);
	}
}