18275SEric Cheng /* 28275SEric Cheng * CDDL HEADER START 38275SEric Cheng * 48275SEric Cheng * The contents of this file are subject to the terms of the 58275SEric Cheng * Common Development and Distribution License (the "License"). 68275SEric Cheng * You may not use this file except in compliance with the License. 78275SEric Cheng * 88275SEric Cheng * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 98275SEric Cheng * or http://www.opensolaris.org/os/licensing. 108275SEric Cheng * See the License for the specific language governing permissions 118275SEric Cheng * and limitations under the License. 128275SEric Cheng * 138275SEric Cheng * When distributing Covered Code, include this CDDL HEADER in each 148275SEric Cheng * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 158275SEric Cheng * If applicable, add the following below this CDDL HEADER, with the 168275SEric Cheng * fields enclosed by brackets "[]" replaced with your own identifying 178275SEric Cheng * information: Portions Copyright [yyyy] [name of copyright owner] 188275SEric Cheng * 198275SEric Cheng * CDDL HEADER END 208275SEric Cheng */ 218275SEric Cheng /* 2211528SBaban.Kenkre@Sun.COM * Copyright 2010 Sun Microsystems, Inc. All rights reserved. 238275SEric Cheng * Use is subject to license terms. 
248275SEric Cheng */ 258275SEric Cheng 268275SEric Cheng #include <sys/types.h> 278275SEric Cheng #include <sys/callb.h> 288275SEric Cheng #include <sys/sdt.h> 298275SEric Cheng #include <sys/strsubr.h> 308275SEric Cheng #include <sys/strsun.h> 318275SEric Cheng #include <sys/vlan.h> 328275SEric Cheng #include <inet/ipsec_impl.h> 338275SEric Cheng #include <inet/ip_impl.h> 348275SEric Cheng #include <inet/sadb.h> 358275SEric Cheng #include <inet/ipsecesp.h> 368275SEric Cheng #include <inet/ipsecah.h> 378275SEric Cheng #include <inet/ip6.h> 388275SEric Cheng 398275SEric Cheng #include <sys/mac_impl.h> 408275SEric Cheng #include <sys/mac_client_impl.h> 418275SEric Cheng #include <sys/mac_client_priv.h> 428275SEric Cheng #include <sys/mac_soft_ring.h> 438275SEric Cheng #include <sys/mac_flow_impl.h> 448275SEric Cheng 458275SEric Cheng static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *, 468275SEric Cheng uintptr_t, uint16_t, mblk_t **); 478275SEric Cheng static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *, 488275SEric Cheng uintptr_t, uint16_t, mblk_t **); 498275SEric Cheng static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *, 508275SEric Cheng uintptr_t, uint16_t, mblk_t **); 518275SEric Cheng static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *, 528275SEric Cheng uintptr_t, uint16_t, mblk_t **); 538275SEric Cheng 548275SEric Cheng typedef struct mac_tx_mode_s { 558275SEric Cheng mac_tx_srs_mode_t mac_tx_mode; 568275SEric Cheng mac_tx_func_t mac_tx_func; 578275SEric Cheng } mac_tx_mode_t; 588275SEric Cheng 598275SEric Cheng /* 608275SEric Cheng * There are five modes of operation on the Tx side. These modes get set 618275SEric Cheng * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode, 628275SEric Cheng * none of the other modes are user configurable. 
They get selected by 638275SEric Cheng * the system depending upon whether the link (or flow) has multiple Tx 648275SEric Cheng * rings or a bandwidth configured, etc. 658275SEric Cheng */ 668275SEric Cheng mac_tx_mode_t mac_tx_mode_list[] = { 678275SEric Cheng {SRS_TX_DEFAULT, mac_tx_single_ring_mode}, 688275SEric Cheng {SRS_TX_SERIALIZE, mac_tx_serializer_mode}, 698275SEric Cheng {SRS_TX_FANOUT, mac_tx_fanout_mode}, 708275SEric Cheng {SRS_TX_BW, mac_tx_bw_mode}, 718275SEric Cheng {SRS_TX_BW_FANOUT, mac_tx_bw_mode} 728275SEric Cheng }; 738275SEric Cheng 748275SEric Cheng /* 758275SEric Cheng * Soft Ring Set (SRS) - The Run time code that deals with 768275SEric Cheng * dynamic polling from the hardware, bandwidth enforcement, 778275SEric Cheng * fanout etc. 788275SEric Cheng * 798275SEric Cheng * We try to use H/W classification on NIC and assign traffic for 808275SEric Cheng * a MAC address to a particular Rx ring or ring group. There is a 818275SEric Cheng * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically 828275SEric Cheng * switches the underlying Rx ring between interrupt and 838275SEric Cheng * polling mode and enforces any specified B/W control. 848275SEric Cheng * 858275SEric Cheng * There is always a SRS created and tied to each H/W and S/W rule. 868275SEric Cheng * Whenever we create a H/W rule, we always add the the same rule to 878275SEric Cheng * S/W classifier and tie a SRS to it. 888275SEric Cheng * 898275SEric Cheng * In case a B/W control is specified, it is broken into bytes 908275SEric Cheng * per ticks and as soon as the quota for a tick is exhausted, 918275SEric Cheng * the underlying Rx ring is forced into poll mode for remainder of 928275SEric Cheng * the tick. The SRS poll thread only polls for bytes that are 938275SEric Cheng * allowed to come in the SRS. 
We typically let 4x the configured 948275SEric Cheng * B/W worth of packets to come in the SRS (to prevent unnecessary 958275SEric Cheng * drops due to bursts) but only process the specified amount. 968275SEric Cheng * 978275SEric Cheng * A MAC client (e.g. a VNIC or aggr) can have 1 or more 988275SEric Cheng * Rx rings (and corresponding SRSs) assigned to it. The SRS 998275SEric Cheng * in turn can have softrings to do protocol level fanout or 1008275SEric Cheng * softrings to do S/W based fanout or both. In case the NIC 1018275SEric Cheng * has no Rx rings, we do S/W classification to respective SRS. 1028275SEric Cheng * The S/W classification rule is always setup and ready. This 1038275SEric Cheng * allows the MAC layer to reassign Rx rings whenever needed 1048275SEric Cheng * but packets still continue to flow via the default path and 1058275SEric Cheng * getting S/W classified to correct SRS. 1068275SEric Cheng * 1078275SEric Cheng * The SRS's are used on both Tx and Rx side. They use the same 1088275SEric Cheng * data structure but the processing routines have slightly different 1098275SEric Cheng * semantics due to the fact that Rx side needs to do dynamic 1108275SEric Cheng * polling etc. 1118275SEric Cheng * 1128275SEric Cheng * Dynamic Polling Notes 1138275SEric Cheng * ===================== 1148275SEric Cheng * 1158275SEric Cheng * Each Soft ring set is capable of switching its Rx ring between 1168275SEric Cheng * interrupt and poll mode and actively 'polls' for packets in 1178275SEric Cheng * poll mode. If the SRS is implementing a B/W limit, it makes 1188275SEric Cheng * sure that only Max allowed packets are pulled in poll mode 1198275SEric Cheng * and goes to poll mode as soon as B/W limit is exceeded. As 1208275SEric Cheng * such, there are no overheads to implement B/W limits. 
1218275SEric Cheng * 1228275SEric Cheng * In poll mode, its better to keep the pipeline going where the 1238275SEric Cheng * SRS worker thread keeps processing packets and poll thread 1248275SEric Cheng * keeps bringing more packets (specially if they get to run 1258275SEric Cheng * on different CPUs). This also prevents the overheads associated 1268275SEric Cheng * by excessive signalling (on NUMA machines, this can be 1278275SEric Cheng * pretty devastating). The exception is latency optimized case 1288275SEric Cheng * where worker thread does no work and interrupt and poll thread 1298275SEric Cheng * are allowed to do their own drain. 1308275SEric Cheng * 1318275SEric Cheng * We use the following policy to control Dynamic Polling: 1328275SEric Cheng * 1) We switch to poll mode anytime the processing 1338275SEric Cheng * thread causes a backlog to build up in SRS and 1348275SEric Cheng * its associated Soft Rings (sr_poll_pkt_cnt > 0). 1358275SEric Cheng * 2) As long as the backlog stays under the low water 1368275SEric Cheng * mark (sr_lowat), we poll the H/W for more packets. 1378275SEric Cheng * 3) If the backlog (sr_poll_pkt_cnt) exceeds low 1388275SEric Cheng * water mark, we stay in poll mode but don't poll 1398275SEric Cheng * the H/W for more packets. 1408275SEric Cheng * 4) Anytime in polling mode, if we poll the H/W for 1418275SEric Cheng * packets and find nothing plus we have an existing 1428275SEric Cheng * backlog (sr_poll_pkt_cnt > 0), we stay in polling 1438275SEric Cheng * mode but don't poll the H/W for packets anymore 1448275SEric Cheng * (let the polling thread go to sleep). 1458275SEric Cheng * 5) Once the backlog is relived (packets are processed) 1468275SEric Cheng * we reenable polling (by signalling the poll thread) 1478275SEric Cheng * only when the backlog dips below sr_poll_thres. 
1488275SEric Cheng * 6) sr_hiwat is used exclusively when we are not 1498275SEric Cheng * polling capable and is used to decide when to 1508275SEric Cheng * drop packets so the SRS queue length doesn't grow 1518275SEric Cheng * infinitely. 1528275SEric Cheng * 1538275SEric Cheng * NOTE: Also see the block level comment on top of mac_soft_ring.c 1548275SEric Cheng */ 1558275SEric Cheng 1568275SEric Cheng /* 1578275SEric Cheng * mac_latency_optimize 1588275SEric Cheng * 1598275SEric Cheng * Controls whether the poll thread can process the packets inline 1608275SEric Cheng * or let the SRS worker thread do the processing. This applies if 1618275SEric Cheng * the SRS was not being processed. For latency sensitive traffic, 1628275SEric Cheng * this needs to be true to allow inline processing. For throughput 1638275SEric Cheng * under load, this should be false. 1648275SEric Cheng * 1658275SEric Cheng * This (and other similar) tunable should be rolled into a link 1668275SEric Cheng * or flow specific workload hint that can be set using dladm 1678275SEric Cheng * linkprop (instead of multiple such tunables). 1688275SEric Cheng */ 1698275SEric Cheng boolean_t mac_latency_optimize = B_TRUE; 1708275SEric Cheng 1718275SEric Cheng /* 1728275SEric Cheng * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 1738275SEric Cheng * 1748275SEric Cheng * queue a mp or chain in soft ring set and increment the 1758275SEric Cheng * local count (srs_count) for the SRS and the shared counter 1768275SEric Cheng * (srs_poll_pkt_cnt - shared between SRS and its soft rings 1778275SEric Cheng * to track the total unprocessed packets for polling to work 1788275SEric Cheng * correctly). 1798275SEric Cheng * 1808275SEric Cheng * The size (total bytes queued) counters are incremented only 1818275SEric Cheng * if we are doing B/W control. 
1828275SEric Cheng */ 1838275SEric Cheng #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1848275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 1858275SEric Cheng if ((mac_srs)->srs_last != NULL) \ 1868275SEric Cheng (mac_srs)->srs_last->b_next = (head); \ 1878275SEric Cheng else \ 1888275SEric Cheng (mac_srs)->srs_first = (head); \ 1898275SEric Cheng (mac_srs)->srs_last = (tail); \ 1908275SEric Cheng (mac_srs)->srs_count += count; \ 1918275SEric Cheng } 1928275SEric Cheng 1938275SEric Cheng #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1948275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 1958275SEric Cheng \ 1968275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 1978275SEric Cheng srs_rx->sr_poll_pkt_cnt += count; \ 1988275SEric Cheng ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 1998275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2008275SEric Cheng (mac_srs)->srs_size += (sz); \ 2018275SEric Cheng mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 2028275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2038275SEric Cheng mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 2048275SEric Cheng } \ 2058275SEric Cheng } 2068275SEric Cheng 2078275SEric Cheng #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 2088275SEric Cheng mac_srs->srs_state |= SRS_ENQUEUED; \ 2098275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 2108275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2118275SEric Cheng (mac_srs)->srs_size += (sz); \ 2128275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2138275SEric Cheng } \ 2148275SEric Cheng } 2158275SEric Cheng 2168275SEric Cheng /* 2178275SEric Cheng * Turn polling on routines 2188275SEric Cheng */ 2198275SEric Cheng #define MAC_SRS_POLLING_ON(mac_srs) { \ 2208275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2218275SEric Cheng if (((mac_srs)->srs_state & \ 2228275SEric Cheng 
(SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 2238275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2248275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2258275SEric Cheng (mac_srs)->srs_ring); \ 2268275SEric Cheng (mac_srs)->srs_rx.sr_poll_on++; \ 2278275SEric Cheng } \ 2288275SEric Cheng } 2298275SEric Cheng 2308275SEric Cheng #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 2318275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2328275SEric Cheng if (((mac_srs)->srs_state & \ 2338275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 2348275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 2358275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2368275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2378275SEric Cheng (mac_srs)->srs_ring); \ 2388275SEric Cheng (mac_srs)->srs_rx.sr_worker_poll_on++; \ 2398275SEric Cheng } \ 2408275SEric Cheng } 2418275SEric Cheng 2428275SEric Cheng /* 2438275SEric Cheng * MAC_SRS_POLL_RING 2448275SEric Cheng * 2458275SEric Cheng * Signal the SRS poll thread to poll the underlying H/W ring 2468275SEric Cheng * provided it wasn't already polling (SRS_GET_PKTS was set). 2478275SEric Cheng * 2488275SEric Cheng * Poll thread gets to run only from mac_rx_srs_drain() and only 2498275SEric Cheng * if the drain was being done by the worker thread. 
2508275SEric Cheng */ 2518275SEric Cheng #define MAC_SRS_POLL_RING(mac_srs) { \ 2528275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 2538275SEric Cheng \ 2548275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2558275SEric Cheng srs_rx->sr_poll_thr_sig++; \ 2568275SEric Cheng if (((mac_srs)->srs_state & \ 2578275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 2588275SEric Cheng (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 2598275SEric Cheng (mac_srs)->srs_state |= SRS_GET_PKTS; \ 2608275SEric Cheng cv_signal(&(mac_srs)->srs_cv); \ 2618275SEric Cheng } else { \ 2628275SEric Cheng srs_rx->sr_poll_thr_busy++; \ 2638275SEric Cheng } \ 2648275SEric Cheng } 2658275SEric Cheng 2668275SEric Cheng /* 2678275SEric Cheng * MAC_SRS_CHECK_BW_CONTROL 2688275SEric Cheng * 2698275SEric Cheng * Check to see if next tick has started so we can reset the 2708275SEric Cheng * SRS_BW_ENFORCED flag and allow more packets to come in the 2718275SEric Cheng * system. 2728275SEric Cheng */ 2738275SEric Cheng #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 2748275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2758275SEric Cheng ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 2768275SEric Cheng MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 27711066Srafael.vanoni@sun.com clock_t now = ddi_get_lbolt(); \ 27811066Srafael.vanoni@sun.com if ((mac_srs)->srs_bw->mac_bw_curr_time != now) { \ 27911066Srafael.vanoni@sun.com (mac_srs)->srs_bw->mac_bw_curr_time = now; \ 2808275SEric Cheng (mac_srs)->srs_bw->mac_bw_used = 0; \ 2818275SEric Cheng if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 2828275SEric Cheng (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 2838275SEric Cheng } \ 2848275SEric Cheng } 2858275SEric Cheng 2868275SEric Cheng /* 2878275SEric Cheng * MAC_SRS_WORKER_WAKEUP 2888275SEric Cheng * 2898275SEric Cheng * Wake up the SRS worker thread to process the queue as long as 2908275SEric Cheng * no one else is processing the queue. 
If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) {				\
	ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock));			\
	if (!((mac_srs)->srs_state & SRS_PROC) &&			\
	    (mac_srs)->srs_tid == NULL) {				\
		if (((mac_srs)->srs_state & SRS_LATENCY_OPT) ||		\
		    (mac_srs_worker_wakeup_ticks == 0))			\
			cv_signal(&(mac_srs)->srs_async);		\
		else							\
			(mac_srs)->srs_tid =				\
			    timeout(mac_srs_fire, (mac_srs),		\
			    mac_srs_worker_wakeup_ticks);		\
	}								\
}

/* True for the Tx modes that send the whole chain down one ring. */
#define	TX_SINGLE_RING_MODE(mac_srs)				\
	((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT ||		\
	(mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE ||	\
	(mac_srs)->srs_tx.st_mode == SRS_TX_BW)

/* True for the Tx modes that enforce a bandwidth limit. */
#define	TX_BANDWIDTH_MODE(mac_srs)				\
	((mac_srs)->srs_tx.st_mode == SRS_TX_BW ||		\
	(mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)

/*
 * Fan a chain out to one of the SRS's soft rings based on the
 * caller-supplied fanout hint.
 * NOTE(review): `softring' is not declared by this macro; it is
 * assigned into a variable captured from the caller's scope —
 * confirm each call site declares it.
 */
#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) {			\
	uint_t hash, indx;						\
	hash = HASH_HINT(hint);						\
	indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);	\
	softring = mac_srs->srs_oth_soft_rings[indx];			\
	(void) (mac_tx_soft_ring_process(softring, head, 0, NULL));	\
}

/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp)	{			\
	ASSERT(MUTEX_HELD(&(srs)->srs_lock));			\
	if ((srs)->srs_tx.st_woken_up) {			\
		(srs)->srs_tx.st_woken_up = B_FALSE;		\
	} else {						\
		ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED));	\
		(srs)->srs_state |= SRS_TX_BLOCKED;		\
		(srs)->srs_tx.st_blocked_cnt++;			\
	}							\
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 * NOTE(review): on overflow this drops `mp_chain' (captured from
 * the caller's scope) while the enqueue path uses `mp'; callers
 * must invoke it with mp == mp_chain — confirm before reusing.
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) {		\
	boolean_t enqueue = B_TRUE;					\
									\
	if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) {		\
		/*							\
		 * flow-controlled. Store srs in cookie so that it	\
		 * can be returned as mac_tx_cookie_t to client		\
		 */							\
		(srs)->srs_state |= SRS_TX_HIWAT;			\
		cookie = (mac_tx_cookie_t)srs;				\
		(srs)->srs_tx.st_hiwat_cnt++;				\
		if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) {	\
			/* increment freed stats */			\
			(srs)->srs_tx.st_drop_count += cnt;		\
			/*						\
			 * b_prev may be set to the fanout hint		\
			 * hence can't use freemsg directly		\
			 */						\
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);	\
			DTRACE_PROBE1(tx_queued_hiwat,			\
			    mac_soft_ring_set_t *, srs);		\
			enqueue = B_FALSE;				\
		}							\
	}								\
	if (enqueue)							\
		MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz);	\
}

/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs)					\
	if (!(srs->srs_type & SRST_TX))				\
		mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs)					\
	if (!(srs->srs_type & SRST_TX))				\
		mutex_exit(&srs->srs_bw->mac_bw_lock);

/*
 * NOTE(review): this macro references `mac_srs' (captured from the
 * caller's scope) for the drop statistics while taking `srs' as a
 * parameter; callers must have `mac_srs' in scope — confirm.
 */
#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) {		\
	mac_pkt_drop(NULL, NULL, mp, B_FALSE);			\
	/* increment freed stats */				\
	mac_srs->srs_tx.st_drop_count++;			\
	cookie = (mac_tx_cookie_t)srs;				\
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) {	\
	mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT;		\
	cookie = (mac_tx_cookie_t)srs;				\
	*ret_mp = mp_chain;					\
4028275SEric Cheng } 4038275SEric Cheng 4048275SEric Cheng /* 4058275SEric Cheng * Drop the rx packet and advance to the next one in the chain. 4068275SEric Cheng */ 4078275SEric Cheng static void 4088275SEric Cheng mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp) 4098275SEric Cheng { 4108275SEric Cheng mac_srs_rx_t *srs_rx = &srs->srs_rx; 4118275SEric Cheng 4128275SEric Cheng ASSERT(mp->b_next == NULL); 4138275SEric Cheng mutex_enter(&srs->srs_lock); 4148275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1); 4158275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp)); 4168275SEric Cheng mutex_exit(&srs->srs_lock); 4178275SEric Cheng 4188275SEric Cheng srs_rx->sr_drop_count++; 4198275SEric Cheng freemsg(mp); 4208275SEric Cheng } 4218275SEric Cheng 4228275SEric Cheng /* DATAPATH RUNTIME ROUTINES */ 4238275SEric Cheng 4248275SEric Cheng /* 4258275SEric Cheng * mac_srs_fire 4268275SEric Cheng * 4278275SEric Cheng * Timer callback routine for waking up the SRS worker thread. 4288275SEric Cheng */ 4298275SEric Cheng static void 4308275SEric Cheng mac_srs_fire(void *arg) 4318275SEric Cheng { 4328275SEric Cheng mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg; 4338275SEric Cheng 4348275SEric Cheng mutex_enter(&mac_srs->srs_lock); 4358275SEric Cheng if (mac_srs->srs_tid == 0) { 4368275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4378275SEric Cheng return; 4388275SEric Cheng } 4398275SEric Cheng 4408275SEric Cheng mac_srs->srs_tid = 0; 4418275SEric Cheng if (!(mac_srs->srs_state & SRS_PROC)) 4428275SEric Cheng cv_signal(&mac_srs->srs_async); 4438275SEric Cheng 4448275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4458275SEric Cheng } 4468275SEric Cheng 4478275SEric Cheng /* 4488275SEric Cheng * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack, 4498275SEric Cheng * and it is used on the TX path. 
4508275SEric Cheng */ 451*11608SRao.Shoaib@Sun.COM #define HASH_HINT(hint) \ 452*11608SRao.Shoaib@Sun.COM ((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8)) 453*11608SRao.Shoaib@Sun.COM 4548275SEric Cheng 4558275SEric Cheng /* 4568275SEric Cheng * hash based on the src address and the port information. 4578275SEric Cheng */ 4588275SEric Cheng #define HASH_ADDR(src, ports) \ 4598275SEric Cheng (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \ 4608275SEric Cheng ((ports) >> 8) ^ (ports)) 4618275SEric Cheng 4628275SEric Cheng #define COMPUTE_INDEX(key, sz) (key % sz) 4638275SEric Cheng 4648275SEric Cheng #define FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \ 4658275SEric Cheng if ((tail) != NULL) { \ 4668275SEric Cheng ASSERT((tail)->b_next == NULL); \ 4678275SEric Cheng (tail)->b_next = (mp); \ 4688275SEric Cheng } else { \ 4698275SEric Cheng ASSERT((head) == NULL); \ 4708275SEric Cheng (head) = (mp); \ 4718275SEric Cheng } \ 4728275SEric Cheng (tail) = (mp); \ 4738275SEric Cheng (cnt)++; \ 4748275SEric Cheng if ((bw_ctl)) \ 4758275SEric Cheng (sz) += (sz0); \ 4768275SEric Cheng } 4778275SEric Cheng 4788275SEric Cheng #define MAC_FANOUT_DEFAULT 0 4798275SEric Cheng #define MAC_FANOUT_RND_ROBIN 1 4808275SEric Cheng int mac_fanout_type = MAC_FANOUT_DEFAULT; 4818275SEric Cheng 4828275SEric Cheng #define MAX_SR_TYPES 3 4838275SEric Cheng /* fanout types for port based hashing */ 4848275SEric Cheng enum pkt_type { 4858275SEric Cheng V4_TCP = 0, 4868275SEric Cheng V4_UDP, 4878275SEric Cheng OTH, 4888275SEric Cheng UNDEF 4898275SEric Cheng }; 4908275SEric Cheng 4918275SEric Cheng /* 4928275SEric Cheng * In general we do port based hashing to spread traffic over different 4938275SEric Cheng * softrings. The below tunable allows to override that behavior. Setting it 4948275SEric Cheng * to B_TRUE allows to do a fanout based on src ipv6 address. 
This behavior 4958275SEric Cheng * is also the applicable to ipv6 packets carrying multiple optional headers 4968275SEric Cheng * and other uncommon packet types. 4978275SEric Cheng */ 4988275SEric Cheng boolean_t mac_src_ipv6_fanout = B_FALSE; 4998275SEric Cheng 5008275SEric Cheng /* 5018275SEric Cheng * Pair of local and remote ports in the transport header 5028275SEric Cheng */ 5038275SEric Cheng #define PORTS_SIZE 4 5048275SEric Cheng 5058275SEric Cheng /* 5068275SEric Cheng * mac_rx_srs_proto_fanout 5078275SEric Cheng * 5088275SEric Cheng * This routine delivers packets destined to an SRS into one of the 5098275SEric Cheng * protocol soft rings. 5108275SEric Cheng * 5118275SEric Cheng * Given a chain of packets we need to split it up into multiple sub chains 5128275SEric Cheng * destined into TCP, UDP or OTH soft ring. Instead of entering 5138275SEric Cheng * the soft ring one packet at a time, we want to enter it in the form of a 5148275SEric Cheng * chain otherwise we get this start/stop behaviour where the worker thread 5158275SEric Cheng * goes to sleep and then next packets comes in forcing it to wake up etc. 
5168275SEric Cheng */ 5178275SEric Cheng static void 5188275SEric Cheng mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head) 5198275SEric Cheng { 5208275SEric Cheng struct ether_header *ehp; 5218833SVenu.Iyer@Sun.COM struct ether_vlan_header *evhp; 5228833SVenu.Iyer@Sun.COM uint32_t sap; 5238275SEric Cheng ipha_t *ipha; 5248833SVenu.Iyer@Sun.COM uint8_t *dstaddr; 5258833SVenu.Iyer@Sun.COM size_t hdrsize; 5268275SEric Cheng mblk_t *mp; 5278275SEric Cheng mblk_t *headmp[MAX_SR_TYPES]; 5288275SEric Cheng mblk_t *tailmp[MAX_SR_TYPES]; 5298275SEric Cheng int cnt[MAX_SR_TYPES]; 5308275SEric Cheng size_t sz[MAX_SR_TYPES]; 5318275SEric Cheng size_t sz1; 5328833SVenu.Iyer@Sun.COM boolean_t bw_ctl; 5338275SEric Cheng boolean_t hw_classified; 5348833SVenu.Iyer@Sun.COM boolean_t dls_bypass; 5358833SVenu.Iyer@Sun.COM boolean_t is_ether; 5368833SVenu.Iyer@Sun.COM boolean_t is_unicast; 5378833SVenu.Iyer@Sun.COM enum pkt_type type; 5388275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 5398833SVenu.Iyer@Sun.COM 5408833SVenu.Iyer@Sun.COM is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER); 5418833SVenu.Iyer@Sun.COM bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0); 5428275SEric Cheng 5438275SEric Cheng /* 5448275SEric Cheng * If we don't have a Rx ring, S/W classification would have done 5458275SEric Cheng * its job and its a packet meant for us. If we were polling on 5468275SEric Cheng * the default ring (i.e. there was a ring assigned to this SRS), 5478275SEric Cheng * then we need to make sure that the mac address really belongs 5488275SEric Cheng * to us. 5498275SEric Cheng */ 5508275SEric Cheng hw_classified = mac_srs->srs_ring != NULL && 5518275SEric Cheng mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER; 5528275SEric Cheng 5538275SEric Cheng /* 5548275SEric Cheng * Special clients (eg. VLAN, non ether, etc) need DLS 5558275SEric Cheng * processing in the Rx path. 
SRST_DLS_BYPASS will be clear for 55611021SEric.Cheng@Sun.COM * such SRSs. Another way of disabling bypass is to set the 55711021SEric.Cheng@Sun.COM * MCIS_RX_BYPASS_DISABLE flag. 5588275SEric Cheng */ 55911021SEric.Cheng@Sun.COM dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) && 56011021SEric.Cheng@Sun.COM ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0); 5618275SEric Cheng 5628275SEric Cheng bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *)); 5638275SEric Cheng bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *)); 5648275SEric Cheng bzero(cnt, MAX_SR_TYPES * sizeof (int)); 5658275SEric Cheng bzero(sz, MAX_SR_TYPES * sizeof (size_t)); 5668275SEric Cheng 5678275SEric Cheng /* 5688275SEric Cheng * We got a chain from SRS that we need to send to the soft rings. 5698275SEric Cheng * Since squeues for TCP & IPv4 sap poll their soft rings (for 5708275SEric Cheng * performance reasons), we need to separate out v4_tcp, v4_udp 5718275SEric Cheng * and the rest goes in other. 5728275SEric Cheng */ 5738275SEric Cheng while (head != NULL) { 5748275SEric Cheng mp = head; 5758275SEric Cheng head = head->b_next; 5768275SEric Cheng mp->b_next = NULL; 5778275SEric Cheng 5788275SEric Cheng type = OTH; 5798833SVenu.Iyer@Sun.COM sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp); 5808833SVenu.Iyer@Sun.COM 5818833SVenu.Iyer@Sun.COM if (is_ether) { 5828833SVenu.Iyer@Sun.COM /* 5838833SVenu.Iyer@Sun.COM * At this point we can be sure the packet at least 5848833SVenu.Iyer@Sun.COM * has an ether header. 5858833SVenu.Iyer@Sun.COM */ 5868833SVenu.Iyer@Sun.COM if (sz1 < sizeof (struct ether_header)) { 5878833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 5888833SVenu.Iyer@Sun.COM continue; 5898833SVenu.Iyer@Sun.COM } 5908275SEric Cheng ehp = (struct ether_header *)mp->b_rptr; 5918275SEric Cheng 5928275SEric Cheng /* 5938833SVenu.Iyer@Sun.COM * Determine if this is a VLAN or non-VLAN packet. 
5948275SEric Cheng */ 5958833SVenu.Iyer@Sun.COM if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) { 5968833SVenu.Iyer@Sun.COM evhp = (struct ether_vlan_header *)mp->b_rptr; 5978833SVenu.Iyer@Sun.COM sap = ntohs(evhp->ether_type); 5988833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_vlan_header); 5998275SEric Cheng /* 6008833SVenu.Iyer@Sun.COM * Check if the VID of the packet, if any, 6018833SVenu.Iyer@Sun.COM * belongs to this client. 6028275SEric Cheng */ 6038275SEric Cheng if (!mac_client_check_flow_vid(mcip, 6048275SEric Cheng VLAN_ID(ntohs(evhp->ether_tci)))) { 6058275SEric Cheng mac_rx_drop_pkt(mac_srs, mp); 6068275SEric Cheng continue; 6078275SEric Cheng } 6088833SVenu.Iyer@Sun.COM } else { 6098833SVenu.Iyer@Sun.COM hdrsize = sizeof (struct ether_header); 6108275SEric Cheng } 6118833SVenu.Iyer@Sun.COM is_unicast = 6128833SVenu.Iyer@Sun.COM ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0); 6138833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)&ehp->ether_dhost; 6148833SVenu.Iyer@Sun.COM } else { 6158833SVenu.Iyer@Sun.COM mac_header_info_t mhi; 6168833SVenu.Iyer@Sun.COM 6178833SVenu.Iyer@Sun.COM if (mac_header_info((mac_handle_t)mcip->mci_mip, 6188833SVenu.Iyer@Sun.COM mp, &mhi) != 0) { 6198833SVenu.Iyer@Sun.COM mac_rx_drop_pkt(mac_srs, mp); 6208833SVenu.Iyer@Sun.COM continue; 6218833SVenu.Iyer@Sun.COM } 6228833SVenu.Iyer@Sun.COM hdrsize = mhi.mhi_hdrsize; 6238833SVenu.Iyer@Sun.COM sap = mhi.mhi_bindsap; 6248833SVenu.Iyer@Sun.COM is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST); 6258833SVenu.Iyer@Sun.COM dstaddr = (uint8_t *)mhi.mhi_daddr; 6268833SVenu.Iyer@Sun.COM } 6278833SVenu.Iyer@Sun.COM 6288833SVenu.Iyer@Sun.COM if (!dls_bypass) { 6298275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 6308275SEric Cheng cnt[type], bw_ctl, sz[type], sz1, mp); 6318275SEric Cheng continue; 6328275SEric Cheng } 6338275SEric Cheng 6348833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IP) { 6358275SEric Cheng /* 6368275SEric Cheng * If we are H/W classified, but we have 
promisc 6378275SEric Cheng * on, then we need to check for the unicast address. 6388275SEric Cheng */ 6398275SEric Cheng if (hw_classified && mcip->mci_promisc_list != NULL) { 6408275SEric Cheng mac_address_t *map; 6418275SEric Cheng 6428275SEric Cheng rw_enter(&mcip->mci_rw_lock, RW_READER); 6438275SEric Cheng map = mcip->mci_unicast; 6448833SVenu.Iyer@Sun.COM if (bcmp(dstaddr, map->ma_addr, 6458275SEric Cheng map->ma_len) == 0) 6468275SEric Cheng type = UNDEF; 6478275SEric Cheng rw_exit(&mcip->mci_rw_lock); 6488833SVenu.Iyer@Sun.COM } else if (is_unicast) { 6498275SEric Cheng type = UNDEF; 6508275SEric Cheng } 6518275SEric Cheng } 6528275SEric Cheng 6538275SEric Cheng /* 6548275SEric Cheng * This needs to become a contract with the driver for 6558275SEric Cheng * the fast path. 6568275SEric Cheng * 6578275SEric Cheng * In the normal case the packet will have at least the L2 6588275SEric Cheng * header and the IP + Transport header in the same mblk. 6598275SEric Cheng * This is usually the case when the NIC driver sends up 6608275SEric Cheng * the packet. This is also true when the stack generates 6618275SEric Cheng * a packet that is looped back and when the stack uses the 6628275SEric Cheng * fastpath mechanism. The normal case is optimized for 6638275SEric Cheng * performance and may bypass DLS. All other cases go through 6648275SEric Cheng * the 'OTH' type path without DLS bypass. 
6658275SEric Cheng */ 6668275SEric Cheng 6678833SVenu.Iyer@Sun.COM ipha = (ipha_t *)(mp->b_rptr + hdrsize); 6688275SEric Cheng if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) 6698275SEric Cheng type = OTH; 6708275SEric Cheng 6718275SEric Cheng if (type == OTH) { 6728275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], 6738275SEric Cheng cnt[type], bw_ctl, sz[type], sz1, mp); 6748275SEric Cheng continue; 6758275SEric Cheng } 6768275SEric Cheng 6778275SEric Cheng ASSERT(type == UNDEF); 6788275SEric Cheng /* 6798275SEric Cheng * We look for at least 4 bytes past the IP header to get 6808275SEric Cheng * the port information. If we get an IP fragment, we don't 6818275SEric Cheng * have the port information, and we use just the protocol 6828275SEric Cheng * information. 6838275SEric Cheng */ 6848275SEric Cheng switch (ipha->ipha_protocol) { 6858275SEric Cheng case IPPROTO_TCP: 6868275SEric Cheng type = V4_TCP; 6878833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 6888275SEric Cheng break; 6898275SEric Cheng case IPPROTO_UDP: 6908275SEric Cheng type = V4_UDP; 6918833SVenu.Iyer@Sun.COM mp->b_rptr += hdrsize; 6928275SEric Cheng break; 6938275SEric Cheng default: 6948275SEric Cheng type = OTH; 6958275SEric Cheng break; 6968275SEric Cheng } 6978275SEric Cheng 6988275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type], 6998275SEric Cheng bw_ctl, sz[type], sz1, mp); 7008275SEric Cheng } 7018275SEric Cheng 7028275SEric Cheng for (type = V4_TCP; type < UNDEF; type++) { 7038275SEric Cheng if (headmp[type] != NULL) { 7048833SVenu.Iyer@Sun.COM mac_soft_ring_t *softring; 7058833SVenu.Iyer@Sun.COM 7068275SEric Cheng ASSERT(tailmp[type]->b_next == NULL); 7078275SEric Cheng switch (type) { 7088275SEric Cheng case V4_TCP: 7098275SEric Cheng softring = mac_srs->srs_tcp_soft_rings[0]; 7108275SEric Cheng break; 7118275SEric Cheng case V4_UDP: 7128275SEric Cheng softring = mac_srs->srs_udp_soft_rings[0]; 7138275SEric Cheng break; 7148275SEric Cheng case OTH: 
				softring = mac_srs->srs_oth_soft_rings[0];
			}
			mac_rx_soft_ring_process(mcip, softring,
			    headmp[type], tailmp[type], cnt[type], sz[type]);
		}
	}
}

/*
 * Debug counter: number of IPv6 packets fanned out to the default
 * ('other') soft ring because their IPv6 header was not 32-bit aligned
 * or not wholly contained in one mblk (see mac_rx_srs_long_fanout()).
 */
int fanout_unalligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for IPv6.
 *
 * Computes the destination soft ring index (*indx) and packet type
 * (*type) for the packet 'mp', whose MAC header is 'hdrsize' bytes and
 * whose bound SAP is 'sap'.  For non-IPv6 SAPs the packet is simply
 * directed to the first 'other' soft ring (*indx = 0, *type = OTH).
 *
 * Returns 0 on success (*type and *indx are set) and -1 if the packet
 * is malformed; on -1 the caller is expected to drop the packet.
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
    uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx)
{
	ip6_t		*ip6h;
	uint8_t		*whereptr;
	uint_t		hash;
	uint16_t	remlen;
	uint8_t		nexthdr;
	uint16_t	hdr_len;

	if (sap == ETHERTYPE_IPV6) {
		boolean_t modifiable = B_TRUE;

		ASSERT(MBLKL(mp) >= hdrsize);

		ip6h = (ip6_t *)(mp->b_rptr + hdrsize);
		if ((unsigned char *)ip6h == mp->b_wptr) {
			/*
			 * The first mblk_t only includes the mac header.
			 * Note that it is safe to change the mp pointer here,
			 * as the subsequent operation does not assume mp
			 * points to the start of the mac header.
			 */
			mp = mp->b_cont;

			/*
			 * Make sure ip6h holds the full ip6_t structure.
			 */
			if (mp == NULL)
				return (-1);

			if (MBLKL(mp) < IPV6_HDR_LEN) {
				/*
				 * Only pull the header up if we own the
				 * dblk exclusively; pullupmsg() on a shared
				 * dblk would modify data visible to other
				 * references.
				 */
				modifiable = (DB_REF(mp) == 1);

				if (modifiable &&
				    !pullupmsg(mp, IPV6_HDR_LEN)) {
					return (-1);
				}
			}

			ip6h = (ip6_t *)mp->b_rptr;
		}

		if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
		    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
			/*
			 * If either ip6h is not aligned, or ip6h does not
			 * hold the complete ip6_t structure (a pullupmsg()
			 * is not an option since it would result in an
			 * unaligned ip6h), fanout to the default ring. Note
			 * that this may cause packet reordering.
			 */
			*indx = 0;
			*type = OTH;
			fanout_unalligned++;
			return (0);
		}

		remlen = ntohs(ip6h->ip6_plen);
		nexthdr = ip6h->ip6_nxt;

		/* Reject packets whose payload length is implausibly small */
		if (remlen < MIN_EHDR_LEN)
			return (-1);
		/*
		 * Do src based fanout if below tunable is set to B_TRUE or
		 * when mac_ip_hdr_length_v6() fails because of malformed
		 * packets or because mblk's need to be concatenated using
		 * pullupmsg().
		 */
		if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h,
		    &hdr_len, &nexthdr, NULL, NULL)) {
			goto src_based_fanout;
		}
		whereptr = (uint8_t *)ip6h + hdr_len;

		/* If the transport is one of below, we do port based fanout */
		switch (nexthdr) {
		case IPPROTO_TCP:
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			/*
			 * If the ports in the transport header is not part of
			 * the mblk, do src_based_fanout, instead of calling
			 * pullupmsg().
			 */
			if (mp->b_cont != NULL &&
			    whereptr + PORTS_SIZE > mp->b_wptr) {
				goto src_based_fanout;
			}
			break;
		default:
			break;
		}

		switch (nexthdr) {
		case IPPROTO_TCP:
			/* Hash on v4-mapped src address and the ports */
			hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
			    *(uint32_t *)whereptr);
			*indx = COMPUTE_INDEX(hash,
			    mac_srs->srs_tcp_ring_count);
			*type = OTH;
			break;

		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
				    *(uint32_t *)whereptr);
				*indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				/* Round-robin over the UDP soft rings */
				*indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			*type = OTH;
			break;

		/* For all other protocol, do source based fanout */
		default:
			goto src_based_fanout;
		}
	} else {
		*indx = 0;
		*type = OTH;
	}
	return (0);

src_based_fanout:
	/* Fanout on the source address alone (ports unavailable/unneeded) */
	hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
	*indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
	*type = OTH;
	return (0);
}

/*
 * mac_rx_srs_fanout
 *
 * This routine delivers packets destined to an SRS into a soft ring member
 * of the set.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined for one of the TCP, UDP or OTH soft rings. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then next packets comes in forcing it to wake up etc.
 *
 * Note:
 * Since we know what is the maximum fanout possible, we create a 2D array
 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 * variables so that we can enter the softrings with chain. We need the
 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 * for each packet would be expensive).
 * If we ever want to have the
 * ability to have unlimited fanout, we should probably declare a head,
 * tail, cnt, sz with each soft ring (a data struct which contains a softring
 * along with these members) and create an array of this uber struct so we
 * don't have to do kmem_alloc.
 */

/*
 * Debug counters recording why packets fell back to the 'other' soft
 * rings (oth2/oth5 appear unused in this path — NOTE(review): confirm
 * against the rest of the file before removing).
 */
int fanout_oth1 = 0;
int fanout_oth2 = 0;
int fanout_oth3 = 0;
int fanout_oth4 = 0;
int fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	uint_t				indx;
	size_t				ports_offset;
	size_t				ipha_len;
	size_t				hdrsize;
	uint_t				hash;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	int				fanout_cnt;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and its a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs. Another way of disabling bypass is to set the
	 * MCIS_RX_BYPASS_DISABLE flag.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0) &&
	    ((mcip->mci_state_flags & MCIS_RX_BYPASS_DISABLE) == 0);

	/*
	 * Since the softrings are never destroyed and we always
	 * create equal number of softrings for TCP, UDP and rest,
	 * its OK to check one of them for count and use it without
	 * any lock. In future, if soft rings get destroyed because
	 * of reduction in fanout, we will need to ensure that happens
	 * behind the SRS_PROC.
	 */
	fanout_cnt = mac_srs->srs_tcp_ring_count;

	/* Per-type, per-ring sub-chain accumulators, built on the stack */
	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			/* Multicast/broadcast have the group bit set */
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			/* No bypass: fan out on addresses/ports only */
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}


		/*
		 * If we are using the default Rx ring where H/W or S/W
		 * classification has not happened, we need to verify if
		 * this unicast packet really belongs to us.
		 */
		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
			type = OTH;
			fanout_oth1++;
		}

		if (type != OTH) {
			uint16_t frag_offset_flags;

			switch (ipha->ipha_protocol) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
			case IPPROTO_ESP:
				ipha_len = IPH_HDR_LENGTH(ipha);
				/* Ports must be present in this mblk */
				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
				    mp->b_wptr) {
					type = OTH;
					break;
				}
				/* Fragments carry no ports; use slow path */
				frag_offset_flags =
				    ntohs(ipha->ipha_fragment_offset_and_flags);
				if ((frag_offset_flags &
				    (IPH_MF | IPH_OFFSET)) != 0) {
					type = OTH;
					fanout_oth3++;
					break;
				}
				ports_offset = hdrsize + ipha_len;
				break;
			default:
				type = OTH;
				fanout_oth4++;
				break;
			}
		}

		if (type == OTH) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);

		/*
		 * XXX-Sunay: We should hold srs_lock since ring_count
		 * below can change. But if we are always called from
		 * mac_rx_srs_drain and SRS_PROC is set, then we can
		 * enforce that ring_count can't be changed i.e.
		 * to change fanout type or ring count, the calling
		 * thread needs to be behind SRS_PROC.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			/*
			 * Note that for ESP, we fanout on SPI and it is at the
			 * same offset as the 2x16-bit ports. So it is clumped
			 * along with TCP, UDP and SCTP.
			 */
			hash = HASH_ADDR(ipha->ipha_src,
			    *(uint32_t *)(mp->b_rptr + ports_offset));
			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
			type = V4_TCP;
			/* Strip the MAC header before handing to the squeue */
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(ipha->ipha_src,
				    *(uint32_t *)(mp->b_rptr + ports_offset));
				indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				/* Round-robin over the UDP soft rings */
				indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			type = V4_UDP;
			/* Strip the MAC header before handing to the squeue */
			mp->b_rptr += hdrsize;
			break;
		default:
			indx = 0;
			type = OTH;
		}

		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
	}

	/*
	 * Deliver each accumulated sub-chain to its soft ring.  'type'
	 * only ranges over V4_TCP, V4_UDP and OTH here, so the switch
	 * below always assigns 'softring'.
	 */
	for (type = V4_TCP; type < UNDEF; type++) {
		int	i;

		for (i = 0; i < fanout_cnt; i++) {
			if (headmp[type][i] != NULL) {
				mac_soft_ring_t	*softring;

				ASSERT(tailmp[type][i]->b_next == NULL);
				switch (type) {
				case V4_TCP:
					softring =
					    mac_srs->srs_tcp_soft_rings[i];
					break;
				case V4_UDP:
					softring =
					    mac_srs->srs_udp_soft_rings[i];
					break;
				case OTH:
					softring =
					    mac_srs->srs_oth_soft_rings[i];
					break;
				}
				mac_rx_soft_ring_process(mcip,
				    softring, headmp[type][i], tailmp[type][i],
				    cnt[type][i], sz[type][i]);
			}
		}
	}
}

/* Default byte budget for one poll of the hardware ring (tunable below) */
#define	SRS_BYTES_TO_PICKUP	150000
ssize_t	max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;

/*
 * mac_rx_srs_poll_ring
 *
 * This SRS Poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker thread
 * to do the remaining processing.
12048275SEric Cheng * 12058275SEric Cheng * Since packets come in the system via interrupt or poll path, we also 12068275SEric Cheng * update the stats and deal with promiscous clients here. 12078275SEric Cheng */ 12088275SEric Cheng void 12098275SEric Cheng mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs) 12108275SEric Cheng { 12118275SEric Cheng kmutex_t *lock = &mac_srs->srs_lock; 12128275SEric Cheng kcondvar_t *async = &mac_srs->srs_cv; 12138275SEric Cheng mac_srs_rx_t *srs_rx = &mac_srs->srs_rx; 12148275SEric Cheng mblk_t *head, *tail, *mp; 12158275SEric Cheng callb_cpr_t cprinfo; 12168275SEric Cheng ssize_t bytes_to_pickup; 12178275SEric Cheng size_t sz; 12188275SEric Cheng int count; 12198275SEric Cheng mac_client_impl_t *smcip; 12208275SEric Cheng 12218275SEric Cheng CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll"); 12228275SEric Cheng mutex_enter(lock); 12238275SEric Cheng 12248275SEric Cheng start: 12258275SEric Cheng for (;;) { 12268275SEric Cheng if (mac_srs->srs_state & SRS_PAUSE) 12278275SEric Cheng goto done; 12288275SEric Cheng 12298275SEric Cheng CALLB_CPR_SAFE_BEGIN(&cprinfo); 12308275SEric Cheng cv_wait(async, lock); 12318275SEric Cheng CALLB_CPR_SAFE_END(&cprinfo, lock); 12328275SEric Cheng 12338275SEric Cheng if (mac_srs->srs_state & SRS_PAUSE) 12348275SEric Cheng goto done; 12358275SEric Cheng 12368275SEric Cheng check_again: 12378275SEric Cheng if (mac_srs->srs_type & SRST_BW_CONTROL) { 12388275SEric Cheng /* 12398275SEric Cheng * We pick as many bytes as we are allowed to queue. 12408275SEric Cheng * Its possible that we will exceed the total 12418275SEric Cheng * packets queued in case this SRS is part of the 12428275SEric Cheng * Rx ring group since > 1 poll thread can be pulling 12438275SEric Cheng * upto the max allowed packets at the same time 12448275SEric Cheng * but that should be OK. 
12458275SEric Cheng */ 12468275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 12478275SEric Cheng bytes_to_pickup = 12488275SEric Cheng mac_srs->srs_bw->mac_bw_drop_threshold - 12498275SEric Cheng mac_srs->srs_bw->mac_bw_sz; 12508275SEric Cheng /* 12518275SEric Cheng * We shouldn't have been signalled if we 12528275SEric Cheng * have 0 or less bytes to pick but since 12538275SEric Cheng * some of the bytes accounting is driver 12548275SEric Cheng * dependant, we do the safety check. 12558275SEric Cheng */ 12568275SEric Cheng if (bytes_to_pickup < 0) 12578275SEric Cheng bytes_to_pickup = 0; 12588275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 12598275SEric Cheng } else { 12608275SEric Cheng /* 12618275SEric Cheng * ToDO: Need to change the polling API 12628275SEric Cheng * to add a packet count and a flag which 12638275SEric Cheng * tells the driver whether we want packets 12648275SEric Cheng * based on a count, or bytes, or all the 12658275SEric Cheng * packets queued in the driver/HW. This 12668275SEric Cheng * way, we never have to check the limits 12678275SEric Cheng * on poll path. We truly let only as many 12688275SEric Cheng * packets enter the system as we are willing 12698275SEric Cheng * to process or queue. 12708275SEric Cheng * 12718275SEric Cheng * Something along the lines of 12728275SEric Cheng * pkts_to_pickup = mac_soft_ring_max_q_cnt - 12738275SEric Cheng * mac_srs->srs_poll_pkt_cnt 12748275SEric Cheng */ 12758275SEric Cheng 12768275SEric Cheng /* 12778275SEric Cheng * Since we are not doing B/W control, pick 12788275SEric Cheng * as many packets as allowed. 
12798275SEric Cheng */ 12808275SEric Cheng bytes_to_pickup = max_bytes_to_pickup; 12818275SEric Cheng } 12828275SEric Cheng 12838275SEric Cheng /* Poll the underlying Hardware */ 12848275SEric Cheng mutex_exit(lock); 12858275SEric Cheng head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup); 12868275SEric Cheng mutex_enter(lock); 12878275SEric Cheng 12888275SEric Cheng ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) == 12898275SEric Cheng SRS_POLL_THR_OWNER); 12908275SEric Cheng 12918275SEric Cheng mp = tail = head; 12928275SEric Cheng count = 0; 12938275SEric Cheng sz = 0; 12948275SEric Cheng while (mp != NULL) { 12958275SEric Cheng tail = mp; 12968275SEric Cheng sz += msgdsize(mp); 12978275SEric Cheng mp = mp->b_next; 12988275SEric Cheng count++; 12998275SEric Cheng } 13008275SEric Cheng 13018275SEric Cheng if (head != NULL) { 13028275SEric Cheng tail->b_next = NULL; 13038275SEric Cheng smcip = mac_srs->srs_mcip; 13048275SEric Cheng 13058275SEric Cheng if ((mac_srs->srs_type & SRST_FLOW) || 13068275SEric Cheng (smcip == NULL)) { 13078275SEric Cheng FLOW_STAT_UPDATE(mac_srs->srs_flent, 13088275SEric Cheng rbytes, sz); 13098275SEric Cheng FLOW_STAT_UPDATE(mac_srs->srs_flent, 13108275SEric Cheng ipackets, count); 13118275SEric Cheng } 13128275SEric Cheng 13138275SEric Cheng /* 13148275SEric Cheng * If there are any promiscuous mode callbacks 13158275SEric Cheng * defined for this MAC client, pass them a copy 13168275SEric Cheng * if appropriate and also update the counters. 
13178275SEric Cheng */ 13188275SEric Cheng if (smcip != NULL) { 13198275SEric Cheng smcip->mci_stat_ibytes += sz; 13208275SEric Cheng smcip->mci_stat_ipackets += count; 13218275SEric Cheng 13228275SEric Cheng if (smcip->mci_mip->mi_promisc_list != NULL) { 13238275SEric Cheng mutex_exit(lock); 13248275SEric Cheng mac_promisc_dispatch(smcip->mci_mip, 13258275SEric Cheng head, NULL); 13268275SEric Cheng mutex_enter(lock); 13278275SEric Cheng } 13288275SEric Cheng } 13298275SEric Cheng if (mac_srs->srs_type & SRST_BW_CONTROL) { 13308275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 13318275SEric Cheng mac_srs->srs_bw->mac_bw_polled += sz; 13328275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 13338275SEric Cheng } 13348275SEric Cheng srs_rx->sr_poll_count += count; 13358275SEric Cheng MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, 13368275SEric Cheng count, sz); 13378275SEric Cheng if (count <= 10) 13388275SEric Cheng srs_rx->sr_chain_cnt_undr10++; 13398275SEric Cheng else if (count > 10 && count <= 50) 13408275SEric Cheng srs_rx->sr_chain_cnt_10to50++; 13418275SEric Cheng else 13428275SEric Cheng srs_rx->sr_chain_cnt_over50++; 13438275SEric Cheng } 13448275SEric Cheng 13458275SEric Cheng /* 13468275SEric Cheng * We are guaranteed that SRS_PROC will be set if we 13478275SEric Cheng * are here. Also, poll thread gets to run only if 13488275SEric Cheng * the drain was being done by a worker thread although 13498275SEric Cheng * its possible that worker thread is still running 13508275SEric Cheng * and poll thread was sent down to keep the pipeline 13518275SEric Cheng * going instead of doing a complete drain and then 13528275SEric Cheng * trying to poll the NIC. 13538275SEric Cheng * 13548275SEric Cheng * So we need to check SRS_WORKER flag to make sure 13558275SEric Cheng * that the worker thread is not processing the queue 13568275SEric Cheng * in parallel to us. 
The flags and conditions are 13578275SEric Cheng * protected by the srs_lock to prevent any race. We 13588275SEric Cheng * ensure that we don't drop the srs_lock from now 13598275SEric Cheng * till the end and similarly we don't drop the srs_lock 13608275SEric Cheng * in mac_rx_srs_drain() till similar condition check 13618275SEric Cheng * are complete. The mac_rx_srs_drain() needs to ensure 13628275SEric Cheng * that SRS_WORKER flag remains set as long as its 13638275SEric Cheng * processing the queue. 13648275SEric Cheng */ 13658275SEric Cheng if (!(mac_srs->srs_state & SRS_WORKER) && 13668275SEric Cheng (mac_srs->srs_first != NULL)) { 13678275SEric Cheng /* 13688275SEric Cheng * We have packets to process and worker thread 13698833SVenu.Iyer@Sun.COM * is not running. Check to see if poll thread is 13708833SVenu.Iyer@Sun.COM * allowed to process. 13718275SEric Cheng */ 13728833SVenu.Iyer@Sun.COM if (mac_srs->srs_state & SRS_LATENCY_OPT) { 13738275SEric Cheng mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC); 13749209SEric Cheng if (!(mac_srs->srs_state & SRS_PAUSE) && 13759209SEric Cheng srs_rx->sr_poll_pkt_cnt <= 13768275SEric Cheng srs_rx->sr_lowat) { 13778275SEric Cheng srs_rx->sr_poll_again++; 13788275SEric Cheng goto check_again; 13798833SVenu.Iyer@Sun.COM } 13808833SVenu.Iyer@Sun.COM /* 13818833SVenu.Iyer@Sun.COM * We are already above low water mark 13828833SVenu.Iyer@Sun.COM * so stay in the polling mode but no 13838833SVenu.Iyer@Sun.COM * need to poll. 
Once we dip below 13848833SVenu.Iyer@Sun.COM * the polling threshold, the processing 13858833SVenu.Iyer@Sun.COM * thread (soft ring) will signal us 13868833SVenu.Iyer@Sun.COM * to poll again (MAC_UPDATE_SRS_COUNT) 13878833SVenu.Iyer@Sun.COM */ 13888833SVenu.Iyer@Sun.COM srs_rx->sr_poll_drain_no_poll++; 13898833SVenu.Iyer@Sun.COM mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 13908833SVenu.Iyer@Sun.COM /* 13918833SVenu.Iyer@Sun.COM * In B/W control case, its possible 13928833SVenu.Iyer@Sun.COM * that the backlog built up due to 13938833SVenu.Iyer@Sun.COM * B/W limit being reached and packets 13948833SVenu.Iyer@Sun.COM * are queued only in SRS. In this case, 13958833SVenu.Iyer@Sun.COM * we should schedule worker thread 13968833SVenu.Iyer@Sun.COM * since no one else will wake us up. 13978833SVenu.Iyer@Sun.COM */ 13988833SVenu.Iyer@Sun.COM if ((mac_srs->srs_type & SRST_BW_CONTROL) && 13998833SVenu.Iyer@Sun.COM (mac_srs->srs_tid == NULL)) { 14008833SVenu.Iyer@Sun.COM mac_srs->srs_tid = 14018833SVenu.Iyer@Sun.COM timeout(mac_srs_fire, mac_srs, 1); 14028833SVenu.Iyer@Sun.COM srs_rx->sr_poll_worker_wakeup++; 14038275SEric Cheng } 14048275SEric Cheng } else { 14058275SEric Cheng /* 14068275SEric Cheng * Wakeup the worker thread for more processing. 14078275SEric Cheng * We optimize for throughput in this case. 14088275SEric Cheng */ 14098275SEric Cheng mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS); 14108275SEric Cheng MAC_SRS_WORKER_WAKEUP(mac_srs); 14118275SEric Cheng srs_rx->sr_poll_sig_worker++; 14128275SEric Cheng } 14138275SEric Cheng } else if ((mac_srs->srs_first == NULL) && 14148275SEric Cheng !(mac_srs->srs_state & SRS_WORKER)) { 14158275SEric Cheng /* 14168275SEric Cheng * There is nothing queued in SRS and 14178275SEric Cheng * no worker thread running. 
Plus we 14188275SEric Cheng * didn't get anything from the H/W 14198275SEric Cheng * as well (head == NULL); 14208275SEric Cheng */ 14218275SEric Cheng ASSERT(head == NULL); 14228275SEric Cheng mac_srs->srs_state &= 14238275SEric Cheng ~(SRS_PROC|SRS_GET_PKTS); 14248275SEric Cheng 14258275SEric Cheng /* 14268275SEric Cheng * If we have a packets in soft ring, don't allow 14278275SEric Cheng * more packets to come into this SRS by keeping the 14288275SEric Cheng * interrupts off but not polling the H/W. The 14298275SEric Cheng * poll thread will get signaled as soon as 14308275SEric Cheng * srs_poll_pkt_cnt dips below poll threshold. 14318275SEric Cheng */ 14328275SEric Cheng if (srs_rx->sr_poll_pkt_cnt == 0) { 14338275SEric Cheng srs_rx->sr_poll_intr_enable++; 14348275SEric Cheng MAC_SRS_POLLING_OFF(mac_srs); 14358275SEric Cheng } else { 14368275SEric Cheng /* 14378275SEric Cheng * We know nothing is queued in SRS 14388275SEric Cheng * since we are here after checking 14398275SEric Cheng * srs_first is NULL. The backlog 14408275SEric Cheng * is entirely due to packets queued 14418275SEric Cheng * in Soft ring which will wake us up 14428275SEric Cheng * and get the interface out of polling 14438275SEric Cheng * mode once the backlog dips below 14448275SEric Cheng * sr_poll_thres. 14458275SEric Cheng */ 14468275SEric Cheng srs_rx->sr_poll_no_poll++; 14478275SEric Cheng } 14488275SEric Cheng } else { 14498275SEric Cheng /* 14508275SEric Cheng * Worker thread is already running. 14518275SEric Cheng * Nothing much to do. If the polling 14528275SEric Cheng * was enabled, worker thread will deal 14538275SEric Cheng * with that. 
14548275SEric Cheng */ 14558275SEric Cheng mac_srs->srs_state &= ~SRS_GET_PKTS; 14568275SEric Cheng srs_rx->sr_poll_goto_sleep++; 14578275SEric Cheng } 14588275SEric Cheng } 14598275SEric Cheng done: 14608275SEric Cheng mac_srs->srs_state |= SRS_POLL_THR_QUIESCED; 14618275SEric Cheng cv_signal(&mac_srs->srs_async); 14628275SEric Cheng /* 14638275SEric Cheng * If this is a temporary quiesce then wait for the restart signal 14648275SEric Cheng * from the srs worker. Then clear the flags and signal the srs worker 14658275SEric Cheng * to ensure a positive handshake and go back to start. 14668275SEric Cheng */ 14678275SEric Cheng while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART))) 14688275SEric Cheng cv_wait(async, lock); 14698275SEric Cheng if (mac_srs->srs_state & SRS_POLL_THR_RESTART) { 14708275SEric Cheng ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED)); 14718275SEric Cheng mac_srs->srs_state &= 14728275SEric Cheng ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART); 14738275SEric Cheng cv_signal(&mac_srs->srs_async); 14748275SEric Cheng goto start; 14758275SEric Cheng } else { 14768275SEric Cheng mac_srs->srs_state |= SRS_POLL_THR_EXITED; 14778275SEric Cheng cv_signal(&mac_srs->srs_async); 14788275SEric Cheng CALLB_CPR_EXIT(&cprinfo); 14798275SEric Cheng thread_exit(); 14808275SEric Cheng } 14818275SEric Cheng } 14828275SEric Cheng 14838275SEric Cheng /* 14848275SEric Cheng * mac_srs_pick_chain 14858275SEric Cheng * 14868275SEric Cheng * In Bandwidth control case, checks how many packets can be processed 14878275SEric Cheng * and return them in a sub chain. 
 */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
	mblk_t *head = NULL;
	mblk_t *tail = NULL;
	size_t sz;
	size_t tsz = 0;
	int cnt = 0;
	mblk_t *mp;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);

	/*
	 * Fast path: if the whole SRS backlog fits within what remains
	 * of this tick's bandwidth quota (or no limit is configured,
	 * mac_bw_limit == 0), hand over the entire queue in one shot
	 * and reset the SRS byte/packet counters.
	 */
	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
	    mac_srs->srs_bw->mac_bw_limit) ||
	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		head = mac_srs->srs_first;
		mac_srs->srs_first = NULL;
		*chain_tail = mac_srs->srs_last;
		mac_srs->srs_last = NULL;
		*chain_sz = mac_srs->srs_size;
		*chain_cnt = mac_srs->srs_count;
		mac_srs->srs_count = 0;
		mac_srs->srs_size = 0;
		return (head);
	}

	/*
	 * Can't clear the entire backlog.
	 * Need to find how many packets to pick
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
	while ((mp = mac_srs->srs_first) != NULL) {
		sz = msgdsize(mp);
		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
		    mac_srs->srs_bw->mac_bw_limit) {
			/*
			 * Taking this packet would exceed the quota for
			 * the tick; mark the SRS bandwidth-enforced and
			 * stop picking.
			 */
			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
				mac_srs->srs_bw->mac_bw_state |=
				    SRS_BW_ENFORCED;
			break;
		}

		/*
		 * The _size & cnt is decremented from the softrings
		 * when they send up the packet for polling to work
		 * properly.
		 */
		tsz += sz;
		cnt++;
		mac_srs->srs_count--;
		mac_srs->srs_size -= sz;
		if (tail != NULL)
			tail->b_next = mp;
		else
			head = mp;
		tail = mp;
		mac_srs->srs_first = mac_srs->srs_first->b_next;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_first == NULL)
		mac_srs->srs_last = NULL;

	/* Terminate the sub-chain and report its tail/count/size */
	if (tail != NULL)
		tail->b_next = NULL;
	*chain_tail = tail;
	*chain_cnt = cnt;
	*chain_sz = tsz;

	return (head);
}

/*
 * mac_rx_srs_drain
 *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. we also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is a equivalent drain routine in bandwidth control mode
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	/* Nothing queued in the SRS; nothing to drain */
	if (mac_srs->srs_first == NULL)
		goto out;

	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
		/*
		 * In the normal case, the SRS worker thread does no
		 * work and we wait for a backlog to build up before
		 * we switch into polling mode. In case we are
		 * optimizing for throughput, we use the worker thread
		 * as well. The goal is to let worker thread process
		 * the queue and poll thread to feed packets into
		 * the queue. As such, we should signal the poll
		 * thread to try and get more packets.
		 *
		 * We could have pulled this check in the POLL_RING
		 * macro itself but keeping it explicit here makes
		 * the architecture more human understandable.
		 */
		MAC_SRS_POLL_RING(mac_srs);
	}

again:
	/* Detach the entire pending chain from the SRS under srs_lock */
	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* Cancel any pending worker-wakeup timeout; we are draining now */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources contraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/* Upcall into the client with srs_lock dropped */
		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	if (!(mac_srs->srs_state & (SRS_BLANK|SRS_PAUSE)) &&
	    (mac_srs->srs_first != NULL)) {
		/*
		 * More packets arrived while we were clearing the
		 * SRS. This can be possible because of one of
		 * three conditions below:
		 * 1) The driver is using multiple worker threads
		 *    to send the packets to us.
		 * 2) The driver has a race in switching
		 *    between interrupt and polling mode or
		 * 3) Packets are arriving in this SRS via the
		 *    S/W classification as well.
		 *
		 * We should switch to polling mode and see if we
		 * need to send the poll thread down. Also, signal
		 * the worker thread to process whats just arrived.
		 */
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
			srs_rx->sr_drain_poll_sig++;
			MAC_SRS_POLL_RING(mac_srs);
		}

		/*
		 * If we didn't signal the poll thread, we need
		 * to deal with the pending packets ourselves.
		 */
		if (proc_type == SRS_WORKER) {
			srs_rx->sr_drain_again++;
			goto again;
		} else {
			srs_rx->sr_drain_worker_sig++;
			cv_signal(&mac_srs->srs_async);
		}
	}

out:
	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done being the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turn polling on only for worker thread.
	 * Its not worth turning polling on for interrupt
	 * thread (since NIC will not issue another interrupt)
	 * unless a backlog builds up.
	 */
	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		srs_rx->sr_drain_keep_polling++;
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
			MAC_SRS_POLL_RING(mac_srs);
		return;
	}

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
	srs_rx->sr_drain_finish_intr++;
}

/*
 * mac_rx_srs_drain_bw
 *
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. we also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is a equivalent drain routine in non bandwidth control mode
 * mac_rx_srs_drain.
 * There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
 */
void
mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	size_t			sz = 0;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	clock_t			now;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
again:
	/* Check if we are doing B/W control */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	now = ddi_get_lbolt();
	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
		/* New tick: reset usage and clear any enforcement */
		mac_srs->srs_bw->mac_bw_curr_time = now;
		mac_srs->srs_bw->mac_bw_used = 0;
		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
		/* Still within the same tick and already over quota */
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto done;
	}

	sz = 0;
	cnt = 0;
	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
		/*
		 * We couldn't pick up a single packet.
		 */
		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
		    (mac_srs->srs_size != 0) &&
		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
			/*
			 * Seems like configured B/W doesn't
			 * even allow processing of 1 packet
			 * per tick.
			 *
			 * XXX: raise the limit to processing
			 * at least 1 packet per tick.
			 */
			mac_srs->srs_bw->mac_bw_limit +=
			    mac_srs->srs_bw->mac_bw_limit;
			mac_srs->srs_bw->mac_bw_drop_threshold +=
			    mac_srs->srs_bw->mac_bw_drop_threshold;
			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
			    "raised B/W limit to %d since not even a "
			    "single packet can be processed per "
			    "tick %d\n", (void *)mac_srs,
			    (int)mac_srs->srs_bw->mac_bw_limit,
			    (int)msgdsize(mac_srs->srs_first));
		}
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* zero bandwidth: drop all and return to interrupt mode */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		srs_rx->sr_drop_count += cnt;
		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
		mac_srs->srs_bw->mac_bw_sz -= sz;
		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		mac_pkt_drop(NULL, NULL, head, B_FALSE);
		goto leave_poll;
	} else {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	}

	/* Cancel any pending worker-wakeup timeout; we are draining now */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);
	MAC_SRS_WORKER_POLLING_ON(mac_srs);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources contraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/* Upcall into the client with srs_lock dropped */
		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);

		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Send the poll thread to pick up any packets arrived
	 * so far. This also serves as the last check in case
	 * nothing else is queued in the SRS. The poll thread
	 * is signalled only in the case the drain was done
	 * by the worker thread and SRS_WORKER is set. The
	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. We we have nothing else to
	 * process, we can exit while leaving SRS_PROC set
	 * which gives the poll thread control to process and
	 * cleanup once it returns from the NIC.
	 *
	 * If we have nothing else to process, we need to
	 * ensure that we keep holding the srs_lock till
	 * all the checks below are done and control is
	 * handed to the poll thread if it was running.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		if (mac_srs->srs_first != NULL) {
			if (proc_type == SRS_WORKER) {
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat)
					MAC_SRS_POLL_RING(mac_srs);
				goto again;
			} else {
				cv_signal(&mac_srs->srs_async);
			}
		}
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

done:

	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		return;
	}

	/*
	 * If we can't process packets because we have exceeded
	 * B/W limit for this tick, just set the timeout
	 * and leave.
	 *
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog. As long as there are packets queued either
	 * is soft ring set or its soft rings, we will leave
	 * the interface in polling mode.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
	    (srs_rx->sr_poll_pkt_cnt > 0))) {
		MAC_SRS_POLLING_ON(mac_srs);
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		if ((mac_srs->srs_first != NULL) &&
		    (mac_srs->srs_tid == NULL))
			mac_srs->srs_tid = timeout(mac_srs_fire,
			    mac_srs, 1);
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		return;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

leave_poll:

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
}

/*
 * mac_srs_worker
 *
 * The SRS worker routine. Drains the queue when no one else is
 * processing it.
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_async;
	callb_cpr_t		cprinfo;
	boolean_t		bw_ctl_flag;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

start:
	for (;;) {
		/*
		 * Recompute the bandwidth-enforcement state for this pass.
		 * A B/W controlled SRS must not be drained while
		 * SRS_BW_ENFORCED is in effect.
		 */
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wakeup the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting for ever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer is already
				 * scheduled or we force schedule one for
				 * later so that we can continue processing
				 * after this quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			/* srs_lock is dropped atomically inside cv_wait() */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			/* Re-evaluate bandwidth enforcement after a wakeup */
			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	if (!(mac_srs->srs_state &
	    SRS_CONDEMNED_DONE))
		mac_srs_worker_quiesce(mac_srs);

	mac_srs->srs_state &= ~SRS_PROC;
	/* The macro drops the srs_lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}

/*
 * mac_rx_srs_subflow_process
 *
 * Receive side routine called from interrupt path when there are
 * sub flows present on this SRS.
 */
/* ARGSUSED */
void
mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	flow_entry_t		*flent = NULL;
	flow_entry_t		*prev_flent = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*tail = NULL;
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_client_impl_t	*mcip;

	mcip = mac_srs->srs_mcip;
	ASSERT(mcip != NULL);

	/*
	 * We need to determine the SRS for every packet
	 * by walking the flow table, if we don't get any,
	 * then we proceed using the SRS we came with.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {

		/*
		 * We will increment the stats for the matching subflow
		 * when we get the bytes/pkt count for the classified packets
		 * later in mac_rx_srs_process.
		 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		/*
		 * Keep extending the current sub-chain while consecutive
		 * packets map to the same flow entry.
		 */
		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		/* Flow changed: terminate and dispatch the sub-chain */
		tail->b_next = NULL;
		/*
		 * A null indicates, this is for the mac_srs itself.
		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}

/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_client_impl_t	*smcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);
	smcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
	}
	if (smcip != NULL) {
		smcip->mci_stat_ibytes += sz;
		smcip->mci_stat_ipackets += count;
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		/* Count the packets and bytes via interrupt */
		srs_rx->sr_intr_count += count;
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_drop_count += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				/* Whole chain fits under the drop threshold */
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				/*
				 * Walk the chain and take only the leading
				 * packets that still fit under the drop
				 * threshold; the remainder is dropped below.
				 */
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_drop_count += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_drop_count += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
	/* Count the packets entering via interrupt path */
	srs_rx->sr_intr_count += count;

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback or if we are not
		 * optimizing for latency, we should signal the
		 * worker thread.
		 */
		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
			/*
			 * For loopback, we need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if its a single packet in non
			 * latency optimized case (in latency optimized
			 * case, we inline process chains of any size).
 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}

/* TX SIDE ROUTINES (RUNTIME) */

/*
 * mac_tx_srs_no_desc
 *
 * This routine is called by Tx single ring default mode
 * when Tx ring runs out of descs.
 */
mac_tx_cookie_t
mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
	boolean_t wakeup_worker = B_TRUE;
	uint32_t tx_mode = srs_tx->st_mode;
	int cnt, sz;
	mblk_t *tail;

	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
	if (flag & MAC_DROP_ON_NO_DESC) {
		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
	} else {
		/* Only wake the worker if it has no backlog already queued */
		if (mac_srs->srs_first != NULL)
			wakeup_worker = B_FALSE;
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If TX_QUEUED is not set, queue the
			 * packet and let mac_tx_srs_drain()
			 * set the TX_BLOCKED bit for the
			 * reasons explained above. Otherwise,
			 * return the mblks.
			 */
			if (wakeup_worker) {
				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
				    mp_chain, tail, cnt, sz);
			} else {
				MAC_TX_SET_NO_ENQUEUE(mac_srs,
				    mp_chain, ret_mp, cookie);
			}
		} else {
			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
			    tail, cnt, sz, cookie);
		}
		if (wakeup_worker)
			cv_signal(&mac_srs->srs_async);
	}
	return (cookie);
}

/*
 * mac_tx_srs_enqueue
 *
 * This routine is called when Tx SRS is operating in either serializer
 * or bandwidth mode. In serializer mode, a packet will get enqueued
 * when a thread cannot enter SRS exclusively. In bandwidth mode,
 * packets gets queued if allowed byte-count limit for a tick is
 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
 * MAC_TX_NO_ENQUEUE is set is different than when operating in either
 * the default mode or fanout mode. Here packets get dropped or
 * returned back to the caller only after hi-watermark worth of data
 * is queued.
 */
static mac_tx_cookie_t
mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	int cnt, sz;
	mblk_t *tail;
	boolean_t wakeup_worker = B_TRUE;

	/*
	 * Ignore fanout hint if we don't have multiple tx rings.
	 */
	if (!TX_MULTI_RING_MODE(mac_srs))
		fanout_hint = 0;

	/* Only wake the worker if nothing is queued ahead of this chain */
	if (mac_srs->srs_first != NULL)
		wakeup_worker = B_FALSE;
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		} else {
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else if (flag & MAC_TX_NO_ENQUEUE) {
		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
			    ret_mp, cookie);
		} else {
			/* Save the fanout hint for the drainer */
			mp_chain->b_prev = (mblk_t *)fanout_hint;
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else {
		/*
		 * If you are BW_ENFORCED, just enqueue the
		 * packet. srs_worker will drain it at the
		 * prescribed rate. Before enqueueing, save
		 * the fanout hint.
		 */
		mp_chain->b_prev = (mblk_t *)fanout_hint;
		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
		    tail, cnt, sz, cookie);
	}
	if (wakeup_worker)
		cv_signal(&mac_srs->srs_async);
	return (cookie);
}

/*
 * There are five tx modes:
 *
 * 1) Default mode (SRS_TX_DEFAULT)
 * 2) Serialization mode (SRS_TX_SERIALIZE)
 * 3) Fanout mode (SRS_TX_FANOUT)
 * 4) Bandwidth mode (SRS_TX_BW)
 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
 *
 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
 * based on the number of Tx rings requested for an SRS and whether
 * bandwidth control is requested or not.
 *
 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
 * When flow-control is relieved, the srs_worker drains the queued
 * packets and informs blocked clients to restart sending packets.
 *
 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
 *
 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
 * Tx rings. Each Tx ring will have a soft ring associated with it.
 * These soft rings will be hung off the Tx SRS.
 * Queueing, if it happens
 * due to lack of Tx desc, will be in individual soft ring (and not srs)
 * associated with Tx ring.
 *
 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
 * only if bw is available. Otherwise the packets will be queued in
 * SRS. If fanout to multiple Tx rings is configured, the packets will
 * be fanned out among the soft rings associated with the Tx rings.
 *
 * Three flags are used in srs_state for indicating flow control
 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
 * driver below.
 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
 * and flow-control pressure is applied back to clients. The clients expect
 * wakeup when flow-control is relieved.
 * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and mblk
 * got returned back to client either due to lack of Tx descs or due to bw
 * control reasons. The clients expect a wakeup when condition is relieved.
 *
 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
 * MAC_TX_NO_ENQUEUE
 * Mac clients that do not want packets to be enqueued in the mac layer set
 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
 * behaviour of this flag is different when the Tx is running in serializer
 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packets
 * get dropped when Tx high watermark is reached.
 * There are some mac clients like vsw, aggr that want the mblks to be
 * returned back to clients instead of being queued in Tx SRS (or Tx soft
 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
 * In the default and Tx fanout mode, the un-transmitted mblks will be
 * returned back to the clients when the driver runs out of Tx descs.
 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
 * soft ring) so that the clients can be woken up when Tx desc become
 * available. When running in serializer or bandwidth mode,
 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
 */

mac_tx_func_t
mac_tx_get_func(uint32_t mode)
{
	return (mac_tx_mode_list[mode].mac_tx_func);
}

/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;

	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);

	/* Regular case with a single Tx ring */
	/*
	 * SRS_TX_BLOCKED is set when underlying NIC runs
	 * out of Tx descs and messages start getting
	 * queued. It won't get reset until
	 * tx_srs_drain() completely drains out the
	 * messages.
	 */
	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
		/* Tx descs/resources not available */
		mutex_enter(&mac_srs->srs_lock);
		/* Re-check under srs_lock before queueing */
		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
			    flag, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		/*
		 * While we were computing mblk count, the
		 * flow control condition got relieved.
		 * Continue with the transmission.
		 */
		mutex_exit(&mac_srs->srs_lock);
	}

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	/*
	 * Multiple threads could be here sending packets.
	 * Under such conditions, it is not possible to
	 * atomically set SRS_TX_BLOCKED bit to indicate
	 * out of tx desc condition. To atomically set
	 * this, we queue the returned packet and do
	 * the setting of SRS_TX_BLOCKED in
	 * mac_tx_srs_drain().
	 */
	if (mp_chain != NULL) {
		mutex_enter(&mac_srs->srs_lock);
		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	if (is_subflow)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (NULL);
}

/*
 * mac_tx_serialize_mode
 *
 * This is an experimental mode implemented as per the request of PAE.
 * In this mode, all callers attempting to send a packet to the NIC
 * will get serialized. Only one thread at any time will access the
 * NIC to send the packet out.
 */
/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	/* Single ring, serialize below */
	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
	mutex_enter(&mac_srs->srs_lock);
	/*
	 * Another thread is already sending (SRS_PROC) or packets are
	 * already queued: preserve ordering by enqueueing behind them.
	 */
	if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_state & SRS_PROC)) {
		/*
		 * In serialization mode, queue all packets until
		 * TX_HIWAT is set.
		 * If drop bit is set, drop if TX_HIWAT is set.
		 * If no_enqueue is set, still enqueue until hiwat
		 * is set and return mblks after TX_HIWAT is set.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
		    flag, NULL, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	/*
	 * No packets queued, nothing on proc and no flow
	 * control condition. Fast-path, ok. Do inline
	 * processing.
	 */
	mac_srs->srs_state |= SRS_PROC;
	mutex_exit(&mac_srs->srs_lock);

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	/* send without srs_lock held; SRS_PROC keeps other senders out */
	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state &= ~SRS_PROC;
	if (mp_chain != NULL) {
		/* driver took only part of the chain; queue the rest */
		cookie = mac_tx_srs_enqueue(mac_srs,
		    mp_chain, flag, NULL, ret_mp);
	}
	if (mac_srs->srs_first != NULL) {
		/*
		 * We processed inline our packet and a new
		 * packet/s got queued while we were
		 * processing. Wakeup srs worker
		 */
		cv_signal(&mac_srs->srs_async);
	}
	mutex_exit(&mac_srs->srs_lock);

	/* only update stats if the whole chain went out (cookie == NULL) */
	if (is_subflow && cookie == NULL)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (cookie);
}

/*
 * mac_tx_fanout_mode
 *
 * In this mode, the SRS will have access to multiple Tx rings to send
 * the packet out. The fanout hint that is passed as an argument is
 * used to find an appropriate ring to fanout the traffic. Each Tx
 * ring, in turn, will have a soft ring associated with it. If a Tx
 * ring runs out of Tx desc's the returned packet will be queued in
 * the soft ring associated with that Tx ring. The srs itself will not
 * queue any packets.
27668275SEric Cheng */ 27678833SVenu.Iyer@Sun.COM 27688833SVenu.Iyer@Sun.COM #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 27698833SVenu.Iyer@Sun.COM index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 27708833SVenu.Iyer@Sun.COM softring = mac_srs->srs_oth_soft_rings[index]; \ 27718833SVenu.Iyer@Sun.COM cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 27728833SVenu.Iyer@Sun.COM DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 27738833SVenu.Iyer@Sun.COM } 27748833SVenu.Iyer@Sun.COM 27758275SEric Cheng static mac_tx_cookie_t 27768275SEric Cheng mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 27778275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 27788275SEric Cheng { 27798275SEric Cheng mac_soft_ring_t *softring; 27808833SVenu.Iyer@Sun.COM uint64_t hash; 27818833SVenu.Iyer@Sun.COM uint_t index; 27828833SVenu.Iyer@Sun.COM mac_tx_cookie_t cookie = NULL; 27838275SEric Cheng 27848275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 27858833SVenu.Iyer@Sun.COM if (fanout_hint != 0) { 27868833SVenu.Iyer@Sun.COM /* 27878833SVenu.Iyer@Sun.COM * The hint is specified by the caller, simply pass the 27888833SVenu.Iyer@Sun.COM * whole chain to the soft ring. 27898833SVenu.Iyer@Sun.COM */ 27908833SVenu.Iyer@Sun.COM hash = HASH_HINT(fanout_hint); 27918833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(mp_chain); 27928833SVenu.Iyer@Sun.COM } else { 27938833SVenu.Iyer@Sun.COM mblk_t *last_mp, *cur_mp, *sub_chain; 27948833SVenu.Iyer@Sun.COM uint64_t last_hash = 0; 27958833SVenu.Iyer@Sun.COM uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 27968833SVenu.Iyer@Sun.COM 27978833SVenu.Iyer@Sun.COM /* 27988833SVenu.Iyer@Sun.COM * Compute the hash from the contents (headers) of the 27998833SVenu.Iyer@Sun.COM * packets of the mblk chain. Split the chains into 28008833SVenu.Iyer@Sun.COM * subchains of the same conversation. 
28018833SVenu.Iyer@Sun.COM * 28028833SVenu.Iyer@Sun.COM * Since there may be more than one ring used for 28038833SVenu.Iyer@Sun.COM * sub-chains of the same call, and since the caller 28048833SVenu.Iyer@Sun.COM * does not maintain per conversation state since it 28058833SVenu.Iyer@Sun.COM * passed a zero hint, unsent subchains will be 28068833SVenu.Iyer@Sun.COM * dropped. 28078833SVenu.Iyer@Sun.COM */ 28088833SVenu.Iyer@Sun.COM 28098833SVenu.Iyer@Sun.COM flag |= MAC_DROP_ON_NO_DESC; 28108833SVenu.Iyer@Sun.COM ret_mp = NULL; 28118833SVenu.Iyer@Sun.COM 28128833SVenu.Iyer@Sun.COM ASSERT(ret_mp == NULL); 28138833SVenu.Iyer@Sun.COM 28148833SVenu.Iyer@Sun.COM sub_chain = NULL; 28158833SVenu.Iyer@Sun.COM last_mp = NULL; 28168833SVenu.Iyer@Sun.COM 28178833SVenu.Iyer@Sun.COM for (cur_mp = mp_chain; cur_mp != NULL; 28188833SVenu.Iyer@Sun.COM cur_mp = cur_mp->b_next) { 28198833SVenu.Iyer@Sun.COM hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 28208833SVenu.Iyer@Sun.COM B_TRUE); 28218833SVenu.Iyer@Sun.COM if (last_hash != 0 && hash != last_hash) { 28228833SVenu.Iyer@Sun.COM /* 28238833SVenu.Iyer@Sun.COM * Starting a different subchain, send current 28248833SVenu.Iyer@Sun.COM * chain out. 
28258833SVenu.Iyer@Sun.COM */ 28268833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 28278833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 28288833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 28298833SVenu.Iyer@Sun.COM sub_chain = NULL; 28308833SVenu.Iyer@Sun.COM } 28318833SVenu.Iyer@Sun.COM 28328833SVenu.Iyer@Sun.COM /* add packet to subchain */ 28338833SVenu.Iyer@Sun.COM if (sub_chain == NULL) 28348833SVenu.Iyer@Sun.COM sub_chain = cur_mp; 28358833SVenu.Iyer@Sun.COM last_mp = cur_mp; 28368833SVenu.Iyer@Sun.COM last_hash = hash; 28378833SVenu.Iyer@Sun.COM } 28388833SVenu.Iyer@Sun.COM 28398833SVenu.Iyer@Sun.COM if (sub_chain != NULL) { 28408833SVenu.Iyer@Sun.COM /* send last subchain */ 28418833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 28428833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 28438833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 28448833SVenu.Iyer@Sun.COM } 28458833SVenu.Iyer@Sun.COM 28468833SVenu.Iyer@Sun.COM cookie = NULL; 28478833SVenu.Iyer@Sun.COM } 28488833SVenu.Iyer@Sun.COM 28498833SVenu.Iyer@Sun.COM return (cookie); 28508275SEric Cheng } 28518275SEric Cheng 28528275SEric Cheng /* 28538275SEric Cheng * mac_tx_bw_mode 28548275SEric Cheng * 28558275SEric Cheng * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 28568275SEric Cheng * only if bw is available. Otherwise the packets will be queued in 28578275SEric Cheng * SRS. If the SRS has multiple Tx rings, then packets will get fanned 28588275SEric Cheng * out to a Tx rings. 
 */
static mac_tx_cookie_t
mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	int			cnt, sz;
	mblk_t			*tail;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	clock_t			now;

	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		/*
		 * zero bandwidth, no traffic is sent: drop the packets,
		 * or return the whole chain if the caller requests all
		 * unsent packets back.
		 */
		if (flag & MAC_TX_NO_ENQUEUE) {
			cookie = (mac_tx_cookie_t)mac_srs;
			*ret_mp = mp_chain;
		} else {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		}
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	} else if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		/*
		 * Packets already queued or the bandwidth limit is
		 * currently being enforced: queue behind them.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
		    fanout_hint, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	now = ddi_get_lbolt();
	if (mac_srs->srs_bw->mac_bw_curr_time != now) {
		/* new tick: reset the per-tick usage counter */
		mac_srs->srs_bw->mac_bw_curr_time = now;
		mac_srs->srs_bw->mac_bw_used = 0;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
		    mp_chain, tail, cnt, sz);
		/*
		 * Wakeup worker thread. Note that worker
		 * thread has to be woken up so that it
		 * can fire up the timer to be woken up
		 * on the next tick. Also once
		 * BW_ENFORCED is set, it can only be
		 * reset by srs_worker thread. Until then
		 * all packets will get queued up in SRS
		 * and hence this code path won't be
		 * entered until BW_ENFORCED is reset.
		 */
		cv_signal(&mac_srs->srs_async);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	/* charge this chain against the current tick's budget */
	mac_srs->srs_bw->mac_bw_used += sz;
	mutex_exit(&mac_srs->srs_lock);

	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
		/* multiple Tx rings: fan out by hint to a soft ring */
		mac_soft_ring_t *softring;
		uint_t indx, hash;

		hash = HASH_HINT(fanout_hint);
		indx = COMPUTE_INDEX(hash,
		    mac_srs->srs_oth_ring_count);
		softring = mac_srs->srs_oth_soft_rings[indx];
		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
		    ret_mp));
	} else {
		boolean_t		is_subflow;
		mac_tx_stats_t		stats;

		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
		    mp_chain, (is_subflow ? &stats : NULL));

		if (mp_chain != NULL) {
			/*
			 * Partial send: un-charge the unsent bytes from
			 * the bandwidth usage and queue the remainder.
			 */
			mutex_enter(&mac_srs->srs_lock);
			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
			if (mac_srs->srs_bw->mac_bw_used > sz)
				mac_srs->srs_bw->mac_bw_used -= sz;
			else
				mac_srs->srs_bw->mac_bw_used = 0;
			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
			    fanout_hint, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		if (is_subflow)
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

		return (NULL);
	}
}

/*
 * Drain packets queued on a Tx SRS.  Called by the SRS worker (and via
 * the bandwidth timer) with srs_lock held and SRS_PROC clear; SRS_PROC
 * is set for the duration of the drain.  Behavior depends on the Tx
 * mode: DEFAULT/SERIALIZE send the whole queue, BW/BW_FANOUT send only
 * up to the per-tick bandwidth budget.  When the queue empties, the
 * flow-control state bits are cleared and blocked clients are notified.
 */
/* ARGSUSED */
void
mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head, *tail;
	size_t			sz;
	uint32_t		tx_mode;
	uint_t			saved_pkt_count;
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	clock_t			now;

	saved_pkt_count = 0;
	ASSERT(mutex_owned(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_state & SRS_PROC));

	mac_srs->srs_state |= SRS_PROC;

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
	tx_mode = srs_tx->st_mode;
	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
		if (mac_srs->srs_first != NULL) {
			/* detach the whole queue, then send unlocked */
			head = mac_srs->srs_first;
			tail = mac_srs->srs_last;
			saved_pkt_count = mac_srs->srs_count;
			mac_srs->srs_first = NULL;
			mac_srs->srs_last = NULL;
			mac_srs->srs_count = 0;
			mutex_exit(&mac_srs->srs_lock);

			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
			    head, &stats);

			mutex_enter(&mac_srs->srs_lock);
			if (head != NULL) {
				/* Device out of tx desc, set block */
				if (head->b_next == NULL)
					VERIFY(head == tail);
				/* re-queue the remainder at the front */
				tail->b_next = mac_srs->srs_first;
				mac_srs->srs_first = head;
				mac_srs->srs_count +=
				    (saved_pkt_count - stats.ts_opackets);
				if (mac_srs->srs_last == NULL)
					mac_srs->srs_last = tail;
				MAC_TX_SRS_BLOCK(mac_srs, head);
			} else {
				srs_tx->st_woken_up = B_FALSE;
				if (is_subflow) {
					FLOW_TX_STATS_UPDATE(
					    mac_srs->srs_flent, &stats);
				}
			}
		}
	} else if (tx_mode == SRS_TX_BW) {
		/*
		 * We are here because the timer fired and we have some data
		 * to transmit. Also mac_tx_srs_worker should have reset
		 * SRS_BW_ENFORCED flag
		 */
		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
		head = tail = mac_srs->srs_first;
		/* dequeue packets until the per-tick budget is exhausted */
		while (mac_srs->srs_first != NULL) {
			tail = mac_srs->srs_first;
			tail->b_prev = NULL;
			mac_srs->srs_first = tail->b_next;
			if (mac_srs->srs_first == NULL)
				mac_srs->srs_last = NULL;
			mac_srs->srs_count--;
			sz = msgdsize(tail);
			mac_srs->srs_size -= sz;
			saved_pkt_count++;
			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);

			if (mac_srs->srs_bw->mac_bw_used <
			    mac_srs->srs_bw->mac_bw_limit)
				continue;

			now = ddi_get_lbolt();
			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
				/* a new tick started; budget resets */
				mac_srs->srs_bw->mac_bw_curr_time = now;
				mac_srs->srs_bw->mac_bw_used = sz;
				continue;
			}
			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
			break;
		}

		ASSERT((head == NULL && tail == NULL) ||
		    (head != NULL && tail != NULL));
		if (tail != NULL) {
			tail->b_next = NULL;
			mutex_exit(&mac_srs->srs_lock);

			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
			    head, &stats);

			mutex_enter(&mac_srs->srs_lock);
			if (head != NULL) {
				uint_t size_sent;

				/* Device out of tx desc, set block */
				if (head->b_next == NULL)
					VERIFY(head == tail);
				tail->b_next = mac_srs->srs_first;
				mac_srs->srs_first = head;
				mac_srs->srs_count +=
				    (saved_pkt_count - stats.ts_opackets);
				if (mac_srs->srs_last == NULL)
					mac_srs->srs_last = tail;
				/* credit back what did not go out */
				size_sent = sz - stats.ts_obytes;
				mac_srs->srs_size += size_sent;
				mac_srs->srs_bw->mac_bw_sz += size_sent;
				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
					mac_srs->srs_bw->mac_bw_used -=
					    size_sent;
				} else {
					mac_srs->srs_bw->mac_bw_used = 0;
				}
				MAC_TX_SRS_BLOCK(mac_srs, head);
			} else {
				srs_tx->st_woken_up = B_FALSE;
				if (is_subflow) {
					FLOW_TX_STATS_UPDATE(
					    mac_srs->srs_flent, &stats);
				}
			}
		}
	} else if (tx_mode == SRS_TX_BW_FANOUT) {
		mblk_t *prev;
		mac_soft_ring_t *softring;
		uint64_t hint;

		/*
		 * We are here because the timer fired and we
		 * have some quota to transmit.
		 */
		prev = NULL;
		head = tail = mac_srs->srs_first;
		while (mac_srs->srs_first != NULL) {
			tail = mac_srs->srs_first;
			mac_srs->srs_first = tail->b_next;
			if (mac_srs->srs_first == NULL)
				mac_srs->srs_last = NULL;
			mac_srs->srs_count--;
			sz = msgdsize(tail);
			mac_srs->srs_size -= sz;
			mac_srs->srs_bw->mac_bw_used += sz;
			/*
			 * The fanout hint was stashed in b_prev at
			 * enqueue time; group consecutive packets with
			 * the same hint into one sub-chain per soft
			 * ring.
			 */
			if (prev == NULL)
				hint = (ulong_t)tail->b_prev;
			if (hint != (ulong_t)tail->b_prev) {
				/* hint changed: flush the sub-chain */
				prev->b_next = NULL;
				mutex_exit(&mac_srs->srs_lock);
				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
				head = tail;
				hint = (ulong_t)tail->b_prev;
				mutex_enter(&mac_srs->srs_lock);
			}

			prev = tail;
			tail->b_prev = NULL;
			if (mac_srs->srs_bw->mac_bw_used <
			    mac_srs->srs_bw->mac_bw_limit)
				continue;

			now = ddi_get_lbolt();
			if (mac_srs->srs_bw->mac_bw_curr_time != now) {
				mac_srs->srs_bw->mac_bw_curr_time = now;
				mac_srs->srs_bw->mac_bw_used = 0;
				continue;
			}
			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
			break;
		}
		ASSERT((head == NULL && tail == NULL) ||
		    (head != NULL && tail != NULL));
		if (tail != NULL) {
			/* flush the final sub-chain */
			tail->b_next = NULL;
			mutex_exit(&mac_srs->srs_lock);
			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
			mutex_enter(&mac_srs->srs_lock);
		}
	}
	/*
	 * SRS_TX_FANOUT case not considered here because packets
	 * won't be queued in the SRS for this case. Packets will
	 * be sent directly to soft rings underneath and if there
	 * is any queueing at all, it would be in Tx side soft
	 * rings.
	 */

	/*
	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
	 */
	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
		mac_tx_notify_cb_t *mtnfp;
		mac_cb_t *mcb;
		mac_client_impl_t *mcip = mac_srs->srs_mcip;
		boolean_t wakeup_required = B_FALSE;

		if (mac_srs->srs_state &
		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
			wakeup_required = B_TRUE;
		}
		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
		/* callbacks must run without srs_lock held */
		mutex_exit(&mac_srs->srs_lock);
		if (wakeup_required) {
			/* Wakeup callback registered clients */
			MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
			for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
				mtnfp->mtnf_fn(mtnfp->mtnf_arg,
				    (mac_tx_cookie_t)mac_srs);
			}
			MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
			    &mcip->mci_tx_notify_cb_list);
			/*
			 * If the client is not the primary MAC client, then we
			 * need to send the notification to the clients upper
			 * MAC, i.e. mci_upper_mip.
			 */
			mac_tx_notify(mcip->mci_upper_mip != NULL ?
			    mcip->mci_upper_mip : mcip->mci_mip);
		}
		mutex_enter(&mac_srs->srs_lock);
	}
	mac_srs->srs_state &= ~SRS_PROC;
}

/*
 * Given a packet, get the flow_entry that identifies the flow
 * to which that packet belongs. The flow_entry will contain
 * the transmit function to be used to send the packet. If the
 * function returns NULL, the packet should be sent using the
 * underlying NIC.
 */
static flow_entry_t *
mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
{
	flow_entry_t		*flent = NULL;
	mac_client_impl_t	*mcip;
	int			err;

	/*
	 * Do classification on the packet.
	 */
	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
	if (err != 0)
		return (NULL);

	/*
	 * This flent might just be an additional one on the MAC client,
	 * i.e. for classification purposes (different fdesc), however
	 * the resources, SRS et. al., are in the mci_flent, so if
	 * this isn't the mci_flent, we need to get it.
	 */
	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
		/* swap the hold to the client's primary flent */
		FLOW_REFRELE(flent);
		flent = mcip->mci_flent;
		FLOW_TRY_REFHOLD(flent, err);
		if (err != 0)
			return (NULL);
	}

	return (flent);
}

/*
 * This macro is only meant to be used by mac_tx_send().
 * It performs the per-packet VLAN id check and, when needed, inserts
 * a VLAN tag; on failure it drops the packet, bumps oerrors and
 * 'continue's the enclosing loop (it relies on mac_tx_send()'s locals
 * vid_check, add_tag, vid, src_mcip, next and oerrors).
 */
#define	CHECK_VID_AND_ADD_TAG(mp) {			\
	if (vid_check) {				\
		int err = 0;				\
							\
		MAC_VID_CHECK(src_mcip, (mp), err);	\
		if (err != 0) {				\
			freemsg((mp));			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
	if (add_tag) {					\
		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
		if ((mp) == NULL) {			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
}

/*
 * Send a chain of packets for the given MAC client.  Packets either go
 * straight to the NIC (single active client, no promiscuous callbacks),
 * or are classified per packet and looped back to a local MAC client,
 * handed to a broadcast/multicast flow, or sent to the NIC.  Returns
 * the unsent remainder of the chain (NULL if everything was consumed)
 * and, when 'stats' is non-NULL, fills in packet/byte/error counts.
 */
mblk_t *
mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
    mac_tx_stats_t *stats)
{
	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = src_mcip->mci_mip;
	uint_t obytes = 0, opackets = 0, oerrors = 0;
	mblk_t *mp = NULL, *next;
	boolean_t vid_check, add_tag;
	uint16_t vid = 0;

	if (mip->mi_nclients > 1) {
		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
		add_tag = MAC_TAG_NEEDED(src_mcip);
		if (add_tag)
			vid = mac_client_vid(mch);
	} else {
		ASSERT(mip->mi_nclients == 1);
		vid_check = add_tag = B_FALSE;
	}

	/*
	 * Fastpath: if there's only one client, and there's no
	 * multicast listeners, we simply send the packet down to the
	 * underlying NIC.
	 */
	if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) {
		DTRACE_PROBE2(fastpath,
		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);

		mp = mp_chain;
		while (mp != NULL) {
			next = mp->b_next;
			mp->b_next = NULL;
			opackets++;
			/* MBLKL is cheaper when there is no continuation */
			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
			    msgdsize(mp));

			CHECK_VID_AND_ADD_TAG(mp);
			/* MAC_TX sets mp to the unsent remainder */
			MAC_TX(mip, ring, mp,
			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
			    0));

			/*
			 * If the driver is out of descriptors and does a
			 * partial send it will return a chain of unsent
			 * mblks. Adjust the accounting stats.
			 */
			if (mp != NULL) {
				opackets--;
				obytes -= msgdsize(mp);
				/* reattach the rest of the chain */
				mp->b_next = next;
				break;
			}
			mp = next;
		}
		goto done;
	}

	/*
	 * No fastpath, we either have more than one MAC client
	 * defined on top of the same MAC, or one or more MAC
	 * client promiscuous callbacks.
	 */
	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);

	mp = mp_chain;
	while (mp != NULL) {
		flow_entry_t		*dst_flow_ent;
		void			*flow_cookie;
		size_t			pkt_size;
		mblk_t			*mp1;

		next = mp->b_next;
		mp->b_next = NULL;
		opackets++;
		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
		obytes += pkt_size;
		CHECK_VID_AND_ADD_TAG(mp);

		/*
		 * Check if there are promiscuous mode callbacks defined.
		 */
		if (mip->mi_promisc_list != NULL)
			mac_promisc_dispatch(mip, mp, src_mcip);

		/*
		 * Find the destination.
		 */
		dst_flow_ent = mac_tx_classify(mip, mp);

		if (dst_flow_ent != NULL) {
			size_t	hdrsize;
			int	err = 0;

			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
				struct ether_vlan_header *evhp =
				    (struct ether_vlan_header *)mp->b_rptr;

				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
					hdrsize = sizeof (*evhp);
				else
					hdrsize = sizeof (struct ether_header);
			} else {
				mac_header_info_t	mhi;

				err = mac_header_info((mac_handle_t)mip,
				    mp, &mhi);
				/* hdrsize only valid when err == 0 */
				if (err == 0)
					hdrsize = mhi.mhi_hdrsize;
			}

			/*
			 * Got a matching flow. It's either another
			 * MAC client, or a broadcast/multicast flow.
			 * Make sure the packet size is within the
			 * allowed size. If not drop the packet and
			 * move to next packet.
			 */
			if (err != 0 ||
			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
				oerrors++;
				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
				    mblk_t *, mp);
				freemsg(mp);
				mp = next;
				FLOW_REFRELE(dst_flow_ent);
				continue;
			}
			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
			if (flow_cookie != NULL) {
				/*
				 * The vnic_bcast_send function expects
				 * to receive the sender MAC client
				 * as value for arg2.
				 */
				mac_bcast_send(flow_cookie, src_mcip, mp,
				    B_TRUE);
			} else {
				/*
				 * loopback the packet to a
				 * local MAC client. We force a context
				 * switch if both source and destination
				 * MAC clients are used by IP, i.e. bypass
				 * is set.
				 */
				boolean_t do_switch;
				mac_client_impl_t *dst_mcip =
				    dst_flow_ent->fe_mcip;

				do_switch = ((src_mcip->mci_state_flags &
				    dst_mcip->mci_state_flags &
				    MCIS_CLIENT_POLL_CAPABLE) != 0);

				/* mac_fix_cksum may free mp on failure */
				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
					(dst_flow_ent->fe_cb_fn)(
					    dst_flow_ent->fe_cb_arg1,
					    dst_flow_ent->fe_cb_arg2,
					    mp1, do_switch);
				}
			}
			FLOW_REFRELE(dst_flow_ent);
		} else {
			/*
			 * Unknown destination, send via the underlying
			 * NIC.
			 */
			MAC_TX(mip, ring, mp,
			    ((src_mcip->mci_state_flags & MCIS_SHARE_BOUND) !=
			    0));
			if (mp != NULL) {
				/*
				 * Adjust for the last packet that
				 * could not be transmitted
				 */
				opackets--;
				obytes -= pkt_size;
				mp->b_next = next;
				break;
			}
		}
		mp = next;
	}

done:
	src_mcip->mci_stat_obytes += obytes;
	src_mcip->mci_stat_opackets += opackets;
	src_mcip->mci_stat_oerrors += oerrors;

	if (stats != NULL) {
		stats->ts_opackets = opackets;
		stats->ts_obytes = obytes;
		stats->ts_oerrors = oerrors;
	}
	return (mp);
}

/*
 * mac_tx_srs_ring_present
 *
 * Returns whether the specified ring is part of the specified SRS.
34598275SEric Cheng */ 34608275SEric Cheng boolean_t 34618275SEric Cheng mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 34628275SEric Cheng { 34638275SEric Cheng int i; 34648275SEric Cheng mac_soft_ring_t *soft_ring; 34658275SEric Cheng 34668275SEric Cheng if (srs->srs_tx.st_arg2 == tx_ring) 34678275SEric Cheng return (B_TRUE); 34688275SEric Cheng 34698275SEric Cheng for (i = 0; i < srs->srs_oth_ring_count; i++) { 34708275SEric Cheng soft_ring = srs->srs_oth_soft_rings[i]; 34718275SEric Cheng if (soft_ring->s_ring_tx_arg2 == tx_ring) 34728275SEric Cheng return (B_TRUE); 34738275SEric Cheng } 34748275SEric Cheng 34758275SEric Cheng return (B_FALSE); 34768275SEric Cheng } 34778275SEric Cheng 34788275SEric Cheng /* 34798275SEric Cheng * mac_tx_srs_wakeup 34808275SEric Cheng * 34818275SEric Cheng * Called when Tx desc become available. Wakeup the appropriate worker 34828275SEric Cheng * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 34838275SEric Cheng * state field. 34848275SEric Cheng */ 34858275SEric Cheng void 34868275SEric Cheng mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 34878275SEric Cheng { 34888275SEric Cheng int i; 34898275SEric Cheng mac_soft_ring_t *sringp; 34908275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 34918275SEric Cheng 34928275SEric Cheng mutex_enter(&mac_srs->srs_lock); 34938275SEric Cheng if (TX_SINGLE_RING_MODE(mac_srs)) { 34948275SEric Cheng if (srs_tx->st_arg2 == ring && 34958275SEric Cheng mac_srs->srs_state & SRS_TX_BLOCKED) { 34968275SEric Cheng mac_srs->srs_state &= ~SRS_TX_BLOCKED; 34978275SEric Cheng srs_tx->st_unblocked_cnt++; 34988275SEric Cheng cv_signal(&mac_srs->srs_async); 34998275SEric Cheng } 35008275SEric Cheng /* 35018275SEric Cheng * A wakeup can come before tx_srs_drain() could 35028275SEric Cheng * grab srs lock and set SRS_TX_BLOCKED. So 35038275SEric Cheng * always set woken_up flag when we come here. 
35048275SEric Cheng */ 35058275SEric Cheng srs_tx->st_woken_up = B_TRUE; 35068275SEric Cheng mutex_exit(&mac_srs->srs_lock); 35078275SEric Cheng return; 35088275SEric Cheng } 35098275SEric Cheng 35108275SEric Cheng /* If you are here, it is for FANOUT or BW_FANOUT case */ 35118275SEric Cheng ASSERT(TX_MULTI_RING_MODE(mac_srs)); 35128275SEric Cheng for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 35138275SEric Cheng sringp = mac_srs->srs_oth_soft_rings[i]; 35148275SEric Cheng mutex_enter(&sringp->s_ring_lock); 35158275SEric Cheng if (sringp->s_ring_tx_arg2 == ring) { 35168275SEric Cheng if (sringp->s_ring_state & S_RING_BLOCK) { 35178275SEric Cheng sringp->s_ring_state &= ~S_RING_BLOCK; 35188275SEric Cheng sringp->s_ring_unblocked_cnt++; 35198275SEric Cheng cv_signal(&sringp->s_ring_async); 35208275SEric Cheng } 35218275SEric Cheng sringp->s_ring_tx_woken_up = B_TRUE; 35228275SEric Cheng } 35238275SEric Cheng mutex_exit(&sringp->s_ring_lock); 35248275SEric Cheng } 35258275SEric Cheng mutex_exit(&mac_srs->srs_lock); 35268275SEric Cheng } 35278275SEric Cheng 35288275SEric Cheng /* 35298275SEric Cheng * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 35308275SEric Cheng * the blocked clients again. 35318275SEric Cheng */ 35328275SEric Cheng void 35338275SEric Cheng mac_tx_notify(mac_impl_t *mip) 35348275SEric Cheng { 35358275SEric Cheng i_mac_notify(mip, MAC_NOTE_TX); 35368275SEric Cheng } 35378275SEric Cheng 35388275SEric Cheng /* 35398275SEric Cheng * RX SOFTRING RELATED FUNCTIONS 35408275SEric Cheng * 35418275SEric Cheng * These functions really belong in mac_soft_ring.c and here for 35428275SEric Cheng * a short period. 35438275SEric Cheng */ 35448275SEric Cheng 35458275SEric Cheng #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 35468275SEric Cheng /* \ 35478275SEric Cheng * Enqueue our mblk chain. 
\ 35488275SEric Cheng */ \ 35498275SEric Cheng ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 35508275SEric Cheng \ 35518275SEric Cheng if ((ringp)->s_ring_last != NULL) \ 35528275SEric Cheng (ringp)->s_ring_last->b_next = (mp); \ 35538275SEric Cheng else \ 35548275SEric Cheng (ringp)->s_ring_first = (mp); \ 35558275SEric Cheng (ringp)->s_ring_last = (tail); \ 35568275SEric Cheng (ringp)->s_ring_count += (cnt); \ 35578275SEric Cheng ASSERT((ringp)->s_ring_count > 0); \ 35588275SEric Cheng if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 35598275SEric Cheng (ringp)->s_ring_size += sz; \ 35608275SEric Cheng } \ 35618275SEric Cheng } 35628275SEric Cheng 35638275SEric Cheng /* 35648275SEric Cheng * Default entry point to deliver a packet chain to a MAC client. 35658275SEric Cheng * If the MAC client has flows, do the classification with these 35668275SEric Cheng * flows as well. 35678275SEric Cheng */ 35688275SEric Cheng /* ARGSUSED */ 35698275SEric Cheng void 35708275SEric Cheng mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 35718275SEric Cheng mac_header_info_t *arg3) 35728275SEric Cheng { 35738275SEric Cheng mac_client_impl_t *mcip = arg1; 35748275SEric Cheng 35758275SEric Cheng if (mcip->mci_nvids == 1 && 35769109SVenu.Iyer@Sun.COM !(mcip->mci_state_flags & MCIS_STRIP_DISABLE)) { 35778275SEric Cheng /* 35788275SEric Cheng * If the client has exactly one VID associated with it 35798275SEric Cheng * and striping of VLAN header is not disabled, 35808275SEric Cheng * remove the VLAN tag from the packet before 35818275SEric Cheng * passing it on to the client's receive callback. 35828275SEric Cheng * Note that this needs to be done after we dispatch 35838275SEric Cheng * the packet to the promiscuous listeners of the 35848275SEric Cheng * client, since they expect to see the whole 35858275SEric Cheng * frame including the VLAN headers. 
35868275SEric Cheng */ 35878275SEric Cheng mp_chain = mac_strip_vlan_tag_chain(mp_chain); 35888275SEric Cheng } 35898275SEric Cheng 35908275SEric Cheng mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 35918275SEric Cheng } 35928275SEric Cheng 35938275SEric Cheng /* 35948275SEric Cheng * mac_rx_soft_ring_process 35958275SEric Cheng * 35968275SEric Cheng * process a chain for a given soft ring. The number of packets queued 35978275SEric Cheng * in the SRS and its associated soft rings (including this one) is 35988275SEric Cheng * very small (tracked by srs_poll_pkt_cnt), then allow the entering 35998275SEric Cheng * thread (interrupt or poll thread) to do inline processing. This 36008275SEric Cheng * helps keep the latency down under low load. 36018275SEric Cheng * 36028275SEric Cheng * The proc and arg for each mblk is already stored in the mblk in 36038275SEric Cheng * appropriate places. 36048275SEric Cheng */ 36058275SEric Cheng /* ARGSUSED */ 36068275SEric Cheng void 36078275SEric Cheng mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 36088275SEric Cheng mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 36098275SEric Cheng { 36108275SEric Cheng mac_direct_rx_t proc; 36118275SEric Cheng void *arg1; 36128275SEric Cheng mac_resource_handle_t arg2; 36138275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 36148275SEric Cheng 36158275SEric Cheng ASSERT(ringp != NULL); 36168275SEric Cheng ASSERT(mp_chain != NULL); 36178275SEric Cheng ASSERT(tail != NULL); 36188275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 36198275SEric Cheng 36208275SEric Cheng mutex_enter(&ringp->s_ring_lock); 36218275SEric Cheng ringp->s_ring_total_inpkt += cnt; 36228833SVenu.Iyer@Sun.COM if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 36238833SVenu.Iyer@Sun.COM !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { 36248275SEric Cheng /* If on processor or blanking on, then enqueue and return */ 36258275SEric Cheng if (ringp->s_ring_state & 
S_RING_BLANK || 36268275SEric Cheng ringp->s_ring_state & S_RING_PROC) { 36278275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 36288275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36298275SEric Cheng return; 36308275SEric Cheng } 36318275SEric Cheng proc = ringp->s_ring_rx_func; 36328275SEric Cheng arg1 = ringp->s_ring_rx_arg1; 36338275SEric Cheng arg2 = ringp->s_ring_rx_arg2; 36348275SEric Cheng /* 36358275SEric Cheng * See if anything is already queued. If we are the 36368275SEric Cheng * first packet, do inline processing else queue the 36378275SEric Cheng * packet and do the drain. 36388275SEric Cheng */ 36398275SEric Cheng if (ringp->s_ring_first == NULL) { 36408275SEric Cheng /* 36418275SEric Cheng * Fast-path, ok to process and nothing queued. 36428275SEric Cheng */ 36438275SEric Cheng ringp->s_ring_run = curthread; 36448275SEric Cheng ringp->s_ring_state |= (S_RING_PROC); 36458275SEric Cheng 36468275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36478275SEric Cheng 36488275SEric Cheng /* 36498275SEric Cheng * We are the chain of 1 packet so 36508275SEric Cheng * go through this fast path. 36518275SEric Cheng */ 36528275SEric Cheng ASSERT(mp_chain->b_next == NULL); 36538275SEric Cheng 36548275SEric Cheng (*proc)(arg1, arg2, mp_chain, NULL); 36558275SEric Cheng 36568275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 36578275SEric Cheng /* 36588275SEric Cheng * If we have a soft ring set which is doing 36598275SEric Cheng * bandwidth control, we need to decrement 36608275SEric Cheng * srs_size and count so it the SRS can have a 36618275SEric Cheng * accurate idea of what is the real data 36628275SEric Cheng * queued between SRS and its soft rings. We 36638275SEric Cheng * decrement the counters only when the packet 36648275SEric Cheng * gets processed by both SRS and the soft ring. 
36658275SEric Cheng */ 36668275SEric Cheng mutex_enter(&mac_srs->srs_lock); 36678275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 36688275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 36698275SEric Cheng mutex_exit(&mac_srs->srs_lock); 36708275SEric Cheng 36718275SEric Cheng mutex_enter(&ringp->s_ring_lock); 36728275SEric Cheng ringp->s_ring_run = NULL; 36738275SEric Cheng ringp->s_ring_state &= ~S_RING_PROC; 36748275SEric Cheng if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 36758275SEric Cheng cv_signal(&ringp->s_ring_client_cv); 36768275SEric Cheng 36778275SEric Cheng if ((ringp->s_ring_first == NULL) || 36788275SEric Cheng (ringp->s_ring_state & S_RING_BLANK)) { 36798275SEric Cheng /* 36808275SEric Cheng * We processed inline our packet and 36818275SEric Cheng * nothing new has arrived or our 36828275SEric Cheng * receiver doesn't want to receive 36838275SEric Cheng * any packets. We are done. 36848275SEric Cheng */ 36858275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36868275SEric Cheng return; 36878275SEric Cheng } 36888275SEric Cheng } else { 36898275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, 36908275SEric Cheng mp_chain, tail, cnt, sz); 36918275SEric Cheng } 36928275SEric Cheng 36938275SEric Cheng /* 36948275SEric Cheng * We are here because either we couldn't do inline 36958275SEric Cheng * processing (because something was already 36968275SEric Cheng * queued), or we had a chain of more than one 36978275SEric Cheng * packet, or something else arrived after we were 36988275SEric Cheng * done with inline processing. 
36998275SEric Cheng */ 37008275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 37018275SEric Cheng ASSERT(ringp->s_ring_first != NULL); 37028275SEric Cheng 37038275SEric Cheng ringp->s_ring_drain_func(ringp); 37048275SEric Cheng mutex_exit(&ringp->s_ring_lock); 37058275SEric Cheng return; 37068275SEric Cheng } else { 37078275SEric Cheng /* ST_RING_WORKER_ONLY case */ 37088275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 37098275SEric Cheng mac_soft_ring_worker_wakeup(ringp); 37108275SEric Cheng mutex_exit(&ringp->s_ring_lock); 37118275SEric Cheng } 37128275SEric Cheng } 37138275SEric Cheng 37148275SEric Cheng /* 37158275SEric Cheng * TX SOFTRING RELATED FUNCTIONS 37168275SEric Cheng * 37178275SEric Cheng * These functions really belong in mac_soft_ring.c and here for 37188275SEric Cheng * a short period. 37198275SEric Cheng */ 37208275SEric Cheng 37218275SEric Cheng #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 37228275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 37238275SEric Cheng ringp->s_ring_state |= S_RING_ENQUEUED; \ 37248275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 37258275SEric Cheng } 37268275SEric Cheng 37278275SEric Cheng /* 37288275SEric Cheng * mac_tx_sring_queued 37298275SEric Cheng * 37308275SEric Cheng * When we are out of transmit descriptors and we already have a 37318275SEric Cheng * queue that exceeds hiwat (or the client called us with 37328275SEric Cheng * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 37338275SEric Cheng * soft ring pointer as the opaque cookie for the client enable 37348275SEric Cheng * flow control. 
37358275SEric Cheng */ 37368275SEric Cheng static mac_tx_cookie_t 37378275SEric Cheng mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, 37388275SEric Cheng mblk_t **ret_mp) 37398275SEric Cheng { 37408275SEric Cheng int cnt; 37418275SEric Cheng size_t sz; 37428275SEric Cheng mblk_t *tail; 37438275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 37448275SEric Cheng mac_tx_cookie_t cookie = NULL; 37458275SEric Cheng boolean_t wakeup_worker = B_TRUE; 37468275SEric Cheng 37478275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 37488275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 37498275SEric Cheng if (flag & MAC_DROP_ON_NO_DESC) { 37508275SEric Cheng mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 37518275SEric Cheng /* increment freed stats */ 37528275SEric Cheng ringp->s_ring_drops += cnt; 37538275SEric Cheng cookie = (mac_tx_cookie_t)ringp; 37548275SEric Cheng } else { 37558275SEric Cheng if (ringp->s_ring_first != NULL) 37568275SEric Cheng wakeup_worker = B_FALSE; 37578275SEric Cheng 37588275SEric Cheng if (flag & MAC_TX_NO_ENQUEUE) { 37598275SEric Cheng /* 37608275SEric Cheng * If QUEUED is not set, queue the packet 37618275SEric Cheng * and let mac_tx_soft_ring_drain() set 37628275SEric Cheng * the TX_BLOCKED bit for the reasons 37638275SEric Cheng * explained above. Otherwise, return the 37648275SEric Cheng * mblks. 
37658275SEric Cheng */ 37668275SEric Cheng if (wakeup_worker) { 37678275SEric Cheng TX_SOFT_RING_ENQUEUE_CHAIN(ringp, 37688275SEric Cheng mp_chain, tail, cnt, sz); 37698275SEric Cheng } else { 37708275SEric Cheng ringp->s_ring_state |= S_RING_WAKEUP_CLIENT; 37718275SEric Cheng cookie = (mac_tx_cookie_t)ringp; 37728275SEric Cheng *ret_mp = mp_chain; 37738275SEric Cheng } 37748275SEric Cheng } else { 37758275SEric Cheng boolean_t enqueue = B_TRUE; 37768275SEric Cheng 37778275SEric Cheng if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 37788275SEric Cheng /* 37798275SEric Cheng * flow-controlled. Store ringp in cookie 37808275SEric Cheng * so that it can be returned as 37818275SEric Cheng * mac_tx_cookie_t to client 37828275SEric Cheng */ 37838275SEric Cheng ringp->s_ring_state |= S_RING_TX_HIWAT; 37848275SEric Cheng cookie = (mac_tx_cookie_t)ringp; 37858275SEric Cheng ringp->s_ring_hiwat_cnt++; 37868275SEric Cheng if (ringp->s_ring_count > 37878275SEric Cheng ringp->s_ring_tx_max_q_cnt) { 37888275SEric Cheng /* increment freed stats */ 37898275SEric Cheng ringp->s_ring_drops += cnt; 37908275SEric Cheng /* 37918275SEric Cheng * b_prev may be set to the fanout hint 37928275SEric Cheng * hence can't use freemsg directly 37938275SEric Cheng */ 37948275SEric Cheng mac_pkt_drop(NULL, NULL, 37958275SEric Cheng mp_chain, B_FALSE); 37968275SEric Cheng DTRACE_PROBE1(tx_queued_hiwat, 37978275SEric Cheng mac_soft_ring_t *, ringp); 37988275SEric Cheng enqueue = B_FALSE; 37998275SEric Cheng } 38008275SEric Cheng } 38018275SEric Cheng if (enqueue) { 38028275SEric Cheng TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, 38038275SEric Cheng tail, cnt, sz); 38048275SEric Cheng } 38058275SEric Cheng } 38068275SEric Cheng if (wakeup_worker) 38078275SEric Cheng cv_signal(&ringp->s_ring_async); 38088275SEric Cheng } 38098275SEric Cheng return (cookie); 38108275SEric Cheng } 38118275SEric Cheng 38128275SEric Cheng 38138275SEric Cheng /* 38148275SEric Cheng * mac_tx_soft_ring_process 
38158275SEric Cheng * 38168275SEric Cheng * This routine is called when fanning out outgoing traffic among 38178275SEric Cheng * multipe Tx rings. 38188275SEric Cheng * Note that a soft ring is associated with a h/w Tx ring. 38198275SEric Cheng */ 38208275SEric Cheng mac_tx_cookie_t 38218275SEric Cheng mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain, 38228275SEric Cheng uint16_t flag, mblk_t **ret_mp) 38238275SEric Cheng { 38248275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 38258275SEric Cheng int cnt; 38268275SEric Cheng size_t sz; 38278275SEric Cheng mblk_t *tail; 38288275SEric Cheng mac_tx_cookie_t cookie = NULL; 38298275SEric Cheng 38308275SEric Cheng ASSERT(ringp != NULL); 38318275SEric Cheng ASSERT(mp_chain != NULL); 38328275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 38338275SEric Cheng /* 38348275SEric Cheng * Only two modes can come here; either it can be 38358275SEric Cheng * SRS_TX_BW_FANOUT or SRS_TX_FANOUT 38368275SEric Cheng */ 38378275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT || 38388275SEric Cheng mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT); 38398275SEric Cheng 38408275SEric Cheng if (ringp->s_ring_type & ST_RING_WORKER_ONLY) { 38418275SEric Cheng /* Serialization mode */ 38428275SEric Cheng 38438275SEric Cheng mutex_enter(&ringp->s_ring_lock); 38448275SEric Cheng if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 38458275SEric Cheng cookie = mac_tx_sring_enqueue(ringp, mp_chain, 38468275SEric Cheng flag, ret_mp); 38478275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38488275SEric Cheng return (cookie); 38498275SEric Cheng } 38508275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 38518275SEric Cheng TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 38528275SEric Cheng if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) { 38538275SEric Cheng /* 38548275SEric Cheng * If ring is blocked due to lack of Tx 38558275SEric Cheng * descs, just return. 
Worker thread 38568275SEric Cheng * will get scheduled when Tx desc's 38578275SEric Cheng * become available. 38588275SEric Cheng */ 38598275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38608275SEric Cheng return (cookie); 38618275SEric Cheng } 38628275SEric Cheng mac_soft_ring_worker_wakeup(ringp); 38638275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38648275SEric Cheng return (cookie); 38658275SEric Cheng } else { 38668275SEric Cheng /* Default fanout mode */ 38678275SEric Cheng /* 38688275SEric Cheng * S_RING_BLOCKED is set when underlying NIC runs 38698275SEric Cheng * out of Tx descs and messages start getting 38708275SEric Cheng * queued. It won't get reset until 38718275SEric Cheng * tx_srs_drain() completely drains out the 38728275SEric Cheng * messages. 38738275SEric Cheng */ 38748275SEric Cheng boolean_t is_subflow; 38758275SEric Cheng mac_tx_stats_t stats; 38768275SEric Cheng 38778275SEric Cheng if (ringp->s_ring_state & S_RING_ENQUEUED) { 38788275SEric Cheng /* Tx descs/resources not available */ 38798275SEric Cheng mutex_enter(&ringp->s_ring_lock); 38808275SEric Cheng if (ringp->s_ring_state & S_RING_ENQUEUED) { 38818275SEric Cheng cookie = mac_tx_sring_enqueue(ringp, mp_chain, 38828275SEric Cheng flag, ret_mp); 38838275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38848275SEric Cheng return (cookie); 38858275SEric Cheng } 38868275SEric Cheng /* 38878275SEric Cheng * While we were computing mblk count, the 38888275SEric Cheng * flow control condition got relieved. 38898275SEric Cheng * Continue with the transmission. 38908275SEric Cheng */ 38918275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38928275SEric Cheng } 38938275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 38948275SEric Cheng 38958275SEric Cheng mp_chain = mac_tx_send(ringp->s_ring_tx_arg1, 38968275SEric Cheng ringp->s_ring_tx_arg2, mp_chain, 38978275SEric Cheng (is_subflow ? 
&stats : NULL)); 38988275SEric Cheng 38998275SEric Cheng /* 39008275SEric Cheng * Multiple threads could be here sending packets. 39018275SEric Cheng * Under such conditions, it is not possible to 39028275SEric Cheng * automically set S_RING_BLOCKED bit to indicate 39038275SEric Cheng * out of tx desc condition. To atomically set 39048275SEric Cheng * this, we queue the returned packet and do 39058275SEric Cheng * the setting of S_RING_BLOCKED in 39068275SEric Cheng * mac_tx_soft_ring_drain(). 39078275SEric Cheng */ 39088275SEric Cheng if (mp_chain != NULL) { 39098275SEric Cheng mutex_enter(&ringp->s_ring_lock); 39108275SEric Cheng cookie = 39118275SEric Cheng mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp); 39128275SEric Cheng mutex_exit(&ringp->s_ring_lock); 39138275SEric Cheng return (cookie); 39148275SEric Cheng } 39158275SEric Cheng if (is_subflow) { 39168275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 39178275SEric Cheng } 39188275SEric Cheng return (NULL); 39198275SEric Cheng } 39208275SEric Cheng } 3921