18275SEric Cheng /* 28275SEric Cheng * CDDL HEADER START 38275SEric Cheng * 48275SEric Cheng * The contents of this file are subject to the terms of the 58275SEric Cheng * Common Development and Distribution License (the "License"). 68275SEric Cheng * You may not use this file except in compliance with the License. 78275SEric Cheng * 88275SEric Cheng * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE 98275SEric Cheng * or http://www.opensolaris.org/os/licensing. 108275SEric Cheng * See the License for the specific language governing permissions 118275SEric Cheng * and limitations under the License. 128275SEric Cheng * 138275SEric Cheng * When distributing Covered Code, include this CDDL HEADER in each 148275SEric Cheng * file and include the License file at usr/src/OPENSOLARIS.LICENSE. 158275SEric Cheng * If applicable, add the following below this CDDL HEADER, with the 168275SEric Cheng * fields enclosed by brackets "[]" replaced with your own identifying 178275SEric Cheng * information: Portions Copyright [yyyy] [name of copyright owner] 188275SEric Cheng * 198275SEric Cheng * CDDL HEADER END 208275SEric Cheng */ 218275SEric Cheng /* 22*8833SVenu.Iyer@Sun.COM * Copyright 2009 Sun Microsystems, Inc. All rights reserved. 238275SEric Cheng * Use is subject to license terms. 
248275SEric Cheng */ 258275SEric Cheng 268275SEric Cheng #include <sys/types.h> 278275SEric Cheng #include <sys/callb.h> 288275SEric Cheng #include <sys/sdt.h> 298275SEric Cheng #include <sys/strsubr.h> 308275SEric Cheng #include <sys/strsun.h> 318275SEric Cheng #include <sys/vlan.h> 328275SEric Cheng #include <inet/ipsec_impl.h> 338275SEric Cheng #include <inet/ip_impl.h> 348275SEric Cheng #include <inet/sadb.h> 358275SEric Cheng #include <inet/ipsecesp.h> 368275SEric Cheng #include <inet/ipsecah.h> 378275SEric Cheng #include <inet/ip6.h> 388275SEric Cheng 398275SEric Cheng #include <sys/mac_impl.h> 408275SEric Cheng #include <sys/mac_client_impl.h> 418275SEric Cheng #include <sys/mac_client_priv.h> 428275SEric Cheng #include <sys/mac_soft_ring.h> 438275SEric Cheng #include <sys/mac_flow_impl.h> 448275SEric Cheng 458275SEric Cheng static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *, 468275SEric Cheng uintptr_t, uint16_t, mblk_t **); 478275SEric Cheng static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *, 488275SEric Cheng uintptr_t, uint16_t, mblk_t **); 498275SEric Cheng static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *, 508275SEric Cheng uintptr_t, uint16_t, mblk_t **); 518275SEric Cheng static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *, 528275SEric Cheng uintptr_t, uint16_t, mblk_t **); 538275SEric Cheng 548275SEric Cheng typedef struct mac_tx_mode_s { 558275SEric Cheng mac_tx_srs_mode_t mac_tx_mode; 568275SEric Cheng mac_tx_func_t mac_tx_func; 578275SEric Cheng } mac_tx_mode_t; 588275SEric Cheng 598275SEric Cheng /* 608275SEric Cheng * There are five modes of operation on the Tx side. These modes get set 618275SEric Cheng * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode, 628275SEric Cheng * none of the other modes are user configurable. 
They get selected by 638275SEric Cheng * the system depending upon whether the link (or flow) has multiple Tx 648275SEric Cheng * rings or a bandwidth configured, etc. 658275SEric Cheng */ 668275SEric Cheng mac_tx_mode_t mac_tx_mode_list[] = { 678275SEric Cheng {SRS_TX_DEFAULT, mac_tx_single_ring_mode}, 688275SEric Cheng {SRS_TX_SERIALIZE, mac_tx_serializer_mode}, 698275SEric Cheng {SRS_TX_FANOUT, mac_tx_fanout_mode}, 708275SEric Cheng {SRS_TX_BW, mac_tx_bw_mode}, 718275SEric Cheng {SRS_TX_BW_FANOUT, mac_tx_bw_mode} 728275SEric Cheng }; 738275SEric Cheng 748275SEric Cheng /* 758275SEric Cheng * Soft Ring Set (SRS) - The Run time code that deals with 768275SEric Cheng * dynamic polling from the hardware, bandwidth enforcement, 778275SEric Cheng * fanout etc. 788275SEric Cheng * 798275SEric Cheng * We try to use H/W classification on NIC and assign traffic for 808275SEric Cheng * a MAC address to a particular Rx ring or ring group. There is a 818275SEric Cheng * 1-1 mapping between a SRS and a Rx ring. The SRS dynamically 828275SEric Cheng * switches the underlying Rx ring between interrupt and 838275SEric Cheng * polling mode and enforces any specified B/W control. 848275SEric Cheng * 858275SEric Cheng * There is always a SRS created and tied to each H/W and S/W rule. 868275SEric Cheng * Whenever we create a H/W rule, we always add the the same rule to 878275SEric Cheng * S/W classifier and tie a SRS to it. 888275SEric Cheng * 898275SEric Cheng * In case a B/W control is specified, it is broken into bytes 908275SEric Cheng * per ticks and as soon as the quota for a tick is exhausted, 918275SEric Cheng * the underlying Rx ring is forced into poll mode for remainder of 928275SEric Cheng * the tick. The SRS poll thread only polls for bytes that are 938275SEric Cheng * allowed to come in the SRS. 
We typically let 4x the configured 948275SEric Cheng * B/W worth of packets to come in the SRS (to prevent unnecessary 958275SEric Cheng * drops due to bursts) but only process the specified amount. 968275SEric Cheng * 978275SEric Cheng * A MAC client (e.g. a VNIC or aggr) can have 1 or more 988275SEric Cheng * Rx rings (and corresponding SRSs) assigned to it. The SRS 998275SEric Cheng * in turn can have softrings to do protocol level fanout or 1008275SEric Cheng * softrings to do S/W based fanout or both. In case the NIC 1018275SEric Cheng * has no Rx rings, we do S/W classification to respective SRS. 1028275SEric Cheng * The S/W classification rule is always setup and ready. This 1038275SEric Cheng * allows the MAC layer to reassign Rx rings whenever needed 1048275SEric Cheng * but packets still continue to flow via the default path and 1058275SEric Cheng * getting S/W classified to correct SRS. 1068275SEric Cheng * 1078275SEric Cheng * The SRS's are used on both Tx and Rx side. They use the same 1088275SEric Cheng * data structure but the processing routines have slightly different 1098275SEric Cheng * semantics due to the fact that Rx side needs to do dynamic 1108275SEric Cheng * polling etc. 1118275SEric Cheng * 1128275SEric Cheng * Dynamic Polling Notes 1138275SEric Cheng * ===================== 1148275SEric Cheng * 1158275SEric Cheng * Each Soft ring set is capable of switching its Rx ring between 1168275SEric Cheng * interrupt and poll mode and actively 'polls' for packets in 1178275SEric Cheng * poll mode. If the SRS is implementing a B/W limit, it makes 1188275SEric Cheng * sure that only Max allowed packets are pulled in poll mode 1198275SEric Cheng * and goes to poll mode as soon as B/W limit is exceeded. As 1208275SEric Cheng * such, there are no overheads to implement B/W limits. 
1218275SEric Cheng * 1228275SEric Cheng * In poll mode, its better to keep the pipeline going where the 1238275SEric Cheng * SRS worker thread keeps processing packets and poll thread 1248275SEric Cheng * keeps bringing more packets (specially if they get to run 1258275SEric Cheng * on different CPUs). This also prevents the overheads associated 1268275SEric Cheng * by excessive signalling (on NUMA machines, this can be 1278275SEric Cheng * pretty devastating). The exception is latency optimized case 1288275SEric Cheng * where worker thread does no work and interrupt and poll thread 1298275SEric Cheng * are allowed to do their own drain. 1308275SEric Cheng * 1318275SEric Cheng * We use the following policy to control Dynamic Polling: 1328275SEric Cheng * 1) We switch to poll mode anytime the processing 1338275SEric Cheng * thread causes a backlog to build up in SRS and 1348275SEric Cheng * its associated Soft Rings (sr_poll_pkt_cnt > 0). 1358275SEric Cheng * 2) As long as the backlog stays under the low water 1368275SEric Cheng * mark (sr_lowat), we poll the H/W for more packets. 1378275SEric Cheng * 3) If the backlog (sr_poll_pkt_cnt) exceeds low 1388275SEric Cheng * water mark, we stay in poll mode but don't poll 1398275SEric Cheng * the H/W for more packets. 1408275SEric Cheng * 4) Anytime in polling mode, if we poll the H/W for 1418275SEric Cheng * packets and find nothing plus we have an existing 1428275SEric Cheng * backlog (sr_poll_pkt_cnt > 0), we stay in polling 1438275SEric Cheng * mode but don't poll the H/W for packets anymore 1448275SEric Cheng * (let the polling thread go to sleep). 1458275SEric Cheng * 5) Once the backlog is relived (packets are processed) 1468275SEric Cheng * we reenable polling (by signalling the poll thread) 1478275SEric Cheng * only when the backlog dips below sr_poll_thres. 
1488275SEric Cheng * 6) sr_hiwat is used exclusively when we are not 1498275SEric Cheng * polling capable and is used to decide when to 1508275SEric Cheng * drop packets so the SRS queue length doesn't grow 1518275SEric Cheng * infinitely. 1528275SEric Cheng * 1538275SEric Cheng * NOTE: Also see the block level comment on top of mac_soft_ring.c 1548275SEric Cheng */ 1558275SEric Cheng 1568275SEric Cheng /* 1578275SEric Cheng * mac_latency_optimize 1588275SEric Cheng * 1598275SEric Cheng * Controls whether the poll thread can process the packets inline 1608275SEric Cheng * or let the SRS worker thread do the processing. This applies if 1618275SEric Cheng * the SRS was not being processed. For latency sensitive traffic, 1628275SEric Cheng * this needs to be true to allow inline processing. For throughput 1638275SEric Cheng * under load, this should be false. 1648275SEric Cheng * 1658275SEric Cheng * This (and other similar) tunable should be rolled into a link 1668275SEric Cheng * or flow specific workload hint that can be set using dladm 1678275SEric Cheng * linkprop (instead of multiple such tunables). 1688275SEric Cheng */ 1698275SEric Cheng boolean_t mac_latency_optimize = B_TRUE; 1708275SEric Cheng 1718275SEric Cheng /* 1728275SEric Cheng * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN 1738275SEric Cheng * 1748275SEric Cheng * queue a mp or chain in soft ring set and increment the 1758275SEric Cheng * local count (srs_count) for the SRS and the shared counter 1768275SEric Cheng * (srs_poll_pkt_cnt - shared between SRS and its soft rings 1778275SEric Cheng * to track the total unprocessed packets for polling to work 1788275SEric Cheng * correctly). 1798275SEric Cheng * 1808275SEric Cheng * The size (total bytes queued) counters are incremented only 1818275SEric Cheng * if we are doing B/W control. 
1828275SEric Cheng */ 1838275SEric Cheng #define MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1848275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 1858275SEric Cheng if ((mac_srs)->srs_last != NULL) \ 1868275SEric Cheng (mac_srs)->srs_last->b_next = (head); \ 1878275SEric Cheng else \ 1888275SEric Cheng (mac_srs)->srs_first = (head); \ 1898275SEric Cheng (mac_srs)->srs_last = (tail); \ 1908275SEric Cheng (mac_srs)->srs_count += count; \ 1918275SEric Cheng } 1928275SEric Cheng 1938275SEric Cheng #define MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 1948275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 1958275SEric Cheng \ 1968275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 1978275SEric Cheng srs_rx->sr_poll_pkt_cnt += count; \ 1988275SEric Cheng ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \ 1998275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2008275SEric Cheng (mac_srs)->srs_size += (sz); \ 2018275SEric Cheng mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \ 2028275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2038275SEric Cheng mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \ 2048275SEric Cheng } \ 2058275SEric Cheng } 2068275SEric Cheng 2078275SEric Cheng #define MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \ 2088275SEric Cheng mac_srs->srs_state |= SRS_ENQUEUED; \ 2098275SEric Cheng MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \ 2108275SEric Cheng if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \ 2118275SEric Cheng (mac_srs)->srs_size += (sz); \ 2128275SEric Cheng (mac_srs)->srs_bw->mac_bw_sz += (sz); \ 2138275SEric Cheng } \ 2148275SEric Cheng } 2158275SEric Cheng 2168275SEric Cheng /* 2178275SEric Cheng * Turn polling on routines 2188275SEric Cheng */ 2198275SEric Cheng #define MAC_SRS_POLLING_ON(mac_srs) { \ 2208275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2218275SEric Cheng if (((mac_srs)->srs_state & \ 2228275SEric Cheng 
(SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \ 2238275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2248275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2258275SEric Cheng (mac_srs)->srs_ring); \ 2268275SEric Cheng (mac_srs)->srs_rx.sr_poll_on++; \ 2278275SEric Cheng } \ 2288275SEric Cheng } 2298275SEric Cheng 2308275SEric Cheng #define MAC_SRS_WORKER_POLLING_ON(mac_srs) { \ 2318275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2328275SEric Cheng if (((mac_srs)->srs_state & \ 2338275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \ 2348275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER)) { \ 2358275SEric Cheng (mac_srs)->srs_state |= SRS_POLLING; \ 2368275SEric Cheng (void) mac_hwring_disable_intr((mac_ring_handle_t) \ 2378275SEric Cheng (mac_srs)->srs_ring); \ 2388275SEric Cheng (mac_srs)->srs_rx.sr_worker_poll_on++; \ 2398275SEric Cheng } \ 2408275SEric Cheng } 2418275SEric Cheng 2428275SEric Cheng /* 2438275SEric Cheng * MAC_SRS_POLL_RING 2448275SEric Cheng * 2458275SEric Cheng * Signal the SRS poll thread to poll the underlying H/W ring 2468275SEric Cheng * provided it wasn't already polling (SRS_GET_PKTS was set). 2478275SEric Cheng * 2488275SEric Cheng * Poll thread gets to run only from mac_rx_srs_drain() and only 2498275SEric Cheng * if the drain was being done by the worker thread. 
2508275SEric Cheng */ 2518275SEric Cheng #define MAC_SRS_POLL_RING(mac_srs) { \ 2528275SEric Cheng mac_srs_rx_t *srs_rx = &(mac_srs)->srs_rx; \ 2538275SEric Cheng \ 2548275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2558275SEric Cheng srs_rx->sr_poll_thr_sig++; \ 2568275SEric Cheng if (((mac_srs)->srs_state & \ 2578275SEric Cheng (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \ 2588275SEric Cheng (SRS_WORKER|SRS_POLLING_CAPAB)) { \ 2598275SEric Cheng (mac_srs)->srs_state |= SRS_GET_PKTS; \ 2608275SEric Cheng cv_signal(&(mac_srs)->srs_cv); \ 2618275SEric Cheng } else { \ 2628275SEric Cheng srs_rx->sr_poll_thr_busy++; \ 2638275SEric Cheng } \ 2648275SEric Cheng } 2658275SEric Cheng 2668275SEric Cheng /* 2678275SEric Cheng * MAC_SRS_CHECK_BW_CONTROL 2688275SEric Cheng * 2698275SEric Cheng * Check to see if next tick has started so we can reset the 2708275SEric Cheng * SRS_BW_ENFORCED flag and allow more packets to come in the 2718275SEric Cheng * system. 2728275SEric Cheng */ 2738275SEric Cheng #define MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \ 2748275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2758275SEric Cheng ASSERT(((mac_srs)->srs_type & SRST_TX) || \ 2768275SEric Cheng MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \ 2778275SEric Cheng if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \ 2788275SEric Cheng (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \ 2798275SEric Cheng (mac_srs)->srs_bw->mac_bw_used = 0; \ 2808275SEric Cheng if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \ 2818275SEric Cheng (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \ 2828275SEric Cheng } \ 2838275SEric Cheng } 2848275SEric Cheng 2858275SEric Cheng /* 2868275SEric Cheng * MAC_SRS_WORKER_WAKEUP 2878275SEric Cheng * 2888275SEric Cheng * Wake up the SRS worker thread to process the queue as long as 2898275SEric Cheng * no one else is processing the queue. 
If we are optimizing for 2908275SEric Cheng * latency, we wake up the worker thread immediately or else we 2918275SEric Cheng * wait mac_srs_worker_wakeup_ticks before worker thread gets 2928275SEric Cheng * woken up. 2938275SEric Cheng */ 2948275SEric Cheng int mac_srs_worker_wakeup_ticks = 0; 2958275SEric Cheng #define MAC_SRS_WORKER_WAKEUP(mac_srs) { \ 2968275SEric Cheng ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \ 2978275SEric Cheng if (!((mac_srs)->srs_state & SRS_PROC) && \ 2988275SEric Cheng (mac_srs)->srs_tid == NULL) { \ 2998275SEric Cheng if (mac_latency_optimize || \ 3008275SEric Cheng (mac_srs_worker_wakeup_ticks == 0)) \ 3018275SEric Cheng cv_signal(&(mac_srs)->srs_async); \ 3028275SEric Cheng else \ 3038275SEric Cheng (mac_srs)->srs_tid = \ 3048275SEric Cheng timeout(mac_srs_fire, (mac_srs), \ 3058275SEric Cheng mac_srs_worker_wakeup_ticks); \ 3068275SEric Cheng } \ 3078275SEric Cheng } 3088275SEric Cheng 3098275SEric Cheng #define TX_SINGLE_RING_MODE(mac_srs) \ 3108275SEric Cheng ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \ 3118275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \ 3128275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_BW) 3138275SEric Cheng 3148275SEric Cheng #define TX_BANDWIDTH_MODE(mac_srs) \ 3158275SEric Cheng ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \ 3168275SEric Cheng (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT) 3178275SEric Cheng 3188275SEric Cheng #define TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \ 3198275SEric Cheng uint_t hash, indx; \ 3208275SEric Cheng hash = HASH_HINT(hint); \ 3218275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \ 3228275SEric Cheng softring = mac_srs->srs_oth_soft_rings[indx]; \ 3238275SEric Cheng (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \ 3248275SEric Cheng } 3258275SEric Cheng 3268275SEric Cheng /* 3278275SEric Cheng * MAC_TX_SRS_BLOCK 3288275SEric Cheng * 3298275SEric Cheng * Always called from mac_tx_srs_drain() function. 
SRS_TX_BLOCKED 3308275SEric Cheng * will be set only if srs_tx_woken_up is FALSE. If 3318275SEric Cheng * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived 3328275SEric Cheng * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to 3338275SEric Cheng * attempt to transmit again and not setting SRS_TX_BLOCKED does 3348275SEric Cheng * that. 3358275SEric Cheng */ 3368275SEric Cheng #define MAC_TX_SRS_BLOCK(srs, mp) { \ 3378275SEric Cheng ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \ 3388275SEric Cheng if ((srs)->srs_tx.st_woken_up) { \ 3398275SEric Cheng (srs)->srs_tx.st_woken_up = B_FALSE; \ 3408275SEric Cheng } else { \ 3418275SEric Cheng ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \ 3428275SEric Cheng (srs)->srs_state |= SRS_TX_BLOCKED; \ 3438275SEric Cheng (srs)->srs_tx.st_blocked_cnt++; \ 3448275SEric Cheng } \ 3458275SEric Cheng } 3468275SEric Cheng 3478275SEric Cheng /* 3488275SEric Cheng * MAC_TX_SRS_TEST_HIWAT 3498275SEric Cheng * 3508275SEric Cheng * Called before queueing a packet onto Tx SRS to test and set 3518275SEric Cheng * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat. 3528275SEric Cheng */ 3538275SEric Cheng #define MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \ 3548275SEric Cheng boolean_t enqueue = 1; \ 3558275SEric Cheng \ 3568275SEric Cheng if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \ 3578275SEric Cheng /* \ 3588275SEric Cheng * flow-controlled. 
Store srs in cookie so that it \ 3598275SEric Cheng * can be returned as mac_tx_cookie_t to client \ 3608275SEric Cheng */ \ 3618275SEric Cheng (srs)->srs_state |= SRS_TX_HIWAT; \ 3628275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 3638275SEric Cheng (srs)->srs_tx.st_hiwat_cnt++; \ 3648275SEric Cheng if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \ 3658275SEric Cheng /* increment freed stats */ \ 3668275SEric Cheng (srs)->srs_tx.st_drop_count += cnt; \ 3678275SEric Cheng /* \ 3688275SEric Cheng * b_prev may be set to the fanout hint \ 3698275SEric Cheng * hence can't use freemsg directly \ 3708275SEric Cheng */ \ 3718275SEric Cheng mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); \ 3728275SEric Cheng DTRACE_PROBE1(tx_queued_hiwat, \ 3738275SEric Cheng mac_soft_ring_set_t *, srs); \ 3748275SEric Cheng enqueue = 0; \ 3758275SEric Cheng } \ 3768275SEric Cheng } \ 3778275SEric Cheng if (enqueue) \ 3788275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \ 3798275SEric Cheng } 3808275SEric Cheng 3818275SEric Cheng /* Some utility macros */ 3828275SEric Cheng #define MAC_SRS_BW_LOCK(srs) \ 3838275SEric Cheng if (!(srs->srs_type & SRST_TX)) \ 3848275SEric Cheng mutex_enter(&srs->srs_bw->mac_bw_lock); 3858275SEric Cheng 3868275SEric Cheng #define MAC_SRS_BW_UNLOCK(srs) \ 3878275SEric Cheng if (!(srs->srs_type & SRST_TX)) \ 3888275SEric Cheng mutex_exit(&srs->srs_bw->mac_bw_lock); 3898275SEric Cheng 3908275SEric Cheng #define MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \ 3918275SEric Cheng mac_pkt_drop(NULL, NULL, mp, B_FALSE); \ 3928275SEric Cheng /* increment freed stats */ \ 3938275SEric Cheng mac_srs->srs_tx.st_drop_count++; \ 3948275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 3958275SEric Cheng } 3968275SEric Cheng 3978275SEric Cheng #define MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \ 3988275SEric Cheng mac_srs->srs_state |= SRS_TX_WAKEUP_CLIENT; \ 3998275SEric Cheng cookie = (mac_tx_cookie_t)srs; \ 4008275SEric Cheng *ret_mp = mp_chain; \ 
4018275SEric Cheng } 4028275SEric Cheng 4038275SEric Cheng /* 4048275SEric Cheng * Drop the rx packet and advance to the next one in the chain. 4058275SEric Cheng */ 4068275SEric Cheng static void 4078275SEric Cheng mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp) 4088275SEric Cheng { 4098275SEric Cheng mac_srs_rx_t *srs_rx = &srs->srs_rx; 4108275SEric Cheng 4118275SEric Cheng ASSERT(mp->b_next == NULL); 4128275SEric Cheng mutex_enter(&srs->srs_lock); 4138275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1); 4148275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp)); 4158275SEric Cheng mutex_exit(&srs->srs_lock); 4168275SEric Cheng 4178275SEric Cheng srs_rx->sr_drop_count++; 4188275SEric Cheng freemsg(mp); 4198275SEric Cheng } 4208275SEric Cheng 4218275SEric Cheng /* DATAPATH RUNTIME ROUTINES */ 4228275SEric Cheng 4238275SEric Cheng /* 4248275SEric Cheng * mac_srs_fire 4258275SEric Cheng * 4268275SEric Cheng * Timer callback routine for waking up the SRS worker thread. 4278275SEric Cheng */ 4288275SEric Cheng static void 4298275SEric Cheng mac_srs_fire(void *arg) 4308275SEric Cheng { 4318275SEric Cheng mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg; 4328275SEric Cheng 4338275SEric Cheng mutex_enter(&mac_srs->srs_lock); 4348275SEric Cheng if (mac_srs->srs_tid == 0) { 4358275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4368275SEric Cheng return; 4378275SEric Cheng } 4388275SEric Cheng 4398275SEric Cheng mac_srs->srs_tid = 0; 4408275SEric Cheng if (!(mac_srs->srs_state & SRS_PROC)) 4418275SEric Cheng cv_signal(&mac_srs->srs_async); 4428275SEric Cheng 4438275SEric Cheng mutex_exit(&mac_srs->srs_lock); 4448275SEric Cheng } 4458275SEric Cheng 4468275SEric Cheng /* 4478275SEric Cheng * 'hint' is fanout_hint (type of uint64_t) which is given by the TCP/IP stack, 4488275SEric Cheng * and it is used on the TX path. 
 */
/* Mix the fanout hint so low-entropy hints still spread across rings. */
#define	HASH_HINT(hint)	(((hint) << 17) | ((hint) >> 16))

/*
 * hash based on the src address and the port information.
 */
#define	HASH_ADDR(src, ports)					\
	(ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^	\
	((ports) >> 8) ^ (ports))

/* Map a hash value onto one of 'sz' soft rings. */
#define	COMPUTE_INDEX(key, sz)	(key % sz)

/*
 * Append 'mp' to the (head, tail) sub-chain being built for one soft
 * ring, bumping the packet count and -- only when bandwidth control is
 * in effect (bw_ctl) -- the byte count 'sz' by this packet's size 'sz0'.
 */
#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) {	\
	if ((tail) != NULL) {						\
		ASSERT((tail)->b_next == NULL);				\
		(tail)->b_next = (mp);					\
	} else {							\
		ASSERT((head) == NULL);					\
		(head) = (mp);						\
	}								\
	(tail) = (mp);							\
	(cnt)++;							\
	if ((bw_ctl))							\
		(sz) += (sz0);						\
}

#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
/* Tunable: fanout policy, MAC_FANOUT_DEFAULT (hash) or round-robin. */
int mac_fanout_type = MAC_FANOUT_DEFAULT;

#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
	V4_TCP = 0,
	V4_UDP,
	OTH,
	UNDEF
};

/*
 * In general we do port based hashing to spread traffic over different
 * softrings. The below tunable allows to override that behavior. Setting it
 * to B_TRUE allows to do a fanout based on src ipv6 address. This behavior
 * is also applicable to ipv6 packets carrying multiple optional headers
 * and other uncommon packet types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;

/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4

/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined to an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub chains
 * destined into TCP, UDP or OTH soft ring. Instead of entering
 * the soft ring one packet at a time, we want to enter it in the form of a
 * chain otherwise we get this start/stop behaviour where the worker thread
 * goes to sleep and then next packets comes in forcing it to wake up etc.
 *
 * 'head' is consumed: every packet is either handed to a soft ring or
 * dropped via mac_rx_drop_pkt().
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	size_t				hdrsize;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES];	/* per-type sub-chain heads */
	mblk_t				*tailmp[MAX_SR_TYPES];	/* per-type sub-chain tails */
	int				cnt[MAX_SR_TYPES];	/* per-type packet counts */
	size_t				sz[MAX_SR_TYPES];	/* per-type byte counts (bw_ctl only) */
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and its a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);

	bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * sizeof (int));
	bzero(sz, MAX_SR_TYPES * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		/* Fast-path size: single-mblk packets avoid msgdsize(). */
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			/* Multicast/broadcast have the group bit set. */
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			/* Non-Ethernet media: let the plugin parse the header. */
			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		/* No DLS bypass: everything funnels through the OTH ring. */
		if (!dls_bypass) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 *
		 * In the normal case the packet will have at least the L2
		 * header and the IP + Transport header in the same mblk.
		 * This is usually the case when the NIC driver sends up
		 * the packet. This is also true when the stack generates
		 * a packet that is looped back and when the stack uses the
		 * fastpath mechanism. The normal case is optimized for
		 * performance and may bypass DLS. All other cases go through
		 * the 'OTH' type path without DLS bypass.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
			type = OTH;

		if (type == OTH) {
			FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
			    cnt[type], bw_ctl, sz[type], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);
		/*
		 * We look for at least 4 bytes past the IP header to get
		 * the port information. If we get an IP fragment, we don't
		 * have the port information, and we use just the protocol
		 * information.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			type = V4_TCP;
			/* Strip the MAC header; TCP squeue expects IP first. */
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
			type = V4_UDP;
			/* Strip the MAC header; UDP path expects IP first. */
			mp->b_rptr += hdrsize;
			break;
		default:
			type = OTH;
			break;
		}

		FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
		    bw_ctl, sz[type], sz1, mp);
	}

	/* Hand each non-empty sub-chain to its soft ring in one call. */
	for (type = V4_TCP; type < UNDEF; type++) {
		if (headmp[type] != NULL) {
			mac_soft_ring_t			*softring;

			ASSERT(tailmp[type]->b_next == NULL);
			switch (type) {
			case V4_TCP:
				softring = mac_srs->srs_tcp_soft_rings[0];
				break;
			case V4_UDP:
				softring = mac_srs->srs_udp_soft_rings[0];
				break;
			case OTH:
				/* Last arm of the switch; no break needed. */
				softring = mac_srs->srs_oth_soft_rings[0];
			}
			mac_rx_soft_ring_process(mcip, softring,
			    headmp[type], tailmp[type], cnt[type], sz[type]);
		}
	}
}

/* Count of packets whose headers straddled mblks (diagnostic tunable). */
int fanout_unalligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for IPv6
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
727*8833SVenu.Iyer@Sun.COM uint32_t sap, size_t hdrsize, enum pkt_type *type, uint_t *indx) 7288275SEric Cheng { 7298275SEric Cheng ip6_t *ip6h; 7308275SEric Cheng uint8_t *whereptr; 7318275SEric Cheng uint_t hash; 7328275SEric Cheng uint16_t remlen; 7338275SEric Cheng uint8_t nexthdr; 7348275SEric Cheng uint16_t hdr_len; 7358275SEric Cheng 736*8833SVenu.Iyer@Sun.COM if (sap == ETHERTYPE_IPV6) { 7378275SEric Cheng boolean_t modifiable = B_TRUE; 7388275SEric Cheng 739*8833SVenu.Iyer@Sun.COM ASSERT(MBLKL(mp) >= hdrsize); 740*8833SVenu.Iyer@Sun.COM 741*8833SVenu.Iyer@Sun.COM ip6h = (ip6_t *)(mp->b_rptr + hdrsize); 7428275SEric Cheng if ((unsigned char *)ip6h == mp->b_wptr) { 7438275SEric Cheng /* 744*8833SVenu.Iyer@Sun.COM * The first mblk_t only includes the mac header. 7458275SEric Cheng * Note that it is safe to change the mp pointer here, 7468275SEric Cheng * as the subsequent operation does not assume mp 747*8833SVenu.Iyer@Sun.COM * points to the start of the mac header. 7488275SEric Cheng */ 7498275SEric Cheng mp = mp->b_cont; 7508275SEric Cheng 7518275SEric Cheng /* 7528275SEric Cheng * Make sure ip6h holds the full ip6_t structure. 
7538275SEric Cheng */ 7548275SEric Cheng if (mp == NULL) 7558275SEric Cheng return (-1); 7568275SEric Cheng 7578275SEric Cheng if (MBLKL(mp) < IPV6_HDR_LEN) { 7588275SEric Cheng modifiable = (DB_REF(mp) == 1); 7598275SEric Cheng 7608275SEric Cheng if (modifiable && 7618275SEric Cheng !pullupmsg(mp, IPV6_HDR_LEN)) { 7628275SEric Cheng return (-1); 7638275SEric Cheng } 7648275SEric Cheng } 7658275SEric Cheng 7668275SEric Cheng ip6h = (ip6_t *)mp->b_rptr; 7678275SEric Cheng } 7688275SEric Cheng 7698275SEric Cheng if (!modifiable || !(OK_32PTR((char *)ip6h)) || 7708275SEric Cheng ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) { 7718275SEric Cheng /* 7728275SEric Cheng * If either ip6h is not alligned, or ip6h does not 7738275SEric Cheng * hold the complete ip6_t structure (a pullupmsg() 7748275SEric Cheng * is not an option since it would result in an 7758275SEric Cheng * unalligned ip6h), fanout to the default ring. Note 7768275SEric Cheng * that this may cause packets reordering. 7778275SEric Cheng */ 7788275SEric Cheng *indx = 0; 7798275SEric Cheng *type = OTH; 7808275SEric Cheng fanout_unalligned++; 7818275SEric Cheng return (0); 7828275SEric Cheng } 7838275SEric Cheng 7848275SEric Cheng remlen = ntohs(ip6h->ip6_plen); 7858275SEric Cheng nexthdr = ip6h->ip6_nxt; 7868275SEric Cheng 7878275SEric Cheng if (remlen < MIN_EHDR_LEN) 7888275SEric Cheng return (-1); 7898275SEric Cheng /* 7908275SEric Cheng * Do src based fanout if below tunable is set to B_TRUE or 7918275SEric Cheng * when mac_ip_hdr_length_v6() fails because of malformed 7928275SEric Cheng * packets or because mblk's need to be concatenated using 7938275SEric Cheng * pullupmsg(). 
7948275SEric Cheng */ 7958275SEric Cheng if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h, 7968275SEric Cheng &hdr_len, &nexthdr)) { 7978275SEric Cheng goto src_based_fanout; 7988275SEric Cheng } 7998275SEric Cheng whereptr = (uint8_t *)ip6h + hdr_len; 8008275SEric Cheng 8018275SEric Cheng /* If the transport is one of below, we do port based fanout */ 8028275SEric Cheng switch (nexthdr) { 8038275SEric Cheng case IPPROTO_TCP: 8048275SEric Cheng case IPPROTO_UDP: 8058275SEric Cheng case IPPROTO_SCTP: 8068275SEric Cheng case IPPROTO_ESP: 8078275SEric Cheng /* 8088275SEric Cheng * If the ports in the transport header is not part of 8098275SEric Cheng * the mblk, do src_based_fanout, instead of calling 8108275SEric Cheng * pullupmsg(). 8118275SEric Cheng */ 8128275SEric Cheng if (mp->b_cont != NULL && 8138275SEric Cheng whereptr + PORTS_SIZE > mp->b_wptr) { 8148275SEric Cheng goto src_based_fanout; 8158275SEric Cheng } 8168275SEric Cheng break; 8178275SEric Cheng default: 8188275SEric Cheng break; 8198275SEric Cheng } 8208275SEric Cheng 8218275SEric Cheng switch (nexthdr) { 8228275SEric Cheng case IPPROTO_TCP: 8238275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 8248275SEric Cheng *(uint32_t *)whereptr); 8258275SEric Cheng *indx = COMPUTE_INDEX(hash, 8268275SEric Cheng mac_srs->srs_tcp_ring_count); 8278275SEric Cheng *type = OTH; 8288275SEric Cheng break; 8298275SEric Cheng 8308275SEric Cheng case IPPROTO_UDP: 8318275SEric Cheng case IPPROTO_SCTP: 8328275SEric Cheng case IPPROTO_ESP: 8338275SEric Cheng if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 8348275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), 8358275SEric Cheng *(uint32_t *)whereptr); 8368275SEric Cheng *indx = COMPUTE_INDEX(hash, 8378275SEric Cheng mac_srs->srs_udp_ring_count); 8388275SEric Cheng } else { 8398275SEric Cheng *indx = mac_srs->srs_ind % 8408275SEric Cheng mac_srs->srs_udp_ring_count; 8418275SEric Cheng mac_srs->srs_ind++; 8428275SEric Cheng } 8438275SEric Cheng 
*type = OTH; 8448275SEric Cheng break; 8458275SEric Cheng 8468275SEric Cheng /* For all other protocol, do source based fanout */ 8478275SEric Cheng default: 8488275SEric Cheng goto src_based_fanout; 8498275SEric Cheng } 8508275SEric Cheng } else { 8518275SEric Cheng *indx = 0; 8528275SEric Cheng *type = OTH; 8538275SEric Cheng } 8548275SEric Cheng return (0); 8558275SEric Cheng 8568275SEric Cheng src_based_fanout: 8578275SEric Cheng hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0); 8588275SEric Cheng *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 8598275SEric Cheng *type = OTH; 8608275SEric Cheng return (0); 8618275SEric Cheng } 8628275SEric Cheng 8638275SEric Cheng /* 8648275SEric Cheng * mac_rx_srs_fanout 8658275SEric Cheng * 8668275SEric Cheng * This routine delivers packets destined to an SRS into a soft ring member 8678275SEric Cheng * of the set. 8688275SEric Cheng * 8698275SEric Cheng * Given a chain of packets we need to split it up into multiple sub chains 8708275SEric Cheng * destined for one of the TCP, UDP or OTH soft rings. Instead of entering 8718275SEric Cheng * the soft ring one packet at a time, we want to enter it in the form of a 8728275SEric Cheng * chain otherwise we get this start/stop behaviour where the worker thread 8738275SEric Cheng * goes to sleep and then next packets comes in forcing it to wake up etc. 8748275SEric Cheng * 8758275SEric Cheng * Note: 8768275SEric Cheng * Since we know what is the maximum fanout possible, we create a 2D array 8778275SEric Cheng * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz 8788275SEric Cheng * variables so that we can enter the softrings with chain. We need the 8798275SEric Cheng * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc 8808275SEric Cheng * for each packet would be expensive). 
If we ever want to have the
 * ability to have unlimited fanout, we should probably declare a head,
 * tail, cnt, sz with each soft ring (a data struct which contains a softring
 * along with these members) and create an array of this uber struct so we
 * don't have to do kmem_alloc.
 */

/*
 * Debug/observability counters: record why packets were demoted to the
 * OTH (catch-all) soft ring instead of the TCP/UDP fast paths.
 */
int fanout_oth1 = 0;
int fanout_oth2 = 0;
int fanout_oth3 = 0;
int fanout_oth4 = 0;
int fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
	struct ether_header		*ehp;
	struct ether_vlan_header	*evhp;
	uint32_t			sap;
	ipha_t				*ipha;
	uint8_t				*dstaddr;
	uint_t				indx;
	size_t				ports_offset;
	size_t				ipha_len;
	size_t				hdrsize;
	uint_t				hash;
	mblk_t				*mp;
	mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
	int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
	size_t				sz1;
	boolean_t			bw_ctl;
	boolean_t			hw_classified;
	boolean_t			dls_bypass;
	boolean_t			is_ether;
	boolean_t			is_unicast;
	int				fanout_cnt;
	enum pkt_type			type;
	mac_client_impl_t		*mcip = mac_srs->srs_mcip;

	is_ether = (mcip->mci_mip->mi_info.mi_nativemedia == DL_ETHER);
	bw_ctl = ((mac_srs->srs_type & SRST_BW_CONTROL) != 0);

	/*
	 * If we don't have a Rx ring, S/W classification would have done
	 * its job and its a packet meant for us. If we were polling on
	 * the default ring (i.e. there was a ring assigned to this SRS),
	 * then we need to make sure that the mac address really belongs
	 * to us.
	 */
	hw_classified = mac_srs->srs_ring != NULL &&
	    mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

	/*
	 * Special clients (eg. VLAN, non ether, etc) need DLS
	 * processing in the Rx path. SRST_DLS_BYPASS will be clear for
	 * such SRSs.
	 */
	dls_bypass = ((mac_srs->srs_type & SRST_DLS_BYPASS) != 0);

	/*
	 * Since the softrings are never destroyed and we always
	 * create equal number of softrings for TCP, UDP and rest,
	 * its OK to check one of them for count and use it without
	 * any lock. In future, if soft rings get destroyed because
	 * of reduction in fanout, we will need to ensure that happens
	 * behind the SRS_PROC.
	 */
	fanout_cnt = mac_srs->srs_tcp_ring_count;

	/* Per-type/per-ring sub-chain accumulators, built on the stack. */
	bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
	bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
	bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));

	/*
	 * We got a chain from SRS that we need to send to the soft rings.
	 * Since squeues for TCP & IPv4 sap poll their soft rings (for
	 * performance reasons), we need to separate out v4_tcp, v4_udp
	 * and the rest goes in other.
	 */
	while (head != NULL) {
		mp = head;
		head = head->b_next;
		mp->b_next = NULL;

		type = OTH;
		sz1 = (mp->b_cont == NULL) ? MBLKL(mp) : msgdsize(mp);

		if (is_ether) {
			/*
			 * At this point we can be sure the packet at least
			 * has an ether header.
			 */
			if (sz1 < sizeof (struct ether_header)) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			ehp = (struct ether_header *)mp->b_rptr;

			/*
			 * Determine if this is a VLAN or non-VLAN packet.
			 */
			if ((sap = ntohs(ehp->ether_type)) == VLAN_TPID) {
				evhp = (struct ether_vlan_header *)mp->b_rptr;
				sap = ntohs(evhp->ether_type);
				hdrsize = sizeof (struct ether_vlan_header);
				/*
				 * Check if the VID of the packet, if any,
				 * belongs to this client.
				 */
				if (!mac_client_check_flow_vid(mcip,
				    VLAN_ID(ntohs(evhp->ether_tci)))) {
					mac_rx_drop_pkt(mac_srs, mp);
					continue;
				}
			} else {
				hdrsize = sizeof (struct ether_header);
			}
			/* Low bit of the first dest byte set => group addr */
			is_unicast =
			    ((((uint8_t *)&ehp->ether_dhost)[0] & 0x01) == 0);
			dstaddr = (uint8_t *)&ehp->ether_dhost;
		} else {
			mac_header_info_t mhi;

			/* Non-Ethernet media: let the plugin parse the header */
			if (mac_header_info((mac_handle_t)mcip->mci_mip,
			    mp, &mhi) != 0) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}
			hdrsize = mhi.mhi_hdrsize;
			sap = mhi.mhi_bindsap;
			is_unicast = (mhi.mhi_dsttype == MAC_ADDRTYPE_UNICAST);
			dstaddr = (uint8_t *)mhi.mhi_daddr;
		}

		if (!dls_bypass) {
			/* DLS must see this packet; fan out by flow only */
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}


		/*
		 * If we are using the default Rx ring where H/W or S/W
		 * classification has not happened, we need to verify if
		 * this unicast packet really belongs to us.
		 */
		if (sap == ETHERTYPE_IP) {
			/*
			 * If we are H/W classified, but we have promisc
			 * on, then we need to check for the unicast address.
			 */
			if (hw_classified && mcip->mci_promisc_list != NULL) {
				mac_address_t		*map;

				rw_enter(&mcip->mci_rw_lock, RW_READER);
				map = mcip->mci_unicast;
				if (bcmp(dstaddr, map->ma_addr,
				    map->ma_len) == 0)
					type = UNDEF;
				rw_exit(&mcip->mci_rw_lock);
			} else if (is_unicast) {
				type = UNDEF;
			}
		}

		/*
		 * This needs to become a contract with the driver for
		 * the fast path.
		 */

		ipha = (ipha_t *)(mp->b_rptr + hdrsize);
		if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
			type = OTH;
			fanout_oth1++;
		}

		if (type != OTH) {
			uint16_t	frag_offset_flags;

			switch (ipha->ipha_protocol) {
			case IPPROTO_TCP:
			case IPPROTO_UDP:
			case IPPROTO_SCTP:
			case IPPROTO_ESP:
				ipha_len = IPH_HDR_LENGTH(ipha);
				/* Ports must be in this mblk for port fanout */
				if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
				    mp->b_wptr) {
					type = OTH;
					break;
				}
				frag_offset_flags =
				    ntohs(ipha->ipha_fragment_offset_and_flags);
				/* Fragments carry no ports; demote to OTH */
				if ((frag_offset_flags &
				    (IPH_MF | IPH_OFFSET)) != 0) {
					type = OTH;
					fanout_oth3++;
					break;
				}
				ports_offset = hdrsize + ipha_len;
				break;
			default:
				type = OTH;
				fanout_oth4++;
				break;
			}
		}

		if (type == OTH) {
			if (mac_rx_srs_long_fanout(mac_srs, mp, sap,
			    hdrsize, &type, &indx) == -1) {
				mac_rx_drop_pkt(mac_srs, mp);
				continue;
			}

			FANOUT_ENQUEUE_MP(headmp[type][indx],
			    tailmp[type][indx], cnt[type][indx], bw_ctl,
			    sz[type][indx], sz1, mp);
			continue;
		}

		ASSERT(type == UNDEF);

		/*
		 * XXX-Sunay: We should hold srs_lock since ring_count
		 * below can change. But if we are always called from
		 * mac_rx_srs_drain and SRS_PROC is set, then we can
		 * enforce that ring_count can't be changed i.e.
		 * to change fanout type or ring count, the calling
		 * thread needs to be behind SRS_PROC.
		 */
		switch (ipha->ipha_protocol) {
		case IPPROTO_TCP:
			/*
			 * Note that for ESP, we fanout on SPI and it is at the
			 * same offset as the 2x16-bit ports. So it is clumped
			 * along with TCP, UDP and SCTP.
			 */
			hash = HASH_ADDR(ipha->ipha_src,
			    *(uint32_t *)(mp->b_rptr + ports_offset));
			indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count);
			type = V4_TCP;
			/* Strip the L2 header for the DLS-bypass fast path */
			mp->b_rptr += hdrsize;
			break;
		case IPPROTO_UDP:
		case IPPROTO_SCTP:
		case IPPROTO_ESP:
			if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
				hash = HASH_ADDR(ipha->ipha_src,
				    *(uint32_t *)(mp->b_rptr + ports_offset));
				indx = COMPUTE_INDEX(hash,
				    mac_srs->srs_udp_ring_count);
			} else {
				/* Round-robin fanout (MAC_FANOUT_RND_ROBIN) */
				indx = mac_srs->srs_ind %
				    mac_srs->srs_udp_ring_count;
				mac_srs->srs_ind++;
			}
			type = V4_UDP;
			/* Strip the L2 header for the DLS-bypass fast path */
			mp->b_rptr += hdrsize;
			break;
		default:
			indx = 0;
			type = OTH;
		}

		FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx],
		    cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp);
	}

	/* Hand each accumulated sub-chain to its soft ring in one shot. */
	for (type = V4_TCP; type < UNDEF; type++) {
		int	i;

		for (i = 0; i < fanout_cnt; i++) {
			if (headmp[type][i] != NULL) {
				mac_soft_ring_t	*softring;

				ASSERT(tailmp[type][i]->b_next == NULL);
				switch (type) {
				case V4_TCP:
					softring =
					    mac_srs->srs_tcp_soft_rings[i];
					break;
				case V4_UDP:
					softring =
					    mac_srs->srs_udp_soft_rings[i];
					break;
				case OTH:
					softring =
					    mac_srs->srs_oth_soft_rings[i];
					break;
				}
				mac_rx_soft_ring_process(mcip,
				    softring, headmp[type][i], tailmp[type][i],
				    cnt[type][i], sz[type][i]);
			}
		}
	}
}

/* Upper bound on bytes pulled per poll when no B/W control is in effect. */
#define	SRS_BYTES_TO_PICKUP	150000
ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP;

/*
 * mac_rx_srs_poll_ring
 *
 * This SRS Poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker thread
 * to do the remaining processing.
 *
 * Since packets come in the system via interrupt or poll path, we also
 * update the stats and deal with promiscuous clients here.
 */
void
mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_cv;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mblk_t			*head, *tail, *mp;
	callb_cpr_t		cprinfo;
	ssize_t			bytes_to_pickup;
	size_t			sz;
	int			count;
	mac_client_impl_t	*smcip;

	/* Register with CPR so suspend/resume can park this thread safely. */
	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
	mutex_enter(lock);

start:
	for (;;) {
		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

		/* Sleep until the drain path signals us to go poll the NIC. */
		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(async, lock);
		CALLB_CPR_SAFE_END(&cprinfo, lock);

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

check_again:
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			/*
			 * We pick as many bytes as we are allowed to queue.
			 * Its possible that we will exceed the total
			 * packets queued in case this SRS is part of the
			 * Rx ring group since > 1 poll thread can be pulling
			 * upto the max allowed packets at the same time
			 * but that should be OK.
			 */
			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
			bytes_to_pickup =
			    mac_srs->srs_bw->mac_bw_drop_threshold -
			    mac_srs->srs_bw->mac_bw_sz;
			/*
			 * We shouldn't have been signalled if we
			 * have 0 or less bytes to pick but since
			 * some of the bytes accounting is driver
			 * dependent, we do the safety check.
			 */
			if (bytes_to_pickup < 0)
				bytes_to_pickup = 0;
			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		} else {
			/*
			 * ToDO: Need to change the polling API
			 * to add a packet count and a flag which
			 * tells the driver whether we want packets
			 * based on a count, or bytes, or all the
			 * packets queued in the driver/HW. This
			 * way, we never have to check the limits
			 * on poll path. We truly let only as many
			 * packets enter the system as we are willing
			 * to process or queue.
			 *
			 * Something along the lines of
			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
			 *	mac_srs->srs_poll_pkt_cnt
			 */

			/*
			 * Since we are not doing B/W control, pick
			 * as many packets as allowed.
			 */
			bytes_to_pickup = max_bytes_to_pickup;
		}

		/* Poll the underlying Hardware */
		mutex_exit(lock);
		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
		mutex_enter(lock);

		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER);

		/* Walk the chain once to compute packet count and byte size. */
		mp = tail = head;
		count = 0;
		sz = 0;
		while (mp != NULL) {
			tail = mp;
			sz += msgdsize(mp);
			mp = mp->b_next;
			count++;
		}

		if (head != NULL) {
			tail->b_next = NULL;
			smcip = mac_srs->srs_mcip;

			if ((mac_srs->srs_type & SRST_FLOW) ||
			    (smcip == NULL)) {
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    rbytes, sz);
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    ipackets, count);
			}

			/*
			 * If there are any promiscuous mode callbacks
			 * defined for this MAC client, pass them a copy
			 * if appropriate and also update the counters.
			 */
			if (smcip != NULL) {
				smcip->mci_stat_ibytes += sz;
				smcip->mci_stat_ipackets += count;

				if (smcip->mci_mip->mi_promisc_list != NULL) {
					/* drop srs_lock across the dispatch */
					mutex_exit(lock);
					mac_promisc_dispatch(smcip->mci_mip,
					    head, NULL);
					mutex_enter(lock);
				}
			}
			if (mac_srs->srs_type & SRST_BW_CONTROL) {
				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
				mac_srs->srs_bw->mac_bw_polled += sz;
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
			}
			srs_rx->sr_poll_count += count;
			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
			    count, sz);
			/* Chain-length histogram for observability */
			if (count <= 10)
				srs_rx->sr_chain_cnt_undr10++;
			else if (count > 10 && count <= 50)
				srs_rx->sr_chain_cnt_10to50++;
			else
				srs_rx->sr_chain_cnt_over50++;
		}

		/*
		 * We are guaranteed that SRS_PROC will be set if we
		 * are here. Also, poll thread gets to run only if
		 * the drain was being done by a worker thread although
		 * its possible that worker thread is still running
		 * and poll thread was sent down to keep the pipeline
		 * going instead of doing a complete drain and then
		 * trying to poll the NIC.
		 *
		 * So we need to check SRS_WORKER flag to make sure
		 * that the worker thread is not processing the queue
		 * in parallel to us. The flags and conditions are
		 * protected by the srs_lock to prevent any race. We
		 * ensure that we don't drop the srs_lock from now
		 * till the end and similarly we don't drop the srs_lock
		 * in mac_rx_srs_drain() till similar condition check
		 * are complete. The mac_rx_srs_drain() needs to ensure
		 * that SRS_WORKER flag remains set as long as its
		 * processing the queue.
		 */
		if (!(mac_srs->srs_state & SRS_WORKER) &&
		    (mac_srs->srs_first != NULL)) {
			/*
			 * We have packets to process and worker thread
			 * is not running. Check to see if poll thread is
			 * allowed to process.
			 */
			if (mac_srs->srs_state & SRS_LATENCY_OPT) {
				/* Latency mode: drain inline from this thread */
				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat) {
					srs_rx->sr_poll_again++;
					goto check_again;
				}
				/*
				 * We are already above low water mark
				 * so stay in the polling mode but no
				 * need to poll. Once we dip below
				 * the polling threshold, the processing
				 * thread (soft ring) will signal us
				 * to poll again (MAC_UPDATE_SRS_COUNT)
				 */
				srs_rx->sr_poll_drain_no_poll++;
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				/*
				 * In B/W control case, its possible
				 * that the backlog built up due to
				 * B/W limit being reached and packets
				 * are queued only in SRS. In this case,
				 * we should schedule worker thread
				 * since no one else will wake us up.
				 */
				if ((mac_srs->srs_type & SRST_BW_CONTROL) &&
				    (mac_srs->srs_tid == NULL)) {
					mac_srs->srs_tid =
					    timeout(mac_srs_fire, mac_srs, 1);
					srs_rx->sr_poll_worker_wakeup++;
				}
			} else {
				/*
				 * Wakeup the worker thread for more processing.
				 * We optimize for throughput in this case.
				 */
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				MAC_SRS_WORKER_WAKEUP(mac_srs);
				srs_rx->sr_poll_sig_worker++;
			}
		} else if ((mac_srs->srs_first == NULL) &&
		    !(mac_srs->srs_state & SRS_WORKER)) {
			/*
			 * There is nothing queued in SRS and
			 * no worker thread running. Plus we
			 * didn't get anything from the H/W
			 * as well (head == NULL);
			 */
			ASSERT(head == NULL);
			mac_srs->srs_state &=
			    ~(SRS_PROC|SRS_GET_PKTS);

			/*
			 * If we have a packets in soft ring, don't allow
			 * more packets to come into this SRS by keeping the
			 * interrupts off but not polling the H/W. The
			 * poll thread will get signaled as soon as
			 * srs_poll_pkt_cnt dips below poll threshold.
			 */
			if (srs_rx->sr_poll_pkt_cnt == 0) {
				srs_rx->sr_poll_intr_enable++;
				MAC_SRS_POLLING_OFF(mac_srs);
			} else {
				/*
				 * We know nothing is queued in SRS
				 * since we are here after checking
				 * srs_first is NULL. The backlog
				 * is entirely due to packets queued
				 * in Soft ring which will wake us up
				 * and get the interface out of polling
				 * mode once the backlog dips below
				 * sr_poll_thres.
				 */
				srs_rx->sr_poll_no_poll++;
			}
		} else {
			/*
			 * Worker thread is already running.
			 * Nothing much to do. If the polling
			 * was enabled, worker thread will deal
			 * with that.
			 */
			mac_srs->srs_state &= ~SRS_GET_PKTS;
			srs_rx->sr_poll_goto_sleep++;
		}
	}
done:
	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
	cv_signal(&mac_srs->srs_async);
	/*
	 * If this is a temporary quiesce then wait for the restart signal
	 * from the srs worker. Then clear the flags and signal the srs worker
	 * to ensure a positive handshake and go back to start.
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
		cv_wait(async, lock);
	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs->srs_state &=
		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
		cv_signal(&mac_srs->srs_async);
		goto start;
	} else {
		/* Permanent teardown (SRS_CONDEMNED): exit the thread. */
		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
		cv_signal(&mac_srs->srs_async);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
	}
}

/*
 * mac_srs_pick_chain
 *
 * In Bandwidth control case, checks how many packets can be processed
 * and return them in a sub chain.
14808275SEric Cheng */ 14818275SEric Cheng static mblk_t * 14828275SEric Cheng mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail, 14838275SEric Cheng size_t *chain_sz, int *chain_cnt) 14848275SEric Cheng { 14858275SEric Cheng mblk_t *head = NULL; 14868275SEric Cheng mblk_t *tail = NULL; 14878275SEric Cheng size_t sz; 14888275SEric Cheng size_t tsz = 0; 14898275SEric Cheng int cnt = 0; 14908275SEric Cheng mblk_t *mp; 14918275SEric Cheng 14928275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_lock)); 14938275SEric Cheng mutex_enter(&mac_srs->srs_bw->mac_bw_lock); 14948275SEric Cheng if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <= 14958275SEric Cheng mac_srs->srs_bw->mac_bw_limit) || 14968275SEric Cheng (mac_srs->srs_bw->mac_bw_limit == 0)) { 14978275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 14988275SEric Cheng head = mac_srs->srs_first; 14998275SEric Cheng mac_srs->srs_first = NULL; 15008275SEric Cheng *chain_tail = mac_srs->srs_last; 15018275SEric Cheng mac_srs->srs_last = NULL; 15028275SEric Cheng *chain_sz = mac_srs->srs_size; 15038275SEric Cheng *chain_cnt = mac_srs->srs_count; 15048275SEric Cheng mac_srs->srs_count = 0; 15058275SEric Cheng mac_srs->srs_size = 0; 15068275SEric Cheng return (head); 15078275SEric Cheng } 15088275SEric Cheng 15098275SEric Cheng /* 15108275SEric Cheng * Can't clear the entire backlog. 
15118275SEric Cheng * Need to find how many packets to pick 15128275SEric Cheng */ 15138275SEric Cheng ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock)); 15148275SEric Cheng while ((mp = mac_srs->srs_first) != NULL) { 15158275SEric Cheng sz = msgdsize(mp); 15168275SEric Cheng if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) > 15178275SEric Cheng mac_srs->srs_bw->mac_bw_limit) { 15188275SEric Cheng if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) 15198275SEric Cheng mac_srs->srs_bw->mac_bw_state |= 15208275SEric Cheng SRS_BW_ENFORCED; 15218275SEric Cheng break; 15228275SEric Cheng } 15238275SEric Cheng 15248275SEric Cheng /* 15258275SEric Cheng * The _size & cnt is decremented from the softrings 15268275SEric Cheng * when they send up the packet for polling to work 15278275SEric Cheng * properly. 15288275SEric Cheng */ 15298275SEric Cheng tsz += sz; 15308275SEric Cheng cnt++; 15318275SEric Cheng mac_srs->srs_count--; 15328275SEric Cheng mac_srs->srs_size -= sz; 15338275SEric Cheng if (tail != NULL) 15348275SEric Cheng tail->b_next = mp; 15358275SEric Cheng else 15368275SEric Cheng head = mp; 15378275SEric Cheng tail = mp; 15388275SEric Cheng mac_srs->srs_first = mac_srs->srs_first->b_next; 15398275SEric Cheng } 15408275SEric Cheng mutex_exit(&mac_srs->srs_bw->mac_bw_lock); 15418275SEric Cheng if (mac_srs->srs_first == NULL) 15428275SEric Cheng mac_srs->srs_last = NULL; 15438275SEric Cheng 15448275SEric Cheng if (tail != NULL) 15458275SEric Cheng tail->b_next = NULL; 15468275SEric Cheng *chain_tail = tail; 15478275SEric Cheng *chain_cnt = cnt; 15488275SEric Cheng *chain_sz = tsz; 15498275SEric Cheng 15508275SEric Cheng return (head); 15518275SEric Cheng } 15528275SEric Cheng 15538275SEric Cheng /* 15548275SEric Cheng * mac_rx_srs_drain 15558275SEric Cheng * 15568275SEric Cheng * The SRS drain routine. Gets to run to clear the queue. Any thread 15578275SEric Cheng * (worker, interrupt, poll) can call this based on processing model. 
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
 *
 * proc_type is the SRS_* flag identifying the calling thread (e.g.
 * SRS_POLL_PROC or SRS_WORKER); it is OR'ed into srs_state for the
 * duration of the drain and cleared on the way out.
 *
 * Entered and exited with srs_lock held; the lock is dropped only
 * around the upcall into the client and the fanout routines.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	/* Nothing queued; just run the common exit checks below. */
	if (mac_srs->srs_first == NULL)
		goto out;

	if (!(mac_srs->srs_state & SRS_LATENCY_OPT) &&
	    (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)) {
		/*
		 * In the normal (latency-optimized) case, the SRS worker
		 * thread does no work and we wait for a backlog to build
		 * up before we switch into polling mode. In case we are
		 * optimizing for throughput, we use the worker thread
		 * as well. The goal is to let the worker thread process
		 * the queue and the poll thread feed packets into
		 * the queue. As such, we should signal the poll
		 * thread to try and get more packets.
		 *
		 * We could have pulled this check into the POLL_RING
		 * macro itself but keeping it explicit here makes
		 * the architecture more human understandable.
		 */
		MAC_SRS_POLL_RING(mac_srs);
	}

again:
	/* Detach the entire queued chain from the SRS under srs_lock. */
	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* Grab any pending worker-wakeup timeout id so we can cancel it. */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);


	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t proc;
		void *arg1;
		mac_resource_handle_t arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/* Upcall into the client without holding srs_lock. */
		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	if (!(mac_srs->srs_state & (SRS_LATENCY_OPT|SRS_BLANK|SRS_PAUSE))) {
		/*
		 * In case we are optimizing for throughput, we
		 * should try and keep the worker thread running
		 * as much as possible. Send the poll thread down
		 * to check one more time if something else
		 * arrived. In the meanwhile, if the poll thread had
		 * collected something due to an earlier signal,
		 * process it now (goto again).
		 */
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat) {
			srs_rx->sr_drain_poll_sig++;
			MAC_SRS_POLL_RING(mac_srs);
		}
		if (mac_srs->srs_first != NULL) {
			srs_rx->sr_drain_again++;
			goto again;
		}
	}

out:
	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * the poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done being the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turns polling on only for the worker thread.
	 * It's not worth turning polling on for the interrupt
	 * thread (since the NIC will not issue another interrupt)
	 * unless a backlog builds up.
	 */
	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		srs_rx->sr_drain_keep_polling++;
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
			MAC_SRS_POLL_RING(mac_srs);
		return;
	}

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
	srs_rx->sr_drain_finish_intr++;
}

/*
 * mac_rx_srs_drain_bw
 *
 * The SRS BW drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
 */
void
mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	size_t			sz = 0;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
again:
	/*
	 * Check if we are doing B/W control. On a new tick (lbolt moved),
	 * reset the usage counter and lift any enforcement; otherwise bail
	 * out if enforcement is (or has just become) active for this tick.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
		mac_srs->srs_bw->mac_bw_used = 0;
		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto done;
	}

	sz = 0;
	cnt = 0;
	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
		/*
		 * We couldn't pick up a single packet.
		 */
		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
		    (mac_srs->srs_size != 0) &&
		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
			/*
			 * Seems like the configured B/W doesn't
			 * even allow processing of 1 packet
			 * per tick.
			 *
			 * XXX: raise the limit to processing
			 * at least 1 packet per tick.
			 */
			mac_srs->srs_bw->mac_bw_limit +=
			    mac_srs->srs_bw->mac_bw_limit;
			mac_srs->srs_bw->mac_bw_drop_threshold +=
			    mac_srs->srs_bw->mac_bw_drop_threshold;
			/*
			 * NOTE(review): the message names mac_rx_srs_drain
			 * although it is emitted from mac_rx_srs_drain_bw.
			 */
			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
			    "raised B/W limit to %d since not even a "
			    "single packet can be processed per "
			    "tick %d\n", (void *)mac_srs,
			    (int)mac_srs->srs_bw->mac_bw_limit,
			    (int)msgdsize(mac_srs->srs_first));
		}
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* zero bandwidth: drop all and return to interrupt mode */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		srs_rx->sr_drop_count += cnt;
		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
		mac_srs->srs_bw->mac_bw_sz -= sz;
		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		mac_pkt_drop(NULL, NULL, head, B_FALSE);
		goto leave_poll;
	} else {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	}

	/* Grab any pending worker-wakeup timeout id so we can cancel it. */
	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);
	MAC_SRS_WORKER_POLLING_ON(mac_srs);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resources constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t proc;
		void *arg1;
		mac_resource_handle_t arg2;

		/*
		 * This is the case when a Rx is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/* Upcall into the client without holding srs_lock. */
		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);

		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Send the poll thread to pick up any packets arrived
	 * so far. This also serves as the last check in case
	 * nothing else is queued in the SRS. The poll thread
	 * is signalled only in the case the drain was done
	 * by the worker thread and SRS_WORKER is set. The
	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. If we have nothing else to
	 * process, we can exit while leaving SRS_PROC set
	 * which gives the poll thread control to process and
	 * cleanup once it returns from the NIC.
	 *
	 * If we have nothing else to process, we need to
	 * ensure that we keep holding the srs_lock till
	 * all the checks below are done and control is
	 * handed to the poll thread if it was running.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		if (mac_srs->srs_first != NULL) {
			if (proc_type == SRS_WORKER) {
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat)
					MAC_SRS_POLL_RING(mac_srs);
				goto again;
			} else {
				cv_signal(&mac_srs->srs_async);
			}
		}
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

done:

	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave the
		 * SRS_PROC set and hand over the control to
		 * the poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		return;
	}

	/*
	 * If we can't process packets because we have exceeded
	 * B/W limit for this tick, just set the timeout
	 * and leave.
	 *
	 * Even if there are no packets queued in SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog. As long as there are packets queued either
	 * in soft ring set or its soft rings, we will leave
	 * the interface in polling mode.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
	    (srs_rx->sr_poll_pkt_cnt > 0))) {
		MAC_SRS_POLLING_ON(mac_srs);
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		if ((mac_srs->srs_first != NULL) &&
		    (mac_srs->srs_tid == NULL))
			mac_srs->srs_tid = timeout(mac_srs_fire,
			    mac_srs, 1);
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		return;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

leave_poll:

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
}

/*
 * mac_srs_worker
 *
 * The SRS worker routine. Drains the queue when no one else is
 * processing it.
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_async;
	callb_cpr_t		cprinfo;
	boolean_t		bw_ctl_flag;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

	/*
	 * srs_lock is held for the whole loop below; cv_wait() drops and
	 * reacquires it atomically, so no wakeup can be missed.
	 */
start:
	for (;;) {
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wakeup the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting for ever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer is already
				 * scheduled or we force schedule one for
				 * later so that we can continue processing
				 * after this quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			/* CPR-safe sleep; lock is dropped inside cv_wait() */
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	if (!(mac_srs->srs_state &
	 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		/*
		 * Accumulate consecutive packets that map to the same flow
		 * (or the first packet of the chain) into one sub-chain;
		 * dispatch the previous sub-chain when the flow changes.
		 */
		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		tail->b_next = NULL;
		/*
		 * A null indicates, this is for the mac_srs itself.
		 * XXX-venu : probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}

/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_client_impl_t	*smcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats. Note this walk is done
	 * before taking srs_lock.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);
	smcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
	}
	if (smcip != NULL) {
		smcip->mci_stat_ibytes += sz;
		smcip->mci_stat_ipackets += count;
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		/* Count the packets and bytes via interrupt */
		srs_rx->sr_intr_count += count;
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_drop_count += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				/* Whole chain fits under the drop threshold */
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				/*
				 * Take as many leading packets as fit under
				 * the threshold; free the rest of the chain.
				 */
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_drop_count += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_drop_count += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
	/* Count the packets entering via interrupt path */
	srs_rx->sr_intr_count += count;

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback or if we are not
		 * optimizing for latency, we should signal the
		 * worker thread.
		 */
		if (loopback || !(mac_srs->srs_state & SRS_LATENCY_OPT)) {
			/*
			 * For loopback, We need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if its a single packet in non
			 * latency optimized case (in latency optimized
			 * case, we inline process chains of any size).
			 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}

/* TX SIDE ROUTINES (RUNTIME) */

/*
 * mac_tx_srs_no_desc
 *
 * This routine is called by Tx single ring default mode
 * when Tx ring runs out of descs.
 */
mac_tx_cookie_t
mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;
	boolean_t wakeup_worker = B_TRUE;
	uint32_t tx_mode = srs_tx->st_mode;
	int cnt, sz;
	mblk_t *tail;

	ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW);
	if (flag & MAC_DROP_ON_NO_DESC) {
		MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
	} else {
		if (mac_srs->srs_first != NULL)
			wakeup_worker = B_FALSE;
		MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
		if (flag & MAC_TX_NO_ENQUEUE) {
			/*
			 * If TX_QUEUED is not set, queue the
			 * packet and let mac_tx_srs_drain()
			 * set the TX_BLOCKED bit for the
			 * reasons explained above. Otherwise,
			 * return the mblks.
			 */
			if (wakeup_worker) {
				MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
				    mp_chain, tail, cnt, sz);
			} else {
				MAC_TX_SET_NO_ENQUEUE(mac_srs,
				    mp_chain, ret_mp, cookie);
			}
		} else {
			MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
			    tail, cnt, sz, cookie);
		}
		/* Only wake the worker if the queue was empty on entry */
		if (wakeup_worker)
			cv_signal(&mac_srs->srs_async);
	}
	return (cookie);
}

/*
 * mac_tx_srs_enqueue
 *
 * This routine is called when Tx SRS is operating in either serializer
 * or bandwidth mode. In serializer mode, a packet will get enqueued
 * when a thread cannot enter SRS exclusively. In bandwidth mode,
 * packets gets queued if allowed byte-count limit for a tick is
 * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC and
 * MAC_TX_NO_ENQUEUE is set is different than when operating in either
 * the default mode or fanout mode. Here packets get dropped or
 * returned back to the caller only after hi-watermark worth of data
 * is queued.
 */
static mac_tx_cookie_t
mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp)
{
	mac_tx_cookie_t cookie = NULL;
	int cnt, sz;
	mblk_t *tail;
	boolean_t wakeup_worker = B_TRUE;

	/*
	 * Ignore fanout hint if we don't have multiple tx rings.
	 */
	if (!TX_MULTI_RING_MODE(mac_srs))
		fanout_hint = 0;

	if (mac_srs->srs_first != NULL)
		wakeup_worker = B_FALSE;
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (flag & MAC_DROP_ON_NO_DESC) {
		if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		} else {
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else if (flag & MAC_TX_NO_ENQUEUE) {
		if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) ||
		    (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) {
			MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain,
			    ret_mp, cookie);
		} else {
			/* b_prev of the chain head carries the fanout hint */
			mp_chain->b_prev = (mblk_t *)fanout_hint;
			MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
			    mp_chain, tail, cnt, sz);
		}
	} else {
		/*
		 * If you are BW_ENFORCED, just enqueue the
		 * packet. srs_worker will drain it at the
		 * prescribed rate. Before enqueueing, save
		 * the fanout hint.
		 */
		mp_chain->b_prev = (mblk_t *)fanout_hint;
		MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain,
		    tail, cnt, sz, cookie);
	}
	if (wakeup_worker)
		cv_signal(&mac_srs->srs_async);
	return (cookie);
}

/*
 * There are five tx modes:
 *
 * 1) Default mode (SRS_TX_DEFAULT)
 * 2) Serialization mode (SRS_TX_SERIALIZE)
 * 3) Fanout mode (SRS_TX_FANOUT)
 * 4) Bandwidth mode (SRS_TX_BW)
 * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT)
 *
 * The tx mode in which an SRS operates is decided in mac_tx_srs_setup()
 * based on the number of Tx rings requested for an SRS and whether
 * bandwidth control is requested or not.
 *
 * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a
 * pass-thru. Packets will go directly to mac_tx_send(). When the underlying
 * Tx ring runs out of Tx descs, it starts queueing up packets in SRS.
 * When flow-control is relieved, the srs_worker drains the queued
 * packets and informs blocked clients to restart sending packets.
 *
 * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
 *
 * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple
 * Tx rings. Each Tx ring will have a soft ring associated with it.
 * These soft rings will be hung off the Tx SRS. Queueing if it happens
 * due to lack of Tx desc will be in individual soft ring (and not srs)
 * associated with Tx ring.
 *
 * In the TX_BW mode, tx srs will allow packets to go down to Tx ring
 * only if bw is available. Otherwise the packets will be queued in
 * SRS. If fanout to multiple Tx rings is configured, the packets will
 * be fanned out among the soft rings associated with the Tx rings.
 *
 * Four flags are used in srs_state for indicating flow control
 * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT.
 * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the
 * driver below.
 * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat
 * and flow-control pressure is applied back to clients. The clients expect
 * wakeup when flow-control is relieved.
 * SRS_TX_WAKEUP_CLIENT get set when (flag == MAC_TX_NO_ENQUEUE) and mblk
 * got returned back to client either due to lack of Tx descs or due to bw
 * control reasons. The clients expect a wakeup when condition is relieved.
 *
 * The fourth argument to mac_tx() is the flag. Normally it will be 0 but
 * some clients set the following values too: MAC_DROP_ON_NO_DESC,
 * MAC_TX_NO_ENQUEUE
 * Mac clients that do not want packets to be enqueued in the mac layer set
 * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or
 * Tx soft rings but instead get dropped when the NIC runs out of desc. The
 * behaviour of this flag is different when the Tx is running in serializer
 * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packet
 * get dropped when Tx high watermark is reached.
 * There are some mac clients like vsw, aggr that want the mblks to be
 * returned back to clients instead of being queued in Tx SRS (or Tx soft
 * rings) under flow-control (i.e., out of desc or exceeding bw limits)
 * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set.
 * In the default and Tx fanout mode, the un-transmitted mblks will be
 * returned back to the clients when the driver runs out of Tx descs.
 * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or
 * soft ring) so that the clients can be woken up when Tx desc become
 * available. When running in serializer or bandwidth mode,
 * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
 */

mac_tx_func_t
mac_tx_get_func(uint32_t mode)
{
	/* Map a tx mode to its transmit entry point via the mode table */
	return (mac_tx_mode_list[mode].mac_tx_func);
}

/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;

	ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT);

	/* Regular case with a single Tx ring */
	/*
	 * SRS_TX_BLOCKED is set when underlying NIC runs
	 * out of Tx descs and messages start getting
	 * queued. It won't get reset until
	 * tx_srs_drain() completely drains out the
	 * messages.
	 */
	if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
		/*
		 * Tx descs/resources not available. The flag is
		 * re-checked under srs_lock since the unlocked check
		 * above may race with the drain thread clearing it.
		 */
		mutex_enter(&mac_srs->srs_lock);
		if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) {
			cookie = mac_tx_srs_no_desc(mac_srs, mp_chain,
			    flag, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		/*
		 * While we were computing mblk count, the
		 * flow control condition got relieved.
		 * Continue with the transmission.
		 */
		mutex_exit(&mac_srs->srs_lock);
	}

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	/*
	 * Multiple threads could be here sending packets.
	 * Under such conditions, it is not possible to
	 * atomically set SRS_TX_BLOCKED bit to indicate
	 * out of tx desc condition. To atomically set
	 * this, we queue the returned packet and do
	 * the setting of SRS_TX_BLOCKED in
	 * mac_tx_srs_drain().
	 */
	if (mp_chain != NULL) {
		mutex_enter(&mac_srs->srs_lock);
		cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	if (is_subflow)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (NULL);
}

/*
 * mac_tx_serialize_mode
 *
 * This is an experimental mode implemented as per the request of PAE.
 * In this mode, all callers attempting to send a packet to the NIC
 * will get serialized. Only one thread at any time will access the
 * NIC to send the packet out.
 */
/* ARGSUSED */
static mac_tx_cookie_t
mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	/* Single ring, serialize below */
	ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE);
	mutex_enter(&mac_srs->srs_lock);
	if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_state & SRS_PROC)) {
		/*
		 * In serialization mode, queue all packets until
		 * TX_HIWAT is set.
		 * If drop bit is set, drop if TX_HIWAT is set.
		 * If no_enqueue is set, still enqueue until hiwat
		 * is set and return mblks after TX_HIWAT is set.
		 */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain,
		    flag, NULL, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	/*
	 * No packets queued, nothing on proc and no flow
	 * control condition. Fast-path, ok. Do inline
	 * processing.
	 */
	mac_srs->srs_state |= SRS_PROC;
	mutex_exit(&mac_srs->srs_lock);

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

	/* srs_lock is dropped across the driver call; SRS_PROC excludes peers */
	mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
	    mp_chain, (is_subflow ? &stats : NULL));

	mutex_enter(&mac_srs->srs_lock);
	mac_srs->srs_state &= ~SRS_PROC;
	if (mp_chain != NULL) {
		cookie = mac_tx_srs_enqueue(mac_srs,
		    mp_chain, flag, NULL, ret_mp);
	}
	if (mac_srs->srs_first != NULL) {
		/*
		 * We processed inline our packet and a new
		 * packet/s got queued while we were
		 * processing. Wakeup srs worker
		 */
		cv_signal(&mac_srs->srs_async);
	}
	mutex_exit(&mac_srs->srs_lock);

	if (is_subflow && cookie == NULL)
		FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

	return (cookie);
}

/*
 * mac_tx_fanout_mode
 *
 * In this mode, the SRS will have access to multiple Tx rings to send
 * the packet out. The fanout hint that is passed as an argument is
 * used to find an appropriate ring to fanout the traffic. Each Tx
 * ring, in turn, will have a soft ring associated with it. If a Tx
 * ring runs out of Tx desc's the returned packet will be queued in
 * the soft ring associated with that Tx ring. The srs itself will not
 * queue any packets.
27408275SEric Cheng */ 2741*8833SVenu.Iyer@Sun.COM 2742*8833SVenu.Iyer@Sun.COM #define MAC_TX_SOFT_RING_PROCESS(chain) { \ 2743*8833SVenu.Iyer@Sun.COM index = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count), \ 2744*8833SVenu.Iyer@Sun.COM softring = mac_srs->srs_oth_soft_rings[index]; \ 2745*8833SVenu.Iyer@Sun.COM cookie = mac_tx_soft_ring_process(softring, chain, flag, ret_mp); \ 2746*8833SVenu.Iyer@Sun.COM DTRACE_PROBE2(tx__fanout, uint64_t, hash, uint_t, index); \ 2747*8833SVenu.Iyer@Sun.COM } 2748*8833SVenu.Iyer@Sun.COM 27498275SEric Cheng static mac_tx_cookie_t 27508275SEric Cheng mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 27518275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 27528275SEric Cheng { 27538275SEric Cheng mac_soft_ring_t *softring; 2754*8833SVenu.Iyer@Sun.COM uint64_t hash; 2755*8833SVenu.Iyer@Sun.COM uint_t index; 2756*8833SVenu.Iyer@Sun.COM mac_tx_cookie_t cookie = NULL; 27578275SEric Cheng 27588275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2759*8833SVenu.Iyer@Sun.COM if (fanout_hint != 0) { 2760*8833SVenu.Iyer@Sun.COM /* 2761*8833SVenu.Iyer@Sun.COM * The hint is specified by the caller, simply pass the 2762*8833SVenu.Iyer@Sun.COM * whole chain to the soft ring. 2763*8833SVenu.Iyer@Sun.COM */ 2764*8833SVenu.Iyer@Sun.COM hash = HASH_HINT(fanout_hint); 2765*8833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(mp_chain); 2766*8833SVenu.Iyer@Sun.COM } else { 2767*8833SVenu.Iyer@Sun.COM mblk_t *last_mp, *cur_mp, *sub_chain; 2768*8833SVenu.Iyer@Sun.COM uint64_t last_hash = 0; 2769*8833SVenu.Iyer@Sun.COM uint_t media = mac_srs->srs_mcip->mci_mip->mi_info.mi_media; 2770*8833SVenu.Iyer@Sun.COM 2771*8833SVenu.Iyer@Sun.COM /* 2772*8833SVenu.Iyer@Sun.COM * Compute the hash from the contents (headers) of the 2773*8833SVenu.Iyer@Sun.COM * packets of the mblk chain. Split the chains into 2774*8833SVenu.Iyer@Sun.COM * subchains of the same conversation. 
2775*8833SVenu.Iyer@Sun.COM * 2776*8833SVenu.Iyer@Sun.COM * Since there may be more than one ring used for 2777*8833SVenu.Iyer@Sun.COM * sub-chains of the same call, and since the caller 2778*8833SVenu.Iyer@Sun.COM * does not maintain per conversation state since it 2779*8833SVenu.Iyer@Sun.COM * passed a zero hint, unsent subchains will be 2780*8833SVenu.Iyer@Sun.COM * dropped. 2781*8833SVenu.Iyer@Sun.COM */ 2782*8833SVenu.Iyer@Sun.COM 2783*8833SVenu.Iyer@Sun.COM flag |= MAC_DROP_ON_NO_DESC; 2784*8833SVenu.Iyer@Sun.COM ret_mp = NULL; 2785*8833SVenu.Iyer@Sun.COM 2786*8833SVenu.Iyer@Sun.COM ASSERT(ret_mp == NULL); 2787*8833SVenu.Iyer@Sun.COM 2788*8833SVenu.Iyer@Sun.COM sub_chain = NULL; 2789*8833SVenu.Iyer@Sun.COM last_mp = NULL; 2790*8833SVenu.Iyer@Sun.COM 2791*8833SVenu.Iyer@Sun.COM for (cur_mp = mp_chain; cur_mp != NULL; 2792*8833SVenu.Iyer@Sun.COM cur_mp = cur_mp->b_next) { 2793*8833SVenu.Iyer@Sun.COM hash = mac_pkt_hash(media, cur_mp, MAC_PKT_HASH_L4, 2794*8833SVenu.Iyer@Sun.COM B_TRUE); 2795*8833SVenu.Iyer@Sun.COM if (last_hash != 0 && hash != last_hash) { 2796*8833SVenu.Iyer@Sun.COM /* 2797*8833SVenu.Iyer@Sun.COM * Starting a different subchain, send current 2798*8833SVenu.Iyer@Sun.COM * chain out. 
2799*8833SVenu.Iyer@Sun.COM */ 2800*8833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 2801*8833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 2802*8833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 2803*8833SVenu.Iyer@Sun.COM sub_chain = NULL; 2804*8833SVenu.Iyer@Sun.COM } 2805*8833SVenu.Iyer@Sun.COM 2806*8833SVenu.Iyer@Sun.COM /* add packet to subchain */ 2807*8833SVenu.Iyer@Sun.COM if (sub_chain == NULL) 2808*8833SVenu.Iyer@Sun.COM sub_chain = cur_mp; 2809*8833SVenu.Iyer@Sun.COM last_mp = cur_mp; 2810*8833SVenu.Iyer@Sun.COM last_hash = hash; 2811*8833SVenu.Iyer@Sun.COM } 2812*8833SVenu.Iyer@Sun.COM 2813*8833SVenu.Iyer@Sun.COM if (sub_chain != NULL) { 2814*8833SVenu.Iyer@Sun.COM /* send last subchain */ 2815*8833SVenu.Iyer@Sun.COM ASSERT(last_mp != NULL); 2816*8833SVenu.Iyer@Sun.COM last_mp->b_next = NULL; 2817*8833SVenu.Iyer@Sun.COM MAC_TX_SOFT_RING_PROCESS(sub_chain); 2818*8833SVenu.Iyer@Sun.COM } 2819*8833SVenu.Iyer@Sun.COM 2820*8833SVenu.Iyer@Sun.COM cookie = NULL; 2821*8833SVenu.Iyer@Sun.COM } 2822*8833SVenu.Iyer@Sun.COM 2823*8833SVenu.Iyer@Sun.COM return (cookie); 28248275SEric Cheng } 28258275SEric Cheng 28268275SEric Cheng /* 28278275SEric Cheng * mac_tx_bw_mode 28288275SEric Cheng * 28298275SEric Cheng * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 28308275SEric Cheng * only if bw is available. Otherwise the packets will be queued in 28318275SEric Cheng * SRS. If the SRS has multiple Tx rings, then packets will get fanned 28328275SEric Cheng * out to a Tx rings. 
 */
static mac_tx_cookie_t
mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain,
    uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp)
{
	int			cnt, sz;
	mblk_t			*tail;
	mac_tx_cookie_t		cookie = NULL;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	ASSERT(TX_BANDWIDTH_MODE(mac_srs));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
	mutex_enter(&mac_srs->srs_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		/*
		 * zero bandwidth, no traffic is sent: drop the packets,
		 * or return the whole chain if the caller requests all
		 * unsent packets back.
		 */
		if (flag & MAC_TX_NO_ENQUEUE) {
			cookie = (mac_tx_cookie_t)mac_srs;
			*ret_mp = mp_chain;
		} else {
			MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie);
		}
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	} else if ((mac_srs->srs_first != NULL) ||
	    (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		/* Backlog present or limit enforced: queue behind it. */
		cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
		    fanout_hint, ret_mp);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}
	MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
		/* New tick: the bandwidth budget renews. */
		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
		mac_srs->srs_bw->mac_bw_used = 0;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs,
		    mp_chain, tail, cnt, sz);
		/*
		 * Wakeup worker thread. Note that worker
		 * thread has to be woken up so that it
		 * can fire up the timer to be woken up
		 * on the next tick. Also once
		 * BW_ENFORCED is set, it can only be
		 * reset by srs_worker thread. Until then
		 * all packets will get queued up in SRS
		 * and hence this code path won't be
		 * entered until BW_ENFORCED is reset.
		 */
		cv_signal(&mac_srs->srs_async);
		mutex_exit(&mac_srs->srs_lock);
		return (cookie);
	}

	mac_srs->srs_bw->mac_bw_used += sz;
	mutex_exit(&mac_srs->srs_lock);

	if (srs_tx->st_mode == SRS_TX_BW_FANOUT) {
		mac_soft_ring_t *softring;
		uint_t indx, hash;

		/* Fan out to a soft ring based on the caller's hint. */
		hash = HASH_HINT(fanout_hint);
		indx = COMPUTE_INDEX(hash,
		    mac_srs->srs_oth_ring_count);
		softring = mac_srs->srs_oth_soft_rings[indx];
		return (mac_tx_soft_ring_process(softring, mp_chain, flag,
		    ret_mp));
	} else {
		boolean_t		is_subflow;
		mac_tx_stats_t		stats;

		is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

		mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
		    mp_chain, (is_subflow ? &stats : NULL));

		if (mp_chain != NULL) {
			/*
			 * Partial send: return the unused portion of the
			 * bandwidth budget and queue the remainder.
			 */
			mutex_enter(&mac_srs->srs_lock);
			MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
			if (mac_srs->srs_bw->mac_bw_used > sz)
				mac_srs->srs_bw->mac_bw_used -= sz;
			else
				mac_srs->srs_bw->mac_bw_used = 0;
			cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag,
			    fanout_hint, ret_mp);
			mutex_exit(&mac_srs->srs_lock);
			return (cookie);
		}
		if (is_subflow)
			FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);

		return (NULL);
	}
}

/*
 * mac_tx_srs_drain
 *
 * Drain the packets queued on a Tx SRS. Called with srs_lock held by
 * the SRS worker thread; the mode-specific branches below temporarily
 * drop the lock around the actual send.
 */
/* ARGSUSED */
void
mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head, *tail;
	size_t			sz;
	uint32_t		tx_mode;
	uint_t			saved_pkt_count;
	boolean_t		is_subflow;
	mac_tx_stats_t		stats;
	mac_srs_tx_t		*srs_tx = &mac_srs->srs_tx;

	saved_pkt_count = 0;
	ASSERT(mutex_owned(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_state & SRS_PROC));

	mac_srs->srs_state |= SRS_PROC;

	is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);
	tx_mode = srs_tx->st_mode;
	if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) {
		if (mac_srs->srs_first != NULL) {
			/* Detach the whole queue, then send unlocked. */
			head = mac_srs->srs_first;
			tail = mac_srs->srs_last;
			saved_pkt_count =
			    mac_srs->srs_count;
			mac_srs->srs_first = NULL;
			mac_srs->srs_last = NULL;
			mac_srs->srs_count = 0;
			mutex_exit(&mac_srs->srs_lock);

			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
			    head, &stats);

			mutex_enter(&mac_srs->srs_lock);
			if (head != NULL) {
				/* Device out of tx desc, set block */
				if (head->b_next == NULL)
					VERIFY(head == tail);
				/* Re-prepend the unsent chain to the queue. */
				tail->b_next = mac_srs->srs_first;
				mac_srs->srs_first = head;
				mac_srs->srs_count +=
				    (saved_pkt_count - stats.ts_opackets);
				if (mac_srs->srs_last == NULL)
					mac_srs->srs_last = tail;
				MAC_TX_SRS_BLOCK(mac_srs, head);
			} else {
				srs_tx->st_woken_up = B_FALSE;
				if (is_subflow) {
					FLOW_TX_STATS_UPDATE(
					    mac_srs->srs_flent, &stats);
				}
			}
		}
	} else if (tx_mode == SRS_TX_BW) {
		/*
		 * We are here because the timer fired and we have some data
		 * to transmit. Also mac_tx_srs_worker should have reset
		 * SRS_BW_ENFORCED flag
		 */
		ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED));
		head = tail = mac_srs->srs_first;
		while (mac_srs->srs_first != NULL) {
			/* Dequeue packets until the bw budget runs out. */
			tail = mac_srs->srs_first;
			tail->b_prev = NULL;
			mac_srs->srs_first = tail->b_next;
			if (mac_srs->srs_first == NULL)
				mac_srs->srs_last = NULL;
			mac_srs->srs_count--;
			sz = msgdsize(tail);
			mac_srs->srs_size -= sz;
			saved_pkt_count++;
			MAC_TX_UPDATE_BW_INFO(mac_srs, sz);

			if (mac_srs->srs_bw->mac_bw_used <
			    mac_srs->srs_bw->mac_bw_limit)
				continue;

			if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
				/* New tick: budget renews, keep draining. */
				mac_srs->srs_bw->mac_bw_curr_time = lbolt;
				mac_srs->srs_bw->mac_bw_used = sz;
				continue;
			}
			/* Budget exhausted within this tick; stop. */
			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
			break;
		}

		ASSERT((head == NULL && tail == NULL) ||
		    (head != NULL && tail != NULL));
		if (tail != NULL) {
			tail->b_next = NULL;
			mutex_exit(&mac_srs->srs_lock);

			head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2,
			    head, &stats);

			mutex_enter(&mac_srs->srs_lock);
			if (head != NULL) {
				uint_t size_sent;

				/* Device out of tx desc, set block */
				if (head->b_next == NULL)
					VERIFY(head == tail);
				/* Re-prepend unsent chain; undo accounting. */
				tail->b_next = mac_srs->srs_first;
				mac_srs->srs_first = head;
				mac_srs->srs_count +=
				    (saved_pkt_count - stats.ts_opackets);
				if (mac_srs->srs_last == NULL)
					mac_srs->srs_last = tail;
				size_sent = sz - stats.ts_obytes;
				mac_srs->srs_size += size_sent;
				mac_srs->srs_bw->mac_bw_sz += size_sent;
				if (mac_srs->srs_bw->mac_bw_used > size_sent) {
					mac_srs->srs_bw->mac_bw_used -=
					    size_sent;
				} else {
					mac_srs->srs_bw->mac_bw_used = 0;
				}
				MAC_TX_SRS_BLOCK(mac_srs, head);
			} else {
				srs_tx->st_woken_up = B_FALSE;
				if (is_subflow) {
					FLOW_TX_STATS_UPDATE(
					    mac_srs->srs_flent, &stats);
				}
			}
		}
	} else if (tx_mode == SRS_TX_BW_FANOUT) {
		mblk_t *prev;
		mac_soft_ring_t *softring;
		uint64_t hint;

		/*
		 * We are here because the timer fired and we
		 * have some quota to transmit.
		 */
		prev = NULL;
		head = tail = mac_srs->srs_first;
		while (mac_srs->srs_first != NULL) {
			tail = mac_srs->srs_first;
			mac_srs->srs_first = tail->b_next;
			if (mac_srs->srs_first == NULL)
				mac_srs->srs_last = NULL;
			mac_srs->srs_count--;
			sz = msgdsize(tail);
			mac_srs->srs_size -= sz;
			mac_srs->srs_bw->mac_bw_used += sz;
			/* b_prev carries the fanout hint saved at enqueue. */
			if (prev == NULL)
				hint = (ulong_t)tail->b_prev;
			if (hint != (ulong_t)tail->b_prev) {
				/* Hint changed: flush the sub-chain so far. */
				prev->b_next = NULL;
				mutex_exit(&mac_srs->srs_lock);
				TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
				head = tail;
				hint = (ulong_t)tail->b_prev;
				mutex_enter(&mac_srs->srs_lock);
			}

			prev = tail;
			tail->b_prev = NULL;
			if (mac_srs->srs_bw->mac_bw_used <
			    mac_srs->srs_bw->mac_bw_limit)
				continue;

			if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
				mac_srs->srs_bw->mac_bw_curr_time = lbolt;
				mac_srs->srs_bw->mac_bw_used = 0;
				continue;
			}
			mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
			break;
		}
		ASSERT((head == NULL && tail == NULL) ||
		    (head != NULL && tail != NULL));
		if (tail != NULL) {
			tail->b_next = NULL;
			mutex_exit(&mac_srs->srs_lock);
			TX_SRS_TO_SOFT_RING(mac_srs, head, hint);
			mutex_enter(&mac_srs->srs_lock);
		}
	}
	/*
	 * SRS_TX_FANOUT case not considered here because packets
	 * won't be queued in the SRS for this case. Packets will
	 * be sent directly to soft rings underneath and if there
	 * is any queueing at all, it would be in Tx side soft
	 * rings.
	 */

	/*
	 * When srs_count becomes 0, reset SRS_TX_HIWAT and
	 * SRS_TX_WAKEUP_CLIENT and wakeup registered clients.
	 */
	if (mac_srs->srs_count == 0 && (mac_srs->srs_state &
	    (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) {
		mac_tx_notify_cb_t	*mtnfp;
		mac_cb_t		*mcb;
		mac_client_impl_t	*mcip = mac_srs->srs_mcip;
		boolean_t		wakeup_required = B_FALSE;

		if (mac_srs->srs_state &
		    (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) {
			wakeup_required = B_TRUE;
		}
		mac_srs->srs_state &= ~(SRS_TX_HIWAT |
		    SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED);
		mutex_exit(&mac_srs->srs_lock);
		if (wakeup_required) {
			/* Wakeup callback registered clients */
			MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info);
			for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL;
			    mcb = mcb->mcb_nextp) {
				mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp;
				mtnfp->mtnf_fn(mtnfp->mtnf_arg,
				    (mac_tx_cookie_t)mac_srs);
			}
			MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info,
			    &mcip->mci_tx_notify_cb_list);
			/*
			 * If the client is not the primary MAC client, then we
			 * need to send the notification to the clients upper
			 * MAC, i.e. mci_upper_mip.
			 */
			mac_tx_notify(mcip->mci_upper_mip != NULL ?
			    mcip->mci_upper_mip : mcip->mci_mip);
		}
		mutex_enter(&mac_srs->srs_lock);
	}
	mac_srs->srs_state &= ~SRS_PROC;
}

/*
 * Given a packet, get the flow_entry that identifies the flow
 * to which that packet belongs. The flow_entry will contain
 * the transmit function to be used to send the packet. If the
 * function returns NULL, the packet should be sent using the
 * underlying NIC.
 */
static flow_entry_t *
mac_tx_classify(mac_impl_t *mip, mblk_t *mp)
{
	flow_entry_t		*flent = NULL;
	mac_client_impl_t	*mcip;
	int			err;

	/*
	 * Do classification on the packet.
	 */
	err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent);
	if (err != 0)
		return (NULL);

	/*
	 * This flent might just be an additional one on the MAC client,
	 * i.e. for classification purposes (different fdesc), however
	 * the resources, SRS et. al., are in the mci_flent, so if
	 * this isn't the mci_flent, we need to get it.
 */
	if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) {
		FLOW_REFRELE(flent);
		flent = mcip->mci_flent;
		FLOW_TRY_REFHOLD(flent, err);
		if (err != 0)
			return (NULL);
	}

	return (flent);
}

/*
 * This macro is only meant to be used by mac_tx_send().
 * NOTE: it uses 'continue' and so must only be expanded inside
 * mac_tx_send()'s per-packet loop; it also relies on the locals
 * vid_check, add_tag, src_mcip, next, oerrors and vid.
 */
#define	CHECK_VID_AND_ADD_TAG(mp) {			\
	if (vid_check) {				\
		int err = 0;				\
							\
		MAC_VID_CHECK(src_mcip, (mp), err);	\
		if (err != 0) {				\
			freemsg((mp));			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
	if (add_tag) {					\
		(mp) = mac_add_vlan_tag((mp), 0, vid);	\
		if ((mp) == NULL) {			\
			(mp) = next;			\
			oerrors++;			\
			continue;			\
		}					\
	}						\
}

/*
 * mac_tx_send
 *
 * Transmit a packet chain on behalf of a MAC client: either via the
 * fastpath straight to the NIC, or via classification/loopback when
 * multiple clients or promiscuous callbacks exist. Returns the chain
 * of packets the driver could not accept (NULL if all were sent).
 */
mblk_t *
mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain,
    mac_tx_stats_t *stats)
{
	mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch;
	mac_impl_t *mip = src_mcip->mci_mip;
	uint_t obytes = 0, opackets = 0, oerrors = 0;
	mblk_t *mp = NULL, *next;
	boolean_t vid_check, add_tag;
	uint16_t vid = 0;

	if (mip->mi_nclients > 1) {
		vid_check = MAC_VID_CHECK_NEEDED(src_mcip);
		add_tag = MAC_TAG_NEEDED(src_mcip);
		if (add_tag)
			vid = mac_client_vid(mch);
	} else {
		ASSERT(mip->mi_nclients == 1);
		vid_check = add_tag = B_FALSE;
	}

	/*
	 * Fastpath: if there's only one client, and there's no
	 * multicast listeners, we simply send the packet down to the
	 * underlying NIC.
	 */
	if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) {
		DTRACE_PROBE2(fastpath,
		    mac_client_impl_t *, src_mcip, mblk_t *, mp_chain);

		mp = mp_chain;
		while (mp != NULL) {
			next = mp->b_next;
			mp->b_next = NULL;
			opackets++;
			obytes += (mp->b_cont == NULL ? MBLKL(mp) :
			    msgdsize(mp));

			CHECK_VID_AND_ADD_TAG(mp);
			MAC_TX(mip, ring, mp, src_mcip);

			/*
			 * If the driver is out of descriptors and does a
			 * partial send it will return a chain of unsent
			 * mblks. Adjust the accounting stats.
			 */
			if (mp != NULL) {
				opackets--;
				obytes -= msgdsize(mp);
				mp->b_next = next;
				break;
			}
			mp = next;
		}
		goto done;
	}

	/*
	 * No fastpath, we either have more than one MAC client
	 * defined on top of the same MAC, or one or more MAC
	 * client promiscuous callbacks.
	 */
	DTRACE_PROBE3(slowpath, mac_client_impl_t *,
	    src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain);

	mp = mp_chain;
	while (mp != NULL) {
		flow_entry_t		*dst_flow_ent;
		void			*flow_cookie;
		size_t			pkt_size;
		mblk_t			*mp1;

		next = mp->b_next;
		mp->b_next = NULL;
		opackets++;
		pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp));
		obytes += pkt_size;
		CHECK_VID_AND_ADD_TAG(mp);

		/*
		 * Check if there are promiscuous mode callbacks defined.
		 */
		if (mip->mi_promisc_list != NULL)
			mac_promisc_dispatch(mip, mp, src_mcip);

		/*
		 * Find the destination.
		 */
		dst_flow_ent = mac_tx_classify(mip, mp);

		if (dst_flow_ent != NULL) {
			size_t	hdrsize;
			int	err = 0;

			if (mip->mi_info.mi_nativemedia == DL_ETHER) {
				struct ether_vlan_header *evhp =
				    (struct ether_vlan_header *)mp->b_rptr;

				if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN)
					hdrsize = sizeof (*evhp);
				else
					hdrsize = sizeof (struct ether_header);
			} else {
				mac_header_info_t	mhi;

				err = mac_header_info((mac_handle_t)mip,
				    mp, &mhi);
				if (err == 0)
					hdrsize = mhi.mhi_hdrsize;
			}

			/*
			 * Got a matching flow. It's either another
			 * MAC client, or a broadcast/multicast flow.
			 * Make sure the packet size is within the
			 * allowed size. If not drop the packet and
			 * move to next packet.
			 */
			if (err != 0 ||
			    (pkt_size - hdrsize) > mip->mi_sdu_max) {
				oerrors++;
				DTRACE_PROBE2(loopback__drop, size_t, pkt_size,
				    mblk_t *, mp);
				freemsg(mp);
				mp = next;
				FLOW_REFRELE(dst_flow_ent);
				continue;
			}
			flow_cookie = mac_flow_get_client_cookie(dst_flow_ent);
			if (flow_cookie != NULL) {
				/*
				 * The vnic_bcast_send function expects
				 * to receive the sender MAC client
				 * as value for arg2.
				 */
				mac_bcast_send(flow_cookie, src_mcip, mp,
				    B_TRUE);
			} else {
				/*
				 * loopback the packet to a
				 * local MAC client. We force a context
				 * switch if both source and destination
				 * MAC clients are used by IP, i.e. bypass
				 * is set.
				 */
				boolean_t	do_switch;
				mac_client_impl_t *dst_mcip =
				    dst_flow_ent->fe_mcip;

				do_switch = ((src_mcip->mci_state_flags &
				    dst_mcip->mci_state_flags &
				    MCIS_CLIENT_POLL_CAPABLE) != 0);

				if ((mp1 = mac_fix_cksum(mp)) != NULL) {
					(dst_flow_ent->fe_cb_fn)(
					    dst_flow_ent->fe_cb_arg1,
					    dst_flow_ent->fe_cb_arg2,
					    mp1, do_switch);
				}
			}
			FLOW_REFRELE(dst_flow_ent);
		} else {
			/*
			 * Unknown destination, send via the underlying
			 * NIC.
			 */
			MAC_TX(mip, ring, mp, src_mcip);
			if (mp != NULL) {
				/*
				 * Adjust for the last packet that
				 * could not be transmitted
				 */
				opackets--;
				obytes -= pkt_size;
				mp->b_next = next;
				break;
			}
		}
		mp = next;
	}

done:
	src_mcip->mci_stat_obytes += obytes;
	src_mcip->mci_stat_opackets += opackets;
	src_mcip->mci_stat_oerrors += oerrors;

	if (stats != NULL) {
		stats->ts_opackets = opackets;
		stats->ts_obytes = obytes;
		stats->ts_oerrors = oerrors;
	}
	return (mp);
}

/*
 * mac_tx_srs_ring_present
 *
 * Returns whether the specified ring is part of the specified SRS.
 */
boolean_t
mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring)
{
	int i;
	mac_soft_ring_t *soft_ring;

	/* The SRS's own Tx ring counts as present. */
	if (srs->srs_tx.st_arg2 == tx_ring)
		return (B_TRUE);

	/* Otherwise scan the soft rings fanned out under this SRS. */
	for (i = 0; i < srs->srs_oth_ring_count; i++) {
		soft_ring = srs->srs_oth_soft_rings[i];
		if (soft_ring->s_ring_tx_arg2 == tx_ring)
			return (B_TRUE);
	}

	return (B_FALSE);
}

/*
 * mac_tx_srs_wakeup
 *
 * Called when Tx desc become available. Wakeup the appropriate worker
 * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the
 * state field.
 */
void
mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring)
{
	int i;
	mac_soft_ring_t *sringp;
	mac_srs_tx_t *srs_tx = &mac_srs->srs_tx;

	mutex_enter(&mac_srs->srs_lock);
	if (TX_SINGLE_RING_MODE(mac_srs)) {
		if (srs_tx->st_arg2 == ring &&
		    mac_srs->srs_state & SRS_TX_BLOCKED) {
			mac_srs->srs_state &= ~SRS_TX_BLOCKED;
			srs_tx->st_unblocked_cnt++;
			cv_signal(&mac_srs->srs_async);
		}
		/*
		 * A wakeup can come before tx_srs_drain() could
		 * grab srs lock and set SRS_TX_BLOCKED. So
		 * always set woken_up flag when we come here.
		 */
		srs_tx->st_woken_up = B_TRUE;
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	/* If you are here, it is for FANOUT or BW_FANOUT case */
	ASSERT(TX_MULTI_RING_MODE(mac_srs));
	for (i = 0; i < mac_srs->srs_oth_ring_count; i++) {
		sringp = mac_srs->srs_oth_soft_rings[i];
		mutex_enter(&sringp->s_ring_lock);
		if (sringp->s_ring_tx_arg2 == ring) {
			if (sringp->s_ring_state & S_RING_BLOCK) {
				sringp->s_ring_state &= ~S_RING_BLOCK;
				sringp->s_ring_unblocked_cnt++;
				cv_signal(&sringp->s_ring_async);
			}
			sringp->s_ring_tx_woken_up = B_TRUE;
		}
		mutex_exit(&sringp->s_ring_lock);
	}
	mutex_exit(&mac_srs->srs_lock);
}

/*
 * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash
 * the blocked clients again.
 */
void
mac_tx_notify(mac_impl_t *mip)
{
	i_mac_notify(mip, MAC_NOTE_TX);
}

/*
 * RX SOFTRING RELATED FUNCTIONS
 *
 * These functions really belong in mac_soft_ring.c and here for
 * a short period.
 */

#define	SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) {	\
	/*							\
	 * Enqueue our mblk chain.
\ 35138275SEric Cheng */ \ 35148275SEric Cheng ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 35158275SEric Cheng \ 35168275SEric Cheng if ((ringp)->s_ring_last != NULL) \ 35178275SEric Cheng (ringp)->s_ring_last->b_next = (mp); \ 35188275SEric Cheng else \ 35198275SEric Cheng (ringp)->s_ring_first = (mp); \ 35208275SEric Cheng (ringp)->s_ring_last = (tail); \ 35218275SEric Cheng (ringp)->s_ring_count += (cnt); \ 35228275SEric Cheng ASSERT((ringp)->s_ring_count > 0); \ 35238275SEric Cheng if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 35248275SEric Cheng (ringp)->s_ring_size += sz; \ 35258275SEric Cheng } \ 35268275SEric Cheng } 35278275SEric Cheng 35288275SEric Cheng /* 35298275SEric Cheng * Default entry point to deliver a packet chain to a MAC client. 35308275SEric Cheng * If the MAC client has flows, do the classification with these 35318275SEric Cheng * flows as well. 35328275SEric Cheng */ 35338275SEric Cheng /* ARGSUSED */ 35348275SEric Cheng void 35358275SEric Cheng mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 35368275SEric Cheng mac_header_info_t *arg3) 35378275SEric Cheng { 35388275SEric Cheng mac_client_impl_t *mcip = arg1; 35398275SEric Cheng 35408275SEric Cheng if (mcip->mci_nvids == 1 && 35418275SEric Cheng !(mcip->mci_state_flags & MCIS_TAG_DISABLE)) { 35428275SEric Cheng /* 35438275SEric Cheng * If the client has exactly one VID associated with it 35448275SEric Cheng * and striping of VLAN header is not disabled, 35458275SEric Cheng * remove the VLAN tag from the packet before 35468275SEric Cheng * passing it on to the client's receive callback. 35478275SEric Cheng * Note that this needs to be done after we dispatch 35488275SEric Cheng * the packet to the promiscuous listeners of the 35498275SEric Cheng * client, since they expect to see the whole 35508275SEric Cheng * frame including the VLAN headers. 
35518275SEric Cheng */ 35528275SEric Cheng mp_chain = mac_strip_vlan_tag_chain(mp_chain); 35538275SEric Cheng } 35548275SEric Cheng 35558275SEric Cheng mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 35568275SEric Cheng } 35578275SEric Cheng 35588275SEric Cheng /* 35598275SEric Cheng * mac_rx_soft_ring_process 35608275SEric Cheng * 35618275SEric Cheng * process a chain for a given soft ring. The number of packets queued 35628275SEric Cheng * in the SRS and its associated soft rings (including this one) is 35638275SEric Cheng * very small (tracked by srs_poll_pkt_cnt), then allow the entering 35648275SEric Cheng * thread (interrupt or poll thread) to do inline processing. This 35658275SEric Cheng * helps keep the latency down under low load. 35668275SEric Cheng * 35678275SEric Cheng * The proc and arg for each mblk is already stored in the mblk in 35688275SEric Cheng * appropriate places. 35698275SEric Cheng */ 35708275SEric Cheng /* ARGSUSED */ 35718275SEric Cheng void 35728275SEric Cheng mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 35738275SEric Cheng mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 35748275SEric Cheng { 35758275SEric Cheng mac_direct_rx_t proc; 35768275SEric Cheng void *arg1; 35778275SEric Cheng mac_resource_handle_t arg2; 35788275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 35798275SEric Cheng 35808275SEric Cheng ASSERT(ringp != NULL); 35818275SEric Cheng ASSERT(mp_chain != NULL); 35828275SEric Cheng ASSERT(tail != NULL); 35838275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 35848275SEric Cheng 35858275SEric Cheng mutex_enter(&ringp->s_ring_lock); 35868275SEric Cheng ringp->s_ring_total_inpkt += cnt; 3587*8833SVenu.Iyer@Sun.COM if ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 3588*8833SVenu.Iyer@Sun.COM !(ringp->s_ring_type & ST_RING_WORKER_ONLY)) { 35898275SEric Cheng /* If on processor or blanking on, then enqueue and return */ 35908275SEric Cheng if (ringp->s_ring_state & 
S_RING_BLANK || 35918275SEric Cheng ringp->s_ring_state & S_RING_PROC) { 35928275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 35938275SEric Cheng mutex_exit(&ringp->s_ring_lock); 35948275SEric Cheng return; 35958275SEric Cheng } 35968275SEric Cheng proc = ringp->s_ring_rx_func; 35978275SEric Cheng arg1 = ringp->s_ring_rx_arg1; 35988275SEric Cheng arg2 = ringp->s_ring_rx_arg2; 35998275SEric Cheng /* 36008275SEric Cheng * See if anything is already queued. If we are the 36018275SEric Cheng * first packet, do inline processing else queue the 36028275SEric Cheng * packet and do the drain. 36038275SEric Cheng */ 36048275SEric Cheng if (ringp->s_ring_first == NULL) { 36058275SEric Cheng /* 36068275SEric Cheng * Fast-path, ok to process and nothing queued. 36078275SEric Cheng */ 36088275SEric Cheng ringp->s_ring_run = curthread; 36098275SEric Cheng ringp->s_ring_state |= (S_RING_PROC); 36108275SEric Cheng 36118275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36128275SEric Cheng 36138275SEric Cheng /* 36148275SEric Cheng * We are the chain of 1 packet so 36158275SEric Cheng * go through this fast path. 36168275SEric Cheng */ 36178275SEric Cheng ASSERT(mp_chain->b_next == NULL); 36188275SEric Cheng 36198275SEric Cheng (*proc)(arg1, arg2, mp_chain, NULL); 36208275SEric Cheng 36218275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 36228275SEric Cheng /* 36238275SEric Cheng * If we have a soft ring set which is doing 36248275SEric Cheng * bandwidth control, we need to decrement 36258275SEric Cheng * srs_size and count so it the SRS can have a 36268275SEric Cheng * accurate idea of what is the real data 36278275SEric Cheng * queued between SRS and its soft rings. We 36288275SEric Cheng * decrement the counters only when the packet 36298275SEric Cheng * gets processed by both SRS and the soft ring. 
36308275SEric Cheng */ 36318275SEric Cheng mutex_enter(&mac_srs->srs_lock); 36328275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 36338275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 36348275SEric Cheng mutex_exit(&mac_srs->srs_lock); 36358275SEric Cheng 36368275SEric Cheng mutex_enter(&ringp->s_ring_lock); 36378275SEric Cheng ringp->s_ring_run = NULL; 36388275SEric Cheng ringp->s_ring_state &= ~S_RING_PROC; 36398275SEric Cheng if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 36408275SEric Cheng cv_signal(&ringp->s_ring_client_cv); 36418275SEric Cheng 36428275SEric Cheng if ((ringp->s_ring_first == NULL) || 36438275SEric Cheng (ringp->s_ring_state & S_RING_BLANK)) { 36448275SEric Cheng /* 36458275SEric Cheng * We processed inline our packet and 36468275SEric Cheng * nothing new has arrived or our 36478275SEric Cheng * receiver doesn't want to receive 36488275SEric Cheng * any packets. We are done. 36498275SEric Cheng */ 36508275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36518275SEric Cheng return; 36528275SEric Cheng } 36538275SEric Cheng } else { 36548275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, 36558275SEric Cheng mp_chain, tail, cnt, sz); 36568275SEric Cheng } 36578275SEric Cheng 36588275SEric Cheng /* 36598275SEric Cheng * We are here because either we couldn't do inline 36608275SEric Cheng * processing (because something was already 36618275SEric Cheng * queued), or we had a chain of more than one 36628275SEric Cheng * packet, or something else arrived after we were 36638275SEric Cheng * done with inline processing. 
36648275SEric Cheng */ 36658275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 36668275SEric Cheng ASSERT(ringp->s_ring_first != NULL); 36678275SEric Cheng 36688275SEric Cheng ringp->s_ring_drain_func(ringp); 36698275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36708275SEric Cheng return; 36718275SEric Cheng } else { 36728275SEric Cheng /* ST_RING_WORKER_ONLY case */ 36738275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 36748275SEric Cheng mac_soft_ring_worker_wakeup(ringp); 36758275SEric Cheng mutex_exit(&ringp->s_ring_lock); 36768275SEric Cheng } 36778275SEric Cheng } 36788275SEric Cheng 36798275SEric Cheng /* 36808275SEric Cheng * TX SOFTRING RELATED FUNCTIONS 36818275SEric Cheng * 36828275SEric Cheng * These functions really belong in mac_soft_ring.c and here for 36838275SEric Cheng * a short period. 36848275SEric Cheng */ 36858275SEric Cheng 36868275SEric Cheng #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 36878275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 36888275SEric Cheng ringp->s_ring_state |= S_RING_ENQUEUED; \ 36898275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 36908275SEric Cheng } 36918275SEric Cheng 36928275SEric Cheng /* 36938275SEric Cheng * mac_tx_sring_queued 36948275SEric Cheng * 36958275SEric Cheng * When we are out of transmit descriptors and we already have a 36968275SEric Cheng * queue that exceeds hiwat (or the client called us with 36978275SEric Cheng * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 36988275SEric Cheng * soft ring pointer as the opaque cookie for the client enable 36998275SEric Cheng * flow control. 
37008275SEric Cheng */ 37018275SEric Cheng static mac_tx_cookie_t 37028275SEric Cheng mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag, 37038275SEric Cheng mblk_t **ret_mp) 37048275SEric Cheng { 37058275SEric Cheng int cnt; 37068275SEric Cheng size_t sz; 37078275SEric Cheng mblk_t *tail; 37088275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 37098275SEric Cheng mac_tx_cookie_t cookie = NULL; 37108275SEric Cheng boolean_t wakeup_worker = B_TRUE; 37118275SEric Cheng 37128275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 37138275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 37148275SEric Cheng if (flag & MAC_DROP_ON_NO_DESC) { 37158275SEric Cheng mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE); 37168275SEric Cheng /* increment freed stats */ 37178275SEric Cheng ringp->s_ring_drops += cnt; 37188275SEric Cheng cookie = (mac_tx_cookie_t)ringp; 37198275SEric Cheng } else { 37208275SEric Cheng if (ringp->s_ring_first != NULL) 37218275SEric Cheng wakeup_worker = B_FALSE; 37228275SEric Cheng 37238275SEric Cheng if (flag & MAC_TX_NO_ENQUEUE) { 37248275SEric Cheng /* 37258275SEric Cheng * If QUEUED is not set, queue the packet 37268275SEric Cheng * and let mac_tx_soft_ring_drain() set 37278275SEric Cheng * the TX_BLOCKED bit for the reasons 37288275SEric Cheng * explained above. Otherwise, return the 37298275SEric Cheng * mblks. 
37308275SEric Cheng */ 37318275SEric Cheng if (wakeup_worker) { 37328275SEric Cheng TX_SOFT_RING_ENQUEUE_CHAIN(ringp, 37338275SEric Cheng mp_chain, tail, cnt, sz); 37348275SEric Cheng } else { 37358275SEric Cheng ringp->s_ring_state |= S_RING_WAKEUP_CLIENT; 37368275SEric Cheng cookie = (mac_tx_cookie_t)ringp; 37378275SEric Cheng *ret_mp = mp_chain; 37388275SEric Cheng } 37398275SEric Cheng } else { 37408275SEric Cheng boolean_t enqueue = B_TRUE; 37418275SEric Cheng 37428275SEric Cheng if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 37438275SEric Cheng /* 37448275SEric Cheng * flow-controlled. Store ringp in cookie 37458275SEric Cheng * so that it can be returned as 37468275SEric Cheng * mac_tx_cookie_t to client 37478275SEric Cheng */ 37488275SEric Cheng ringp->s_ring_state |= S_RING_TX_HIWAT; 37498275SEric Cheng cookie = (mac_tx_cookie_t)ringp; 37508275SEric Cheng ringp->s_ring_hiwat_cnt++; 37518275SEric Cheng if (ringp->s_ring_count > 37528275SEric Cheng ringp->s_ring_tx_max_q_cnt) { 37538275SEric Cheng /* increment freed stats */ 37548275SEric Cheng ringp->s_ring_drops += cnt; 37558275SEric Cheng /* 37568275SEric Cheng * b_prev may be set to the fanout hint 37578275SEric Cheng * hence can't use freemsg directly 37588275SEric Cheng */ 37598275SEric Cheng mac_pkt_drop(NULL, NULL, 37608275SEric Cheng mp_chain, B_FALSE); 37618275SEric Cheng DTRACE_PROBE1(tx_queued_hiwat, 37628275SEric Cheng mac_soft_ring_t *, ringp); 37638275SEric Cheng enqueue = B_FALSE; 37648275SEric Cheng } 37658275SEric Cheng } 37668275SEric Cheng if (enqueue) { 37678275SEric Cheng TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, 37688275SEric Cheng tail, cnt, sz); 37698275SEric Cheng } 37708275SEric Cheng } 37718275SEric Cheng if (wakeup_worker) 37728275SEric Cheng cv_signal(&ringp->s_ring_async); 37738275SEric Cheng } 37748275SEric Cheng return (cookie); 37758275SEric Cheng } 37768275SEric Cheng 37778275SEric Cheng 37788275SEric Cheng /* 37798275SEric Cheng * mac_tx_soft_ring_process 
37808275SEric Cheng * 37818275SEric Cheng * This routine is called when fanning out outgoing traffic among 37828275SEric Cheng * multipe Tx rings. 37838275SEric Cheng * Note that a soft ring is associated with a h/w Tx ring. 37848275SEric Cheng */ 37858275SEric Cheng mac_tx_cookie_t 37868275SEric Cheng mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain, 37878275SEric Cheng uint16_t flag, mblk_t **ret_mp) 37888275SEric Cheng { 37898275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 37908275SEric Cheng int cnt; 37918275SEric Cheng size_t sz; 37928275SEric Cheng mblk_t *tail; 37938275SEric Cheng mac_tx_cookie_t cookie = NULL; 37948275SEric Cheng 37958275SEric Cheng ASSERT(ringp != NULL); 37968275SEric Cheng ASSERT(mp_chain != NULL); 37978275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 37988275SEric Cheng /* 37998275SEric Cheng * Only two modes can come here; either it can be 38008275SEric Cheng * SRS_TX_BW_FANOUT or SRS_TX_FANOUT 38018275SEric Cheng */ 38028275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT || 38038275SEric Cheng mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT); 38048275SEric Cheng 38058275SEric Cheng if (ringp->s_ring_type & ST_RING_WORKER_ONLY) { 38068275SEric Cheng /* Serialization mode */ 38078275SEric Cheng 38088275SEric Cheng mutex_enter(&ringp->s_ring_lock); 38098275SEric Cheng if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) { 38108275SEric Cheng cookie = mac_tx_sring_enqueue(ringp, mp_chain, 38118275SEric Cheng flag, ret_mp); 38128275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38138275SEric Cheng return (cookie); 38148275SEric Cheng } 38158275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 38168275SEric Cheng TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 38178275SEric Cheng if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) { 38188275SEric Cheng /* 38198275SEric Cheng * If ring is blocked due to lack of Tx 38208275SEric Cheng * descs, just return. 
Worker thread 38218275SEric Cheng * will get scheduled when Tx desc's 38228275SEric Cheng * become available. 38238275SEric Cheng */ 38248275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38258275SEric Cheng return (cookie); 38268275SEric Cheng } 38278275SEric Cheng mac_soft_ring_worker_wakeup(ringp); 38288275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38298275SEric Cheng return (cookie); 38308275SEric Cheng } else { 38318275SEric Cheng /* Default fanout mode */ 38328275SEric Cheng /* 38338275SEric Cheng * S_RING_BLOCKED is set when underlying NIC runs 38348275SEric Cheng * out of Tx descs and messages start getting 38358275SEric Cheng * queued. It won't get reset until 38368275SEric Cheng * tx_srs_drain() completely drains out the 38378275SEric Cheng * messages. 38388275SEric Cheng */ 38398275SEric Cheng boolean_t is_subflow; 38408275SEric Cheng mac_tx_stats_t stats; 38418275SEric Cheng 38428275SEric Cheng if (ringp->s_ring_state & S_RING_ENQUEUED) { 38438275SEric Cheng /* Tx descs/resources not available */ 38448275SEric Cheng mutex_enter(&ringp->s_ring_lock); 38458275SEric Cheng if (ringp->s_ring_state & S_RING_ENQUEUED) { 38468275SEric Cheng cookie = mac_tx_sring_enqueue(ringp, mp_chain, 38478275SEric Cheng flag, ret_mp); 38488275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38498275SEric Cheng return (cookie); 38508275SEric Cheng } 38518275SEric Cheng /* 38528275SEric Cheng * While we were computing mblk count, the 38538275SEric Cheng * flow control condition got relieved. 38548275SEric Cheng * Continue with the transmission. 38558275SEric Cheng */ 38568275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38578275SEric Cheng } 38588275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 38598275SEric Cheng 38608275SEric Cheng mp_chain = mac_tx_send(ringp->s_ring_tx_arg1, 38618275SEric Cheng ringp->s_ring_tx_arg2, mp_chain, 38628275SEric Cheng (is_subflow ? 
&stats : NULL)); 38638275SEric Cheng 38648275SEric Cheng /* 38658275SEric Cheng * Multiple threads could be here sending packets. 38668275SEric Cheng * Under such conditions, it is not possible to 38678275SEric Cheng * automically set S_RING_BLOCKED bit to indicate 38688275SEric Cheng * out of tx desc condition. To atomically set 38698275SEric Cheng * this, we queue the returned packet and do 38708275SEric Cheng * the setting of S_RING_BLOCKED in 38718275SEric Cheng * mac_tx_soft_ring_drain(). 38728275SEric Cheng */ 38738275SEric Cheng if (mp_chain != NULL) { 38748275SEric Cheng mutex_enter(&ringp->s_ring_lock); 38758275SEric Cheng cookie = 38768275SEric Cheng mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp); 38778275SEric Cheng mutex_exit(&ringp->s_ring_lock); 38788275SEric Cheng return (cookie); 38798275SEric Cheng } 38808275SEric Cheng if (is_subflow) { 38818275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 38828275SEric Cheng } 38838275SEric Cheng return (NULL); 38848275SEric Cheng } 38858275SEric Cheng } 3886