/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/types.h>
#include <sys/callb.h>
#include <sys/sdt.h>
#include <sys/strsubr.h>
#include <sys/strsun.h>
#include <sys/vlan.h>
#include <inet/ipsec_impl.h>
#include <inet/ip_impl.h>
#include <inet/sadb.h>
#include <inet/ipsecesp.h>
#include <inet/ipsecah.h>
#include <inet/ip6.h>

#include <sys/mac_impl.h>
#include <sys/mac_client_impl.h>
#include <sys/mac_client_priv.h>
#include <sys/mac_soft_ring.h>
#include <sys/mac_flow_impl.h>

static mac_tx_cookie_t mac_tx_single_ring_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_serializer_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_fanout_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);
static mac_tx_cookie_t mac_tx_bw_mode(mac_soft_ring_set_t *, mblk_t *,
    uintptr_t, uint16_t, mblk_t **);

typedef struct mac_tx_mode_s {
        mac_tx_srs_mode_t       mac_tx_mode;
        mac_tx_func_t           mac_tx_func;
} mac_tx_mode_t;

/*
 * There are five modes of operation on the Tx side. These modes get set
 * in mac_tx_srs_setup(). Except for the experimental TX_SERIALIZE mode,
 * none of the other modes are user configurable. They get selected by
 * the system depending upon whether the link (or flow) has multiple Tx
 * rings or a bandwidth configured, etc.
 */
mac_tx_mode_t mac_tx_mode_list[] = {
        {SRS_TX_DEFAULT,        mac_tx_single_ring_mode},
        {SRS_TX_SERIALIZE,      mac_tx_serializer_mode},
        {SRS_TX_FANOUT,         mac_tx_fanout_mode},
        {SRS_TX_BW,             mac_tx_bw_mode},
        {SRS_TX_BW_FANOUT,      mac_tx_bw_mode}
};

/*
 * Soft Ring Set (SRS) - The run time code that deals with
 * dynamic polling from the hardware, bandwidth enforcement,
 * fanout etc.
 *
 * We try to use H/W classification on the NIC and assign traffic for
 * a MAC address to a particular Rx ring or ring group. There is a
 * 1-1 mapping between an SRS and an Rx ring. The SRS dynamically
 * switches the underlying Rx ring between interrupt and
 * polling mode and enforces any specified B/W control.
 *
 * There is always an SRS created and tied to each H/W and S/W rule.
 * Whenever we create a H/W rule, we always add the same rule to the
 * S/W classifier and tie an SRS to it.
 *
 * In case a B/W control is specified, it is broken into bytes
 * per tick and as soon as the quota for a tick is exhausted,
 * the underlying Rx ring is forced into poll mode for the remainder
 * of the tick. The SRS poll thread only polls for bytes that are
 * allowed to come into the SRS. We typically let 4x the configured
 * B/W worth of packets come into the SRS (to prevent unnecessary
 * drops due to bursts) but only process the specified amount.
 *
 * A MAC client (e.g. a VNIC or aggr) can have 1 or more
 * Rx rings (and corresponding SRSs) assigned to it. The SRS
 * in turn can have softrings to do protocol level fanout or
 * softrings to do S/W based fanout or both. In case the NIC
 * has no Rx rings, we do S/W classification to the respective SRS.
 * The S/W classification rule is always set up and ready. This
 * allows the MAC layer to reassign Rx rings whenever needed
 * while packets still continue to flow via the default path and
 * get S/W classified to the correct SRS.
 *
 * The SRSs are used on both the Tx and Rx sides. They use the same
 * data structure but the processing routines have slightly different
 * semantics due to the fact that the Rx side needs to do dynamic
 * polling etc.
 *
 * Dynamic Polling Notes
 * =====================
 *
 * Each Soft ring set is capable of switching its Rx ring between
 * interrupt and poll mode and actively 'polls' for packets in
 * poll mode. If the SRS is implementing a B/W limit, it makes
 * sure that only Max allowed packets are pulled in poll mode
 * and goes to poll mode as soon as the B/W limit is exceeded. As
 * such, there are no overheads to implement B/W limits.
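 *
 * As an illustrative example of the per-tick math (the hz value is
 * an assumption, not mandated here): with a 10 Mbps B/W limit and
 * hz = 100, the quota is 10000000 / 8 / 100 = 12500 bytes per tick,
 * so the SRS would let up to about 4 * 12500 = 50000 bytes queue up
 * to absorb bursts while still processing only 12500 bytes per tick.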
 *
 * In poll mode, it's better to keep the pipeline going where the
 * SRS worker thread keeps processing packets and the poll thread
 * keeps bringing more packets (especially if they get to run
 * on different CPUs). This also prevents the overheads associated
 * with excessive signalling (on NUMA machines, this can be
 * pretty devastating). The exception is the latency optimized case
 * where the worker thread does no work and the interrupt and poll
 * threads are allowed to do their own drain.
 *
 * We use the following policy to control Dynamic Polling:
 * 1) We switch to poll mode anytime the processing
 *    thread causes a backlog to build up in the SRS and
 *    its associated Soft Rings (sr_poll_pkt_cnt > 0).
 * 2) As long as the backlog stays under the low water
 *    mark (sr_lowat), we poll the H/W for more packets.
 * 3) If the backlog (sr_poll_pkt_cnt) exceeds the low
 *    water mark, we stay in poll mode but don't poll
 *    the H/W for more packets.
 * 4) Anytime in polling mode, if we poll the H/W for
 *    packets and find nothing plus we have an existing
 *    backlog (sr_poll_pkt_cnt > 0), we stay in polling
 *    mode but don't poll the H/W for packets anymore
 *    (let the polling thread go to sleep).
 * 5) Once the backlog is relieved (packets are processed)
 *    we reenable polling (by signalling the poll thread)
 *    only when the backlog dips below sr_poll_thres.
 * 6) sr_hiwat is used exclusively when we are not
 *    polling capable and is used to decide when to
 *    drop packets so the SRS queue length doesn't grow
 *    infinitely.
 *
 * NOTE: Also see the block level comment on top of mac_soft_ring.c
 */
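/*
 * A minimal sketch (illustrative only, not the actual control flow)
 * of how the policy above reads as code; the sr_* fields are the
 * real counters, but the decision is spread across several routines
 * in this file rather than being a single function:
 *
 *	if (srs_rx->sr_poll_pkt_cnt == 0)
 *		leave the ring in interrupt mode;
 *	else if (srs_rx->sr_poll_pkt_cnt < srs_rx->sr_lowat)
 *		stay in poll mode and poll the H/W for more packets;
 *	else
 *		stay in poll mode but let the poll thread sleep;
 *
 * with polling signalled again only once the backlog dips below
 * sr_poll_thres.
 */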
/*
 * mac_latency_optimize
 *
 * Controls whether the poll thread can process the packets inline
 * or let the SRS worker thread do the processing. This applies if
 * the SRS was not being processed. For latency sensitive traffic,
 * this needs to be true to allow inline processing. For throughput
 * under load, this should be false.
 *
 * This (and other similar) tunable should be rolled into a link
 * or flow specific workload hint that can be set using dladm
 * linkprop (instead of multiple such tunables).
 */
boolean_t mac_latency_optimize = B_TRUE;

/*
 * MAC_RX_SRS_ENQUEUE_CHAIN and MAC_TX_SRS_ENQUEUE_CHAIN
 *
 * Queue an mp or chain in the soft ring set and increment the
 * local count (srs_count) for the SRS and the shared counter
 * (srs_poll_pkt_cnt - shared between the SRS and its soft rings
 * to track the total unprocessed packets for polling to work
 * correctly).
 *
 * The size (total bytes queued) counters are incremented only
 * if we are doing B/W control.
 */
#define	MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
        ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
        if ((mac_srs)->srs_last != NULL) \
                (mac_srs)->srs_last->b_next = (head); \
        else \
                (mac_srs)->srs_first = (head); \
        (mac_srs)->srs_last = (tail); \
        (mac_srs)->srs_count += (count); \
}

#define	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
        mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx; \
\
        MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
        srs_rx->sr_poll_pkt_cnt += (count); \
        ASSERT(srs_rx->sr_poll_pkt_cnt > 0); \
        if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
                (mac_srs)->srs_size += (sz); \
                mutex_enter(&(mac_srs)->srs_bw->mac_bw_lock); \
                (mac_srs)->srs_bw->mac_bw_sz += (sz); \
                mutex_exit(&(mac_srs)->srs_bw->mac_bw_lock); \
        } \
}

#define	MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz) { \
        (mac_srs)->srs_state |= SRS_ENQUEUED; \
        MAC_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz); \
        if ((mac_srs)->srs_type & SRST_BW_CONTROL) { \
                (mac_srs)->srs_size += (sz); \
                (mac_srs)->srs_bw->mac_bw_sz += (sz); \
        } \
}
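/*
 * Typical usage (illustrative; this mirrors the actual call in
 * mac_rx_srs_poll_ring() later in this file): after walking a chain
 * to compute 'count' and 'sz', the caller, holding srs_lock, does
 *
 *	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail, count, sz);
 *
 * which appends the chain and bumps both srs_count and
 * sr_poll_pkt_cnt (plus the byte counters when B/W control is on).
 */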
/*
 * Macros to turn polling on.
 */
#define	MAC_SRS_POLLING_ON(mac_srs) { \
        ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
        if (((mac_srs)->srs_state & \
            (SRS_POLLING_CAPAB|SRS_POLLING)) == SRS_POLLING_CAPAB) { \
                (mac_srs)->srs_state |= SRS_POLLING; \
                (void) mac_hwring_disable_intr((mac_ring_handle_t) \
                    (mac_srs)->srs_ring); \
                (mac_srs)->srs_rx.sr_poll_on++; \
        } \
}

#define	MAC_SRS_WORKER_POLLING_ON(mac_srs) { \
        ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
        if (((mac_srs)->srs_state & \
            (SRS_POLLING_CAPAB|SRS_WORKER|SRS_POLLING)) == \
            (SRS_POLLING_CAPAB|SRS_WORKER)) { \
                (mac_srs)->srs_state |= SRS_POLLING; \
                (void) mac_hwring_disable_intr((mac_ring_handle_t) \
                    (mac_srs)->srs_ring); \
                (mac_srs)->srs_rx.sr_worker_poll_on++; \
        } \
}

/*
 * MAC_SRS_POLL_RING
 *
 * Signal the SRS poll thread to poll the underlying H/W ring
 * provided it wasn't already polling (SRS_GET_PKTS was set).
 *
 * The poll thread gets to run only from mac_rx_srs_drain() and only
 * if the drain was being done by the worker thread.
 */
#define	MAC_SRS_POLL_RING(mac_srs) { \
        mac_srs_rx_t	*srs_rx = &(mac_srs)->srs_rx; \
\
        ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
        srs_rx->sr_poll_thr_sig++; \
        if (((mac_srs)->srs_state & \
            (SRS_POLLING_CAPAB|SRS_WORKER|SRS_GET_PKTS)) == \
            (SRS_WORKER|SRS_POLLING_CAPAB)) { \
                (mac_srs)->srs_state |= SRS_GET_PKTS; \
                cv_signal(&(mac_srs)->srs_cv); \
        } else { \
                srs_rx->sr_poll_thr_busy++; \
        } \
}

/*
 * MAC_SRS_CHECK_BW_CONTROL
 *
 * Check to see if the next tick has started so we can reset the
 * SRS_BW_ENFORCED flag and allow more packets to come into the
 * system.
 */
#define	MAC_SRS_CHECK_BW_CONTROL(mac_srs) { \
        ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
        ASSERT(((mac_srs)->srs_type & SRST_TX) || \
            MUTEX_HELD(&(mac_srs)->srs_bw->mac_bw_lock)); \
        if ((mac_srs)->srs_bw->mac_bw_curr_time != lbolt) { \
                (mac_srs)->srs_bw->mac_bw_curr_time = lbolt; \
                (mac_srs)->srs_bw->mac_bw_used = 0; \
                if ((mac_srs)->srs_bw->mac_bw_state & SRS_BW_ENFORCED) \
                        (mac_srs)->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED; \
        } \
}
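/*
 * Illustrative walk-through (assuming hz = 100, i.e. a 10 ms tick):
 * the first packet that arrives after lbolt has advanced sees
 * mac_bw_curr_time != lbolt above, so mac_bw_used is reset to 0 and
 * SRS_BW_ENFORCED is cleared; traffic then flows until the per-tick
 * quota is used up again, at which point SRS_BW_ENFORCED is set for
 * the remainder of that tick.
 */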
/*
 * MAC_SRS_WORKER_WAKEUP
 *
 * Wake up the SRS worker thread to process the queue as long as
 * no one else is processing the queue. If we are optimizing for
 * latency, we wake up the worker thread immediately or else we
 * wait mac_srs_worker_wakeup_ticks before the worker thread gets
 * woken up.
 */
int mac_srs_worker_wakeup_ticks = 0;
#define	MAC_SRS_WORKER_WAKEUP(mac_srs) { \
        ASSERT(MUTEX_HELD(&(mac_srs)->srs_lock)); \
        if (!((mac_srs)->srs_state & SRS_PROC) && \
            (mac_srs)->srs_tid == NULL) { \
                if (mac_latency_optimize || \
                    (mac_srs_worker_wakeup_ticks == 0)) \
                        cv_signal(&(mac_srs)->srs_async); \
                else \
                        (mac_srs)->srs_tid = \
                            timeout(mac_srs_fire, (mac_srs), \
                            mac_srs_worker_wakeup_ticks); \
        } \
}
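/*
 * Hypothetical tuning example (illustrative only; the /etc/system
 * module prefix is an assumption): to batch worker wakeups by one
 * clock tick instead of signalling immediately, one could set
 *
 *	set mac:mac_latency_optimize = 0
 *	set mac:mac_srs_worker_wakeup_ticks = 1
 *
 * Both are needed, since the timeout() path above is taken only when
 * mac_latency_optimize is clear and the tick count is non-zero.
 */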
#define	TX_SINGLE_RING_MODE(mac_srs) \
        ((mac_srs)->srs_tx.st_mode == SRS_TX_DEFAULT || \
            (mac_srs)->srs_tx.st_mode == SRS_TX_SERIALIZE || \
            (mac_srs)->srs_tx.st_mode == SRS_TX_BW)

#define	TX_BANDWIDTH_MODE(mac_srs) \
        ((mac_srs)->srs_tx.st_mode == SRS_TX_BW || \
            (mac_srs)->srs_tx.st_mode == SRS_TX_BW_FANOUT)

#define	TX_SRS_TO_SOFT_RING(mac_srs, head, hint) { \
        uint_t hash, indx; \
        hash = HASH_HINT(hint); \
        indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); \
        softring = mac_srs->srs_oth_soft_rings[indx]; \
        (void) (mac_tx_soft_ring_process(softring, head, 0, NULL)); \
}
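/*
 * Illustrative example: with srs_oth_ring_count == 4, a given fanout
 * hint (see HASH_HINT below, supplied by the TCP/IP stack) always
 * hashes to the same index in 0..3, so all packets of one connection
 * land on the same Tx soft ring and per-connection ordering is
 * preserved.
 */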
/*
 * MAC_TX_SRS_BLOCK
 *
 * Always called from the mac_tx_srs_drain() function. SRS_TX_BLOCKED
 * will be set only if srs_tx_woken_up is FALSE. If
 * srs_tx_woken_up is TRUE, it indicates that the wakeup arrived
 * before we grabbed srs_lock to set SRS_TX_BLOCKED. We need to
 * attempt to transmit again and not setting SRS_TX_BLOCKED does
 * that.
 */
#define	MAC_TX_SRS_BLOCK(srs, mp) { \
        ASSERT(MUTEX_HELD(&(srs)->srs_lock)); \
        if ((srs)->srs_tx.st_woken_up) { \
                (srs)->srs_tx.st_woken_up = B_FALSE; \
        } else { \
                ASSERT(!((srs)->srs_state & SRS_TX_BLOCKED)); \
                (srs)->srs_state |= SRS_TX_BLOCKED; \
                (srs)->srs_tx.st_blocked_cnt++; \
        } \
}

/*
 * MAC_TX_SRS_TEST_HIWAT
 *
 * Called before queueing a packet onto a Tx SRS to test and set
 * SRS_TX_HIWAT if srs_count exceeds srs_tx_hiwat.
 */
#define	MAC_TX_SRS_TEST_HIWAT(srs, mp, tail, cnt, sz, cookie) { \
        boolean_t enqueue = B_TRUE; \
\
        if ((srs)->srs_count > (srs)->srs_tx.st_hiwat) { \
                /* \
                 * Flow-controlled. Store srs in cookie so that it \
                 * can be returned as mac_tx_cookie_t to the client. \
                 */ \
                (srs)->srs_state |= SRS_TX_HIWAT; \
                cookie = (mac_tx_cookie_t)srs; \
                (srs)->srs_tx.st_hiwat_cnt++; \
                if ((srs)->srs_count > (srs)->srs_tx.st_max_q_cnt) { \
                        /* increment freed stats */ \
                        (srs)->srs_tx.st_drop_count += (cnt); \
                        /* \
                         * b_prev may be set to the fanout hint, \
                         * hence we can't use freemsg directly. \
                         */ \
                        mac_pkt_drop(NULL, NULL, (mp), B_FALSE); \
                        DTRACE_PROBE1(tx_queued_hiwat, \
                            mac_soft_ring_set_t *, srs); \
                        enqueue = B_FALSE; \
                } \
        } \
        if (enqueue) \
                MAC_TX_SRS_ENQUEUE_CHAIN(srs, mp, tail, cnt, sz); \
}

/* Some utility macros */
#define	MAC_SRS_BW_LOCK(srs) \
        if (!(srs->srs_type & SRST_TX)) \
                mutex_enter(&srs->srs_bw->mac_bw_lock);

#define	MAC_SRS_BW_UNLOCK(srs) \
        if (!(srs->srs_type & SRST_TX)) \
                mutex_exit(&srs->srs_bw->mac_bw_lock);

#define	MAC_TX_SRS_DROP_MESSAGE(srs, mp, cookie) { \
        mac_pkt_drop(NULL, NULL, mp, B_FALSE); \
        /* increment freed stats */ \
        (srs)->srs_tx.st_drop_count++; \
        cookie = (mac_tx_cookie_t)srs; \
}

#define	MAC_TX_SET_NO_ENQUEUE(srs, mp_chain, ret_mp, cookie) { \
        (srs)->srs_state |= SRS_TX_WAKEUP_CLIENT; \
        cookie = (mac_tx_cookie_t)srs; \
        *ret_mp = mp_chain; \
}

/*
 * Drop the rx packet and advance to the next one in the chain.
 */
static void
mac_rx_drop_pkt(mac_soft_ring_set_t *srs, mblk_t *mp)
{
        mac_srs_rx_t	*srs_rx = &srs->srs_rx;

        ASSERT(mp->b_next == NULL);
        mutex_enter(&srs->srs_lock);
        MAC_UPDATE_SRS_COUNT_LOCKED(srs, 1);
        MAC_UPDATE_SRS_SIZE_LOCKED(srs, msgdsize(mp));
        mutex_exit(&srs->srs_lock);

        srs_rx->sr_drop_count++;
        freemsg(mp);
}

/* DATAPATH RUNTIME ROUTINES */

/*
 * mac_srs_fire
 *
 * Timer callback routine for waking up the SRS worker thread.
 */
static void
mac_srs_fire(void *arg)
{
        mac_soft_ring_set_t *mac_srs = (mac_soft_ring_set_t *)arg;

        mutex_enter(&mac_srs->srs_lock);
        if (mac_srs->srs_tid == 0) {
                mutex_exit(&mac_srs->srs_lock);
                return;
        }

        mac_srs->srs_tid = 0;
        if (!(mac_srs->srs_state & SRS_PROC))
                cv_signal(&mac_srs->srs_async);

        mutex_exit(&mac_srs->srs_lock);
}

/*
 * 'hint' is the fanout_hint (of type uint64_t) which is given by the
 * TCP/IP stack, and it is used on the TX path.
 */
#define	HASH_HINT(hint)	(((hint) << 17) | ((hint) >> 16))

/*
 * Hash based on the src address and the port information.
 */
#define	HASH_ADDR(src, ports) \
        (ntohl((src)) ^ ((ports) >> 24) ^ ((ports) >> 16) ^ \
        ((ports) >> 8) ^ (ports))

#define	COMPUTE_INDEX(key, sz)	(key % sz)
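/*
 * Illustrative example: HASH_ADDR() folds the host-order source
 * address together with the 32 bits of port data shifted by 24, 16
 * and 8 bits, and COMPUTE_INDEX() reduces the result modulo the soft
 * ring count, so packets of the same connection consistently map to
 * the same soft ring index.
 */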
#define	FANOUT_ENQUEUE_MP(head, tail, cnt, bw_ctl, sz, sz0, mp) { \
        if ((tail) != NULL) { \
                ASSERT((tail)->b_next == NULL); \
                (tail)->b_next = (mp); \
        } else { \
                ASSERT((head) == NULL); \
                (head) = (mp); \
        } \
        (tail) = (mp); \
        (cnt)++; \
        if ((bw_ctl)) \
                (sz) += (sz0); \
}

#define	MAC_FANOUT_DEFAULT	0
#define	MAC_FANOUT_RND_ROBIN	1
int mac_fanout_type = MAC_FANOUT_DEFAULT;

#define	MAX_SR_TYPES	3
/* fanout types for port based hashing */
enum pkt_type {
        V4_TCP = 0,
        V4_UDP,
        OTH,
        UNDEF
};

/*
 * In general we do port based hashing to spread traffic over different
 * softrings. The below tunable allows us to override that behavior.
 * Setting it to B_TRUE allows us to do a fanout based on the src ipv6
 * address. This behavior is also applicable to ipv6 packets carrying
 * multiple optional headers and other uncommon packet types.
 */
boolean_t mac_src_ipv6_fanout = B_FALSE;
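/*
 * Hypothetical usage (illustrative only; the /etc/system module
 * prefix is an assumption): a workload whose IPv6 flows mostly carry
 * extension headers could force source-address based fanout for all
 * IPv6 traffic with
 *
 *	set mac:mac_src_ipv6_fanout = 1
 */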
/*
 * Pair of local and remote ports in the transport header
 */
#define	PORTS_SIZE 4

/*
 * mac_rx_srs_proto_fanout
 *
 * This routine delivers packets destined to an SRS into one of the
 * protocol soft rings.
 *
 * Given a chain of packets we need to split it up into multiple sub
 * chains, destined into the TCP, UDP or OTH soft ring. Instead of
 * entering the soft ring one packet at a time, we want to enter it in
 * the form of a chain, otherwise we get this start/stop behaviour
 * where the worker thread goes to sleep and then the next packet comes
 * in forcing it to wake up etc.
 */
static void
mac_rx_srs_proto_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
        struct ether_header		*ehp;
        uint16_t			etype;
        ipha_t				*ipha;
        mac_soft_ring_t			*softring;
        size_t				ether_hlen;
        mblk_t				*mp;
        mblk_t				*headmp[MAX_SR_TYPES];
        mblk_t				*tailmp[MAX_SR_TYPES];
        int				cnt[MAX_SR_TYPES];
        size_t				sz[MAX_SR_TYPES];
        size_t				sz1;
        boolean_t			bw_ctl = B_FALSE;
        boolean_t			hw_classified;
        boolean_t			dls_bypass = B_TRUE;
        enum pkt_type			type;
        mac_client_impl_t		*mcip = mac_srs->srs_mcip;
        struct ether_vlan_header	*evhp;

        if (mac_srs->srs_type & SRST_BW_CONTROL)
                bw_ctl = B_TRUE;

        /*
         * If we don't have an Rx ring, S/W classification would have
         * done its job and it's a packet meant for us. If we were
         * polling on the default ring (i.e. there was a ring assigned
         * to this SRS), then we need to make sure that the mac address
         * really belongs to us.
         */
        hw_classified = mac_srs->srs_ring != NULL &&
            mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

        /*
         * Special clients (eg. VLAN, non ether, etc) need DLS
         * processing in the Rx path. SRST_DLS_BYPASS will be clear for
         * such SRSs.
         */
        if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
                dls_bypass = B_FALSE;

        bzero(headmp, MAX_SR_TYPES * sizeof (mblk_t *));
        bzero(tailmp, MAX_SR_TYPES * sizeof (mblk_t *));
        bzero(cnt, MAX_SR_TYPES * sizeof (int));
        bzero(sz, MAX_SR_TYPES * sizeof (size_t));

        /*
         * We got a chain from the SRS that we need to send to the soft
         * rings. Since squeues for TCP & IPv4 sap poll their soft rings
         * (for performance reasons), we need to separate out v4_tcp and
         * v4_udp, and the rest goes in other.
         */
        while (head != NULL) {
                mp = head;
                head = head->b_next;
                mp->b_next = NULL;

                type = OTH;
                sz1 = msgdsize(mp);

                if (!dls_bypass) {
                        mac_impl_t	*mip = mcip->mci_mip;

                        ehp = (struct ether_header *)mp->b_rptr;

                        /*
                         * For VLAN packets, if the VLAN id doesn't belong
                         * to this client, we drop the packet.
                         */
                        if (mip->mi_info.mi_nativemedia == DL_ETHER &&
                            ntohs(ehp->ether_type) == VLAN_TPID) {
                                /*
                                 * LINTED: cast may result in improper
                                 * alignment
                                 */
                                evhp = (struct ether_vlan_header *)ehp;
                                if (!mac_client_check_flow_vid(mcip,
                                    VLAN_ID(ntohs(evhp->ether_tci)))) {
                                        mac_rx_drop_pkt(mac_srs, mp);
                                        continue;
                                }
                        }
                        FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
                            cnt[type], bw_ctl, sz[type], sz1, mp);
                        continue;
                }

                /*
                 * At this point we can be sure the packet at least
                 * has an ether header.
                 */
                if (sz1 < sizeof (struct ether_header)) {
                        mac_rx_drop_pkt(mac_srs, mp);
                        continue;
                }
                /* LINTED: cast may result in improper alignment */
                ehp = (struct ether_header *)mp->b_rptr;

                /*
                 * Determine if this is a VLAN or non-VLAN packet.
                 */
                if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
                        /* LINTED: cast may result in improper alignment */
                        evhp = (struct ether_vlan_header *)mp->b_rptr;
                        etype = ntohs(evhp->ether_type);
                        ether_hlen = sizeof (struct ether_vlan_header);
                        /*
                         * Check if the VID of the packet, if any, belongs
                         * to this client.
                         */
                        if (!mac_client_check_flow_vid(mcip,
                            VLAN_ID(ntohs(evhp->ether_tci)))) {
                                mac_rx_drop_pkt(mac_srs, mp);
                                continue;
                        }
                } else {
                        ether_hlen = sizeof (struct ether_header);
                }
                if (etype == ETHERTYPE_IP) {
                        /*
                         * If we are H/W classified, but we have promisc
                         * on, then we need to check for the unicast
                         * address.
                         */
                        if (hw_classified && mcip->mci_promisc_list != NULL) {
                                mac_address_t		*map;

                                rw_enter(&mcip->mci_rw_lock, RW_READER);
                                map = mcip->mci_unicast;
                                if (bcmp(&ehp->ether_dhost, map->ma_addr,
                                    map->ma_len) == 0)
                                        type = UNDEF;
                                rw_exit(&mcip->mci_rw_lock);
                        } else if (((((uint8_t *)&ehp->ether_dhost)[0] &
                            0x01) == 0)) {
                                type = UNDEF;
                        }
                }

                /*
                 * This needs to become a contract with the driver for
                 * the fast path.
                 *
                 * In the normal case the packet will have at least the L2
                 * header and the IP + Transport header in the same mblk.
                 * This is usually the case when the NIC driver sends up
                 * the packet. This is also true when the stack generates
                 * a packet that is looped back and when the stack uses the
                 * fastpath mechanism. The normal case is optimized for
                 * performance and may bypass DLS. All other cases go through
                 * the 'OTH' type path without DLS bypass.
                 */

                /* LINTED: cast may result in improper alignment */
                ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
                if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha))
                        type = OTH;

                if (type == OTH) {
                        FANOUT_ENQUEUE_MP(headmp[type], tailmp[type],
                            cnt[type], bw_ctl, sz[type], sz1, mp);
                        continue;
                }

                ASSERT(type == UNDEF);
                /*
                 * We look for at least 4 bytes past the IP header to get
                 * the port information. If we get an IP fragment, we don't
                 * have the port information, and we use just the protocol
                 * information.
                 */
                switch (ipha->ipha_protocol) {
                case IPPROTO_TCP:
                        type = V4_TCP;
                        mp->b_rptr += ether_hlen;
                        break;
                case IPPROTO_UDP:
                        type = V4_UDP;
                        mp->b_rptr += ether_hlen;
                        break;
                default:
                        type = OTH;
                        break;
                }

                ASSERT(type != UNDEF);

                FANOUT_ENQUEUE_MP(headmp[type], tailmp[type], cnt[type],
                    bw_ctl, sz[type], sz1, mp);
        }

        for (type = V4_TCP; type < UNDEF; type++) {
                if (headmp[type] != NULL) {
                        ASSERT(tailmp[type]->b_next == NULL);
                        switch (type) {
                        case V4_TCP:
                                softring = mac_srs->srs_tcp_soft_rings[0];
                                break;
                        case V4_UDP:
                                softring = mac_srs->srs_udp_soft_rings[0];
                                break;
                        case OTH:
                                softring = mac_srs->srs_oth_soft_rings[0];
                        }
                        mac_rx_soft_ring_process(mac_srs->srs_mcip, softring,
                            headmp[type], tailmp[type], cnt[type], sz[type]);
                }
        }
}
int fanout_unalligned = 0;

/*
 * mac_rx_srs_long_fanout
 *
 * The fanout routine for IPv6
 */
static int
mac_rx_srs_long_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *mp,
    uint16_t etype, enum pkt_type *type, uint_t *indx)
{
        ip6_t		*ip6h;
        uint8_t		*whereptr;
        uint_t		hash;
        uint16_t	remlen;
        uint8_t		nexthdr;
        uint16_t	hdr_len;

        if (etype == ETHERTYPE_IPV6) {
                boolean_t	modifiable = B_TRUE;

                ASSERT(MBLKL(mp) >= sizeof (struct ether_header));

                ip6h = (ip6_t *)(mp->b_rptr + sizeof (struct ether_header));
                if ((unsigned char *)ip6h == mp->b_wptr) {
                        /*
                         * The first mblk_t only includes the ethernet header.
                         * Note that it is safe to change the mp pointer here,
                         * as the subsequent operation does not assume mp
                         * points to the start of the ethernet header.
                         */
                        mp = mp->b_cont;

                        /*
                         * Make sure ip6h holds the full ip6_t structure.
                         */
                        if (mp == NULL)
                                return (-1);

                        if (MBLKL(mp) < IPV6_HDR_LEN) {
                                modifiable = (DB_REF(mp) == 1);

                                if (modifiable &&
                                    !pullupmsg(mp, IPV6_HDR_LEN)) {
                                        return (-1);
                                }
                        }

                        ip6h = (ip6_t *)mp->b_rptr;
                }

                if (!modifiable || !(OK_32PTR((char *)ip6h)) ||
                    ((unsigned char *)ip6h + IPV6_HDR_LEN > mp->b_wptr)) {
                        /*
                         * If either ip6h is not aligned, or ip6h does not
                         * hold the complete ip6_t structure (a pullupmsg()
                         * is not an option since it would result in an
                         * unaligned ip6h), fan out to the default ring. Note
                         * that this may cause packet reordering.
                         */
                        *indx = 0;
                        *type = OTH;
                        fanout_unalligned++;
                        return (0);
                }

                remlen = ntohs(ip6h->ip6_plen);
                nexthdr = ip6h->ip6_nxt;

                if (remlen < MIN_EHDR_LEN)
                        return (-1);
                /*
                 * Do src based fanout if the below tunable is set to B_TRUE
                 * or when mac_ip_hdr_length_v6() fails because of malformed
                 * packets or because mblks need to be concatenated using
                 * pullupmsg().
                 */
                if (mac_src_ipv6_fanout || !mac_ip_hdr_length_v6(mp, ip6h,
                    &hdr_len, &nexthdr)) {
                        goto src_based_fanout;
                }
                whereptr = (uint8_t *)ip6h + hdr_len;

                /* If the transport is one of below, we do port based fanout */
                switch (nexthdr) {
                case IPPROTO_TCP:
                case IPPROTO_UDP:
                case IPPROTO_SCTP:
                case IPPROTO_ESP:
                        /*
                         * If the ports in the transport header are not part
                         * of the mblk, do src_based_fanout instead of
                         * calling pullupmsg().
                         */
                        if (mp->b_cont != NULL &&
                            whereptr + PORTS_SIZE > mp->b_wptr) {
                                goto src_based_fanout;
                        }
                        break;
                default:
                        break;
                }

                switch (nexthdr) {
                case IPPROTO_TCP:
                        hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
                            *(uint32_t *)whereptr);
                        *indx = COMPUTE_INDEX(hash,
                            mac_srs->srs_tcp_ring_count);
                        *type = OTH;
                        break;

                case IPPROTO_UDP:
                case IPPROTO_SCTP:
                case IPPROTO_ESP:
                        if (mac_fanout_type == MAC_FANOUT_DEFAULT) {
                                hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src),
                                    *(uint32_t *)whereptr);
                                *indx = COMPUTE_INDEX(hash,
                                    mac_srs->srs_udp_ring_count);
                        } else {
                                *indx = mac_srs->srs_ind %
                                    mac_srs->srs_udp_ring_count;
                                mac_srs->srs_ind++;
                        }
                        *type = OTH;
                        break;

                /* For all other protocols, do source based fanout */
                default:
                        goto src_based_fanout;
                }
        } else {
                *indx = 0;
                *type = OTH;
        }
        return (0);

src_based_fanout:
        hash = HASH_ADDR(V4_PART_OF_V6(ip6h->ip6_src), (uint32_t)0);
        *indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count);
        *type = OTH;
        return (0);
}
/*
 * mac_rx_srs_fanout
 *
 * This routine delivers packets destined to an SRS into a soft ring
 * member of the set.
 *
 * Given a chain of packets we need to split it up into multiple sub
 * chains, destined for one of the TCP, UDP or OTH soft rings. Instead
 * of entering the soft ring one packet at a time, we want to enter it
 * in the form of a chain, otherwise we get this start/stop behaviour
 * where the worker thread goes to sleep and then the next packet comes
 * in forcing it to wake up etc.
 *
 * Note:
 * Since we know what is the maximum fanout possible, we create a 2D array
 * of 'softring types * MAX_SR_FANOUT' for the head, tail, cnt and sz
 * variables so that we can enter the softrings with a chain. We need the
 * MAX_SR_FANOUT so we can allocate the arrays on the stack (a kmem_alloc
 * for each packet would be expensive). If we ever want to have the
 * ability to have unlimited fanout, we should probably declare a head,
 * tail, cnt, sz with each soft ring (a data struct which contains a
 * softring along with these members) and create an array of this uber
 * struct so we don't have to do kmem_alloc.
 */
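/*
 * Illustrative sizing note: the four stack arrays below hold
 * MAX_SR_TYPES * MAX_SR_FANOUT entries each, i.e. a few hundred
 * pointers and counters at most, which is cheap compared to a
 * per-packet kmem_alloc().
 */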
int fanout_oth1 = 0;
int fanout_oth2 = 0;
int fanout_oth3 = 0;
int fanout_oth4 = 0;
int fanout_oth5 = 0;

static void
mac_rx_srs_fanout(mac_soft_ring_set_t *mac_srs, mblk_t *head)
{
        struct ether_header		*ehp;
        uint16_t			etype;
        ipha_t				*ipha;
        uint_t				indx;
        int				ports_offset = -1;
        int				ipha_len;
        uint_t				hash;
        mac_soft_ring_t			*softring;
        size_t				ether_hlen;
        uint16_t			frag_offset_flags;
        mblk_t				*mp;
        mblk_t				*headmp[MAX_SR_TYPES][MAX_SR_FANOUT];
        mblk_t				*tailmp[MAX_SR_TYPES][MAX_SR_FANOUT];
        int				cnt[MAX_SR_TYPES][MAX_SR_FANOUT];
        size_t				sz[MAX_SR_TYPES][MAX_SR_FANOUT];
        size_t				sz1;
        boolean_t			bw_ctl = B_FALSE;
        boolean_t			hw_classified;
        boolean_t			dls_bypass = B_TRUE;
        int				i;
        int				fanout_cnt;
        enum pkt_type			type;
        mac_client_impl_t		*mcip = mac_srs->srs_mcip;
        struct ether_vlan_header	*evhp;

        if (mac_srs->srs_type & SRST_BW_CONTROL)
                bw_ctl = B_TRUE;

        /*
         * If we don't have an Rx ring, S/W classification would have
         * done its job and it's a packet meant for us. If we were
         * polling on the default ring (i.e. there was a ring assigned
         * to this SRS), then we need to make sure that the mac address
         * really belongs to us.
         */
        hw_classified = mac_srs->srs_ring != NULL &&
            mac_srs->srs_ring->mr_classify_type == MAC_HW_CLASSIFIER;

        /*
         * Special clients (eg. VLAN, non ether, etc) need DLS
         * processing in the Rx path. SRST_DLS_BYPASS will be clear for
         * such SRSs.
         */
        if (!(mac_srs->srs_type & SRST_DLS_BYPASS))
                dls_bypass = B_FALSE;

        /*
         * Since the softrings are never destroyed and we always
         * create an equal number of softrings for TCP, UDP and the
         * rest, it's OK to check one of them for the count and use it
         * without any lock. In the future, if soft rings get destroyed
         * because of a reduction in fanout, we will need to ensure
         * that happens behind the SRS_PROC.
         */
        fanout_cnt = mac_srs->srs_tcp_ring_count;

        bzero(headmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
        bzero(tailmp, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (mblk_t *));
        bzero(cnt, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (int));
        bzero(sz, MAX_SR_TYPES * MAX_SR_FANOUT * sizeof (size_t));
        /*
         * We got a chain from the SRS that we need to send to the soft
         * rings. Since squeues for TCP & IPv4 sap poll their soft rings
         * (for performance reasons), we need to separate out v4_tcp and
         * v4_udp, and the rest goes in other.
         */
        while (head != NULL) {
                mp = head;
                head = head->b_next;
                mp->b_next = NULL;

                type = OTH;
                sz1 = msgdsize(mp);

                if (!dls_bypass) {
                        mac_impl_t	*mip = mcip->mci_mip;

                        indx = 0;
                        if (mip->mi_info.mi_nativemedia == DL_ETHER) {
                                ehp = (struct ether_header *)mp->b_rptr;
                                etype = ntohs(ehp->ether_type);
                                /*
                                 * For VLAN packets, if the VLAN id doesn't
                                 * belong to this client, we drop the packet.
                                 */
                                if (etype == VLAN_TPID) {
                                        /*
                                         * LINTED: cast may result in improper
                                         * alignment
                                         */
                                        evhp = (struct ether_vlan_header *)
                                            mp->b_rptr;
                                        if (!mac_client_check_flow_vid(mcip,
                                            VLAN_ID(ntohs(evhp->ether_tci)))) {
                                                mac_rx_drop_pkt(mac_srs, mp);
                                                continue;
                                        }
                                }
                                if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
                                    &type, &indx) == -1) {
                                        mac_rx_drop_pkt(mac_srs, mp);
                                        continue;
                                }
                        }

                        FANOUT_ENQUEUE_MP(headmp[type][indx],
                            tailmp[type][indx], cnt[type][indx], bw_ctl,
                            sz[type][indx], sz1, mp);
                        continue;
                }

                /*
                 * At this point we can be sure the packet at least
                 * has an ether header. On the outbound side, GLD/stack
                 * ensure this. On the inbound side, the driver needs
                 * to ensure this.
                 */
                if (sz1 < sizeof (struct ether_header)) {
                        mac_rx_drop_pkt(mac_srs, mp);
                        continue;
                }
                /* LINTED: cast may result in improper alignment */
                ehp = (struct ether_header *)mp->b_rptr;

                /*
                 * Determine if this is a VLAN or non-VLAN packet.
                 */
                if ((etype = ntohs(ehp->ether_type)) == VLAN_TPID) {
                        /* LINTED: cast may result in improper alignment */
                        evhp = (struct ether_vlan_header *)mp->b_rptr;
                        etype = ntohs(evhp->ether_type);
                        ether_hlen = sizeof (struct ether_vlan_header);
                        /*
                         * Check if the VID of the packet, if any, belongs
                         * to this client.
                         */
                        if (!mac_client_check_flow_vid(mcip,
                            VLAN_ID(ntohs(evhp->ether_tci)))) {
                                mac_rx_drop_pkt(mac_srs, mp);
                                continue;
                        }
                } else {
                        ether_hlen = sizeof (struct ether_header);
                }
                /*
                 * If we are using the default Rx ring where H/W or S/W
                 * classification has not happened, we need to verify if
                 * this unicast packet really belongs to us.
                 */
                if (etype == ETHERTYPE_IP) {
                        /*
                         * If we are H/W classified, but we have promisc
                         * on, then we need to check for the unicast
                         * address.
                         */
                        if (hw_classified && mcip->mci_promisc_list != NULL) {
                                mac_address_t		*map;

                                rw_enter(&mcip->mci_rw_lock, RW_READER);
                                map = mcip->mci_unicast;
                                if (bcmp(&ehp->ether_dhost, map->ma_addr,
                                    map->ma_len) == 0)
                                        type = UNDEF;
                                rw_exit(&mcip->mci_rw_lock);
                        } else if (((((uint8_t *)&ehp->ether_dhost)[0] &
                            0x01) == 0)) {
                                type = UNDEF;
                        }
                }

                /*
                 * This needs to become a contract with the driver for
                 * the fast path.
                 */

                /* LINTED: cast may result in improper alignment */
                ipha = (ipha_t *)(mp->b_rptr + ether_hlen);
                if ((type != OTH) && MBLK_RX_FANOUT_SLOWPATH(mp, ipha)) {
                        type = OTH;
                        fanout_oth1++;
                }

                if (type != OTH) {
                        switch (ipha->ipha_protocol) {
                        case IPPROTO_TCP:
                        case IPPROTO_UDP:
                        case IPPROTO_SCTP:
                        case IPPROTO_ESP:
                                ipha_len = IPH_HDR_LENGTH(ipha);
                                if ((uchar_t *)ipha + ipha_len + PORTS_SIZE >
                                    mp->b_wptr) {
                                        type = OTH;
                                        break;
                                }
                                frag_offset_flags =
                                    ntohs(ipha->ipha_fragment_offset_and_flags);
                                if ((frag_offset_flags &
                                    (IPH_MF | IPH_OFFSET)) != 0) {
                                        type = OTH;
                                        fanout_oth3++;
                                        break;
                                }
                                ports_offset = ether_hlen + ipha_len;
                                break;
                        default:
                                type = OTH;
                                fanout_oth4++;
                                break;
                        }
                }

                if (type == OTH) {
                        if (mac_rx_srs_long_fanout(mac_srs, mp, etype,
                            &type, &indx) == -1) {
                                mac_rx_drop_pkt(mac_srs, mp);
                                continue;
                        }

                        FANOUT_ENQUEUE_MP(headmp[type][indx],
                            tailmp[type][indx], cnt[type][indx], bw_ctl,
                            sz[type][indx], sz1, mp);
                        continue;
                }
bw_ctl, 1124*8275SEric Cheng sz[type][indx], sz1, mp); 1125*8275SEric Cheng continue; 1126*8275SEric Cheng } 1127*8275SEric Cheng 1128*8275SEric Cheng ASSERT(type == UNDEF); 1129*8275SEric Cheng 1130*8275SEric Cheng /* 1131*8275SEric Cheng * XXX-Sunay: We should hold srs_lock since ring_count 1132*8275SEric Cheng * below can change. But if we are always called from 1133*8275SEric Cheng * mac_rx_srs_drain and SRS_PROC is set, then we can 1134*8275SEric Cheng * enforce that ring_count can't be changed i.e. 1135*8275SEric Cheng * to change fanout type or ring count, the calling 1136*8275SEric Cheng * thread needs to be behind SRS_PROC. 1137*8275SEric Cheng */ 1138*8275SEric Cheng switch (ipha->ipha_protocol) { 1139*8275SEric Cheng case IPPROTO_TCP: 1140*8275SEric Cheng /* 1141*8275SEric Cheng * Note that for ESP, we fanout on SPI and it is at the 1142*8275SEric Cheng * same offset as the 2x16-bit ports. So it is clumped 1143*8275SEric Cheng * along with TCP, UDP and SCTP. 1144*8275SEric Cheng */ 1145*8275SEric Cheng hash = HASH_ADDR(ipha->ipha_src, 1146*8275SEric Cheng *(uint32_t *)(mp->b_rptr + ports_offset)); 1147*8275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_tcp_ring_count); 1148*8275SEric Cheng type = V4_TCP; 1149*8275SEric Cheng mp->b_rptr += ether_hlen; 1150*8275SEric Cheng break; 1151*8275SEric Cheng case IPPROTO_UDP: 1152*8275SEric Cheng case IPPROTO_SCTP: 1153*8275SEric Cheng case IPPROTO_ESP: 1154*8275SEric Cheng if (mac_fanout_type == MAC_FANOUT_DEFAULT) { 1155*8275SEric Cheng hash = HASH_ADDR(ipha->ipha_src, 1156*8275SEric Cheng *(uint32_t *)(mp->b_rptr + ports_offset)); 1157*8275SEric Cheng indx = COMPUTE_INDEX(hash, 1158*8275SEric Cheng mac_srs->srs_udp_ring_count); 1159*8275SEric Cheng } else { 1160*8275SEric Cheng indx = mac_srs->srs_ind % 1161*8275SEric Cheng mac_srs->srs_udp_ring_count; 1162*8275SEric Cheng mac_srs->srs_ind++; 1163*8275SEric Cheng } 1164*8275SEric Cheng type = V4_UDP; 1165*8275SEric Cheng mp->b_rptr += ether_hlen; 1166*8275SEric Cheng break; 1167*8275SEric Cheng } 1168*8275SEric Cheng 1169*8275SEric Cheng ASSERT(type != UNDEF); 1170*8275SEric Cheng 1171*8275SEric Cheng FANOUT_ENQUEUE_MP(headmp[type][indx], tailmp[type][indx], 1172*8275SEric Cheng cnt[type][indx], bw_ctl, sz[type][indx], sz1, mp); 1173*8275SEric Cheng } 1174*8275SEric Cheng 1175*8275SEric Cheng for (type = V4_TCP; type < UNDEF; type++) { 1176*8275SEric Cheng for (i = 0; i < fanout_cnt; i++) { 1177*8275SEric Cheng if (headmp[type][i] != NULL) { 1178*8275SEric Cheng ASSERT(tailmp[type][i]->b_next == NULL); 1179*8275SEric Cheng switch (type) { 1180*8275SEric Cheng case V4_TCP: 1181*8275SEric Cheng softring = 1182*8275SEric Cheng mac_srs->srs_tcp_soft_rings[i]; 1183*8275SEric Cheng break; 1184*8275SEric Cheng case V4_UDP: 1185*8275SEric Cheng softring = 1186*8275SEric Cheng mac_srs->srs_udp_soft_rings[i]; 1187*8275SEric Cheng break; 1188*8275SEric Cheng case OTH: 1189*8275SEric Cheng softring = 1190*8275SEric Cheng mac_srs->srs_oth_soft_rings[i]; 1191*8275SEric Cheng break; 1192*8275SEric Cheng } 1193*8275SEric Cheng mac_rx_soft_ring_process(mac_srs->srs_mcip, 1194*8275SEric Cheng softring, headmp[type][i], tailmp[type][i], 1195*8275SEric Cheng cnt[type][i], sz[type][i]); 1196*8275SEric Cheng } 1197*8275SEric Cheng } 1198*8275SEric Cheng } 1199*8275SEric Cheng } 1200*8275SEric Cheng 1201*8275SEric Cheng #define SRS_BYTES_TO_PICKUP 150000 1202*8275SEric Cheng ssize_t max_bytes_to_pickup = SRS_BYTES_TO_PICKUP; 1203*8275SEric Cheng 1204*8275SEric Cheng /* 1205*8275SEric Cheng * 
/*
 * mac_rx_srs_poll_ring
 *
 * The SRS poll thread uses this routine to poll the underlying hardware
 * Rx ring to get a chain of packets. It can inline process that chain
 * if mac_latency_optimize is set (default) or signal the SRS worker thread
 * to do the remaining processing.
 *
 * Since packets come into the system via the interrupt or poll path, we also
 * update the stats and deal with promiscuous clients here.
 */
void
mac_rx_srs_poll_ring(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t		*lock = &mac_srs->srs_lock;
	kcondvar_t		*async = &mac_srs->srs_cv;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;
	mblk_t			*head, *tail, *mp;
	callb_cpr_t		cprinfo;
	ssize_t			bytes_to_pickup;
	size_t			sz;
	int			count;
	mac_client_impl_t	*smcip;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "mac_srs_poll");
	mutex_enter(lock);

start:
	for (;;) {
		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

		CALLB_CPR_SAFE_BEGIN(&cprinfo);
		cv_wait(async, lock);
		CALLB_CPR_SAFE_END(&cprinfo, lock);

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;

check_again:
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			/*
			 * We pick as many bytes as we are allowed to queue.
			 * It's possible that we will exceed the total
			 * packets queued in case this SRS is part of the
			 * Rx ring group since > 1 poll thread can be pulling
			 * up to the max allowed packets at the same time
			 * but that should be OK.
			 */
			mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
			bytes_to_pickup =
			    mac_srs->srs_bw->mac_bw_drop_threshold -
			    mac_srs->srs_bw->mac_bw_sz;
			/*
			 * We shouldn't have been signalled if we
			 * have 0 or less bytes to pick but since
			 * some of the bytes accounting is driver
			 * dependent, we do the safety check.
			 */
			if (bytes_to_pickup < 0)
				bytes_to_pickup = 0;
			mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		} else {
			/*
			 * TODO: Need to change the polling API
			 * to add a packet count and a flag which
			 * tells the driver whether we want packets
			 * based on a count, or bytes, or all the
			 * packets queued in the driver/HW. This
			 * way, we never have to check the limits
			 * on the poll path. We truly let only as many
			 * packets enter the system as we are willing
			 * to process or queue.
			 *
			 * Something along the lines of
			 * pkts_to_pickup = mac_soft_ring_max_q_cnt -
			 *	mac_srs->srs_poll_pkt_cnt
			 */

			/*
			 * Since we are not doing B/W control, pick
			 * as many packets as allowed.
			 */
			bytes_to_pickup = max_bytes_to_pickup;
		}

		/* Poll the underlying Hardware */
		mutex_exit(lock);
		head = MAC_HWRING_POLL(mac_srs->srs_ring, (int)bytes_to_pickup);
		mutex_enter(lock);

		ASSERT((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER);

		mp = tail = head;
		count = 0;
		sz = 0;
		while (mp != NULL) {
			tail = mp;
			sz += msgdsize(mp);
			mp = mp->b_next;
			count++;
		}

		if (head != NULL) {
			tail->b_next = NULL;
			smcip = mac_srs->srs_mcip;

			if ((mac_srs->srs_type & SRST_FLOW) ||
			    (smcip == NULL)) {
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    rbytes, sz);
				FLOW_STAT_UPDATE(mac_srs->srs_flent,
				    ipackets, count);
			}

			/*
			 * If there are any promiscuous mode callbacks
			 * defined for this MAC client, pass them a copy
			 * if appropriate and also update the counters.
			 */
			if (smcip != NULL) {
				smcip->mci_stat_ibytes += sz;
				smcip->mci_stat_ipackets += count;

				if (smcip->mci_mip->mi_promisc_list != NULL) {
					mutex_exit(lock);
					mac_promisc_dispatch(smcip->mci_mip,
					    head, NULL);
					mutex_enter(lock);
				}
			}
			if (mac_srs->srs_type & SRST_BW_CONTROL) {
				mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
				mac_srs->srs_bw->mac_bw_polled += sz;
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
			}
			srs_rx->sr_poll_count += count;
			MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, head, tail,
			    count, sz);
			if (count <= 10)
				srs_rx->sr_chain_cnt_undr10++;
			else if (count > 10 && count <= 50)
				srs_rx->sr_chain_cnt_10to50++;
			else
				srs_rx->sr_chain_cnt_over50++;
		}

		/*
		 * We are guaranteed that SRS_PROC will be set if we
		 * are here. Also, the poll thread gets to run only if
		 * the drain was being done by a worker thread, although
		 * it's possible that the worker thread is still running
		 * and the poll thread was sent down to keep the pipeline
		 * going instead of doing a complete drain and then
		 * trying to poll the NIC.
		 *
		 * So we need to check the SRS_WORKER flag to make sure
		 * that the worker thread is not processing the queue
		 * in parallel to us. The flags and conditions are
		 * protected by the srs_lock to prevent any race. We
		 * ensure that we don't drop the srs_lock from now
		 * till the end and similarly we don't drop the srs_lock
		 * in mac_rx_srs_drain() till similar condition checks
		 * are complete. mac_rx_srs_drain() needs to ensure
		 * that the SRS_WORKER flag remains set as long as it's
		 * processing the queue.
		 */
		if (!(mac_srs->srs_state & SRS_WORKER) &&
		    (mac_srs->srs_first != NULL)) {
			/*
			 * We have packets to process and the worker thread
			 * is not running. Check to see if the poll thread is
			 * allowed to process. Let it do processing only if it
			 * picked up some packets from the NIC, otherwise
			 * wake up the worker thread.
			 */
			if ((mac_srs->srs_state & SRS_LATENCY_OPT) &&
			    (head != NULL)) {
				mac_srs->srs_drain_func(mac_srs, SRS_POLL_PROC);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat) {
					srs_rx->sr_poll_again++;
					goto check_again;
				} else {
					/*
					 * We are already above the low water
					 * mark so stay in polling mode but no
					 * need to poll. Once we dip below
					 * the polling threshold, the processing
					 * thread (soft ring) will signal us
					 * to poll again (MAC_UPDATE_SRS_COUNT)
					 */
					srs_rx->sr_poll_drain_no_poll++;
					mac_srs->srs_state &=
					    ~(SRS_PROC|SRS_GET_PKTS);
					/*
					 * In the B/W control case, it's
					 * possible that the backlog built up
					 * due to the B/W limit being reached
					 * and packets are queued only in the
					 * SRS. In this case, we should
					 * schedule the worker thread since
					 * no one else will wake us up.
					 */
					if ((mac_srs->srs_type &
					    SRST_BW_CONTROL) &&
					    (mac_srs->srs_tid == NULL)) {
						mac_srs->srs_tid =
						    timeout(mac_srs_fire,
						    mac_srs, 1);
						srs_rx->sr_poll_worker_wakeup++;
					}
				}
			} else {
				/*
				 * Wake up the worker thread for more
				 * processing. We optimize for throughput
				 * in this case.
				 */
				mac_srs->srs_state &= ~(SRS_PROC|SRS_GET_PKTS);
				MAC_SRS_WORKER_WAKEUP(mac_srs);
				srs_rx->sr_poll_sig_worker++;
			}
		} else if ((mac_srs->srs_first == NULL) &&
		    !(mac_srs->srs_state & SRS_WORKER)) {
			/*
			 * There is nothing queued in the SRS and
			 * no worker thread running. Plus we
			 * didn't get anything from the H/W
			 * as well (head == NULL).
			 */
			ASSERT(head == NULL);
			mac_srs->srs_state &=
			    ~(SRS_PROC|SRS_GET_PKTS);

			/*
			 * If we have packets in the soft ring, don't allow
			 * more packets to come into this SRS by keeping the
			 * interrupts off but not polling the H/W. The
			 * poll thread will get signaled as soon as
			 * srs_poll_pkt_cnt dips below the poll threshold.
			 */
			if (srs_rx->sr_poll_pkt_cnt == 0) {
				srs_rx->sr_poll_intr_enable++;
				MAC_SRS_POLLING_OFF(mac_srs);
			} else {
				/*
				 * We know nothing is queued in the SRS
				 * since we are here after checking that
				 * srs_first is NULL. The backlog
				 * is entirely due to packets queued
				 * in the soft ring which will wake us up
				 * and get the interface out of polling
				 * mode once the backlog dips below
				 * sr_poll_thres.
				 */
				srs_rx->sr_poll_no_poll++;
			}
		} else {
			/*
			 * Worker thread is already running.
			 * Nothing much to do. If the polling
			 * was enabled, the worker thread will deal
			 * with that.
			 */
			mac_srs->srs_state &= ~SRS_GET_PKTS;
			srs_rx->sr_poll_goto_sleep++;
		}
	}
done:
	mac_srs->srs_state |= SRS_POLL_THR_QUIESCED;
	cv_signal(&mac_srs->srs_async);
	/*
	 * If this is a temporary quiesce then wait for the restart signal
	 * from the srs worker. Then clear the flags and signal the srs worker
	 * to ensure a positive handshake and go back to start.
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_POLL_THR_RESTART)))
		cv_wait(async, lock);
	if (mac_srs->srs_state & SRS_POLL_THR_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs->srs_state &=
		    ~(SRS_POLL_THR_QUIESCED | SRS_POLL_THR_RESTART);
		cv_signal(&mac_srs->srs_async);
		goto start;
	} else {
		mac_srs->srs_state |= SRS_POLL_THR_EXITED;
		cv_signal(&mac_srs->srs_async);
		CALLB_CPR_EXIT(&cprinfo);
		thread_exit();
	}
}
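/*
 * Illustration only (not compiled): a minimal user-level sketch of the
 * poll-budget computation used above when bandwidth control is enabled.
 * The field names mirror the mac_bw_ctl fields but the struct and the
 * helper are simplified assumptions.
 */
#if 0
#include <sys/types.h>

struct example_bw {
	ssize_t	bw_drop_threshold;	/* max bytes we allow to queue */
	ssize_t	bw_sz;			/* bytes currently queued */
};

/*
 * Only pull in as many bytes as the drop threshold leaves room for;
 * a driver may over- or under-account, so clamp at zero.
 */
static ssize_t
example_poll_budget(const struct example_bw *bw)
{
	ssize_t budget = bw->bw_drop_threshold - bw->bw_sz;

	return (budget < 0 ? 0 : budget);
}
#endif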
/*
 * mac_srs_pick_chain
 *
 * In the bandwidth control case, checks how many packets can be processed
 * and returns them in a sub chain.
 */
static mblk_t *
mac_srs_pick_chain(mac_soft_ring_set_t *mac_srs, mblk_t **chain_tail,
    size_t *chain_sz, int *chain_cnt)
{
	mblk_t			*head = NULL;
	mblk_t			*tail = NULL;
	size_t			sz;
	size_t			tsz = 0;
	int			cnt = 0;
	mblk_t			*mp;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (((mac_srs->srs_bw->mac_bw_used + mac_srs->srs_size) <=
	    mac_srs->srs_bw->mac_bw_limit) ||
	    (mac_srs->srs_bw->mac_bw_limit == 0)) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		head = mac_srs->srs_first;
		mac_srs->srs_first = NULL;
		*chain_tail = mac_srs->srs_last;
		mac_srs->srs_last = NULL;
		*chain_sz = mac_srs->srs_size;
		*chain_cnt = mac_srs->srs_count;
		mac_srs->srs_count = 0;
		mac_srs->srs_size = 0;
		return (head);
	}

	/*
	 * Can't clear the entire backlog.
	 * Need to find how many packets to pick.
	 */
	ASSERT(MUTEX_HELD(&mac_srs->srs_bw->mac_bw_lock));
	while ((mp = mac_srs->srs_first) != NULL) {
		sz = msgdsize(mp);
		if ((tsz + sz + mac_srs->srs_bw->mac_bw_used) >
		    mac_srs->srs_bw->mac_bw_limit) {
			if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED))
				mac_srs->srs_bw->mac_bw_state |=
				    SRS_BW_ENFORCED;
			break;
		}

		/*
		 * The srs_size and srs_count are decremented from the
		 * softrings when they send up the packet, for polling
		 * to work properly.
		 */
		tsz += sz;
		cnt++;
		mac_srs->srs_count--;
		mac_srs->srs_size -= sz;
		if (tail != NULL)
			tail->b_next = mp;
		else
			head = mp;
		tail = mp;
		mac_srs->srs_first = mac_srs->srs_first->b_next;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_first == NULL)
		mac_srs->srs_last = NULL;

	if (tail != NULL)
		tail->b_next = NULL;
	*chain_tail = tail;
	*chain_cnt = cnt;
	*chain_sz = tsz;

	return (head);
}
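/*
 * Illustration only (not compiled): a minimal user-level sketch of the
 * sub-chain split done by mac_srs_pick_chain() above. The mblk chain is
 * replaced by a simple singly linked list; 'budget' stands in for
 * mac_bw_limit minus mac_bw_used.
 */
#if 0
#include <stddef.h>

struct example_pkt {
	struct example_pkt	*next;
	size_t			len;
};

/*
 * Detach and return the longest prefix of 'list' whose total byte
 * count fits in 'budget'; '*rest' gets whatever is left over.
 */
static struct example_pkt *
example_pick_chain(struct example_pkt *list, size_t budget,
    struct example_pkt **rest)
{
	struct example_pkt *head = list, *tail = NULL;
	size_t used = 0;

	while (list != NULL && used + list->len <= budget) {
		used += list->len;
		tail = list;
		list = list->next;
	}
	*rest = list;
	if (tail != NULL)
		tail->next = NULL;	/* terminate the picked prefix */
	return (tail != NULL ? head : NULL);
}
#endif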
/*
 * mac_rx_srs_drain
 *
 * The SRS drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on the processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in bandwidth control mode,
 * mac_rx_srs_drain_bw. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain_bw as well.
 */
void
mac_rx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(!(mac_srs->srs_type & SRST_BW_CONTROL));
again:
	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto out;
	}

	if (mac_srs->srs_first == NULL)
		goto out;

	head = mac_srs->srs_first;
	mac_srs->srs_first = NULL;
	tail = mac_srs->srs_last;
	mac_srs->srs_last = NULL;
	cnt = mac_srs->srs_count;
	mac_srs->srs_count = 0;

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);

	/* Switch to polling mode */
	MAC_SRS_WORKER_POLLING_ON(mac_srs);
	if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
		MAC_SRS_POLL_RING(mac_srs);
	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if the SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resource constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx ring is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Send the poll thread to pick up any packets arrived
	 * so far. This also serves as the last check in case
	 * nothing else is queued in the SRS. The poll thread
	 * is signalled only in the case the drain was done
	 * by the worker thread and SRS_WORKER is set. The
	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. If we have nothing else to
	 * process, we can exit while leaving SRS_PROC set
	 * which gives the poll thread control to process and
	 * cleanup once it returns from the NIC.
	 *
	 * If we have nothing else to process, we need to
	 * ensure that we keep holding the srs_lock till
	 * all the checks below are done and control is
	 * handed to the poll thread if it was running.
	 */
	if (mac_srs->srs_first != NULL) {
		if (proc_type == SRS_WORKER) {
			if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
				MAC_SRS_POLL_RING(mac_srs);
			srs_rx->sr_drain_again++;
			goto again;
		} else {
			srs_rx->sr_drain_worker_sig++;
			cv_signal(&mac_srs->srs_async);
		}
	}

out:

	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave
		 * SRS_PROC set and hand over control to the
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		srs_rx->sr_drain_poll_running++;
		return;
	}

	/*
	 * Even if there are no packets queued in the SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog.
	 *
	 * As long as there are packets queued either
	 * in the soft ring set or its soft rings, we will leave
	 * the interface in polling mode (even if the drain
	 * was done by the interrupt thread). We signal
	 * the poll thread as well if we have dipped below
	 * the low water mark.
	 *
	 * NOTE: We can't use the MAC_SRS_POLLING_ON macro
	 * since that turns polling on only for the worker thread.
	 * It's not worth turning polling on for the interrupt
	 * thread (since the NIC will not issue another interrupt)
	 * unless a backlog builds up.
	 */
	if ((srs_rx->sr_poll_pkt_cnt > 0) &&
	    (mac_srs->srs_state & SRS_POLLING_CAPAB)) {
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		srs_rx->sr_drain_keep_polling++;
		MAC_SRS_POLLING_ON(mac_srs);
		if (srs_rx->sr_poll_pkt_cnt <= srs_rx->sr_lowat)
			MAC_SRS_POLL_RING(mac_srs);
		return;
	}

	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
	srs_rx->sr_drain_finish_intr++;
}
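/*
 * Illustration only (not compiled): a minimal user-level sketch of the
 * dispatch decision both drain routines make. 'no_soft_rings' and the
 * callbacks are simplified stand-ins for the SRST_NO_SOFT_RINGS type
 * flag and the client/fanout entry points.
 */
#if 0
#include <stdbool.h>

struct example_chain;

typedef void (*example_deliver_fn)(struct example_chain *);

/*
 * Fully classified traffic with no soft rings goes straight to the
 * client in one call; everything else is handed to a fanout routine
 * that spreads the chain across soft rings.
 */
static void
example_drain_dispatch(struct example_chain *chain, bool no_soft_rings,
    example_deliver_fn direct_rx, example_deliver_fn fanout)
{
	if (no_soft_rings)
		direct_rx(chain);	/* one-shot delivery to the client */
	else
		fanout(chain);		/* spread across soft rings */
}
#endif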
/*
 * mac_rx_srs_drain_bw
 *
 * The SRS B/W drain routine. Gets to run to clear the queue. Any thread
 * (worker, interrupt, poll) can call this based on the processing model.
 * The first thing we do is disable interrupts if possible and then
 * drain the queue. We also try to poll the underlying hardware if
 * there is a dedicated hardware Rx ring assigned to this SRS.
 *
 * There is an equivalent drain routine in non bandwidth control mode,
 * mac_rx_srs_drain. There is some code duplication between the two
 * routines but they are highly performance sensitive and are easier
 * to read/debug if they stay separate. Any code changes here might
 * also apply to mac_rx_srs_drain as well.
 */
void
mac_rx_srs_drain_bw(mac_soft_ring_set_t *mac_srs, uint_t proc_type)
{
	mblk_t			*head;
	mblk_t			*tail;
	timeout_id_t		tid;
	size_t			sz = 0;
	int			cnt = 0;
	mac_client_impl_t	*mcip = mac_srs->srs_mcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	ASSERT(MUTEX_HELD(&mac_srs->srs_lock));
	ASSERT(mac_srs->srs_type & SRST_BW_CONTROL);
again:
	/* Check if we are doing B/W control */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) {
		mac_srs->srs_bw->mac_bw_curr_time = lbolt;
		mac_srs->srs_bw->mac_bw_used = 0;
		if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
			mac_srs->srs_bw->mac_bw_state &= ~SRS_BW_ENFORCED;
	} else if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	} else if (mac_srs->srs_bw->mac_bw_used >
	    mac_srs->srs_bw->mac_bw_limit) {
		mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

	/* If we are blanked i.e. can't do upcalls, then we are done */
	if (mac_srs->srs_state & (SRS_BLANK | SRS_PAUSE)) {
		ASSERT((mac_srs->srs_type & SRST_NO_SOFT_RINGS) ||
		    (mac_srs->srs_state & SRS_PAUSE));
		goto done;
	}

	sz = 0;
	cnt = 0;
	if ((head = mac_srs_pick_chain(mac_srs, &tail, &sz, &cnt)) == NULL) {
		/*
		 * We couldn't pick up a single packet.
		 */
		mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
		if ((mac_srs->srs_bw->mac_bw_used == 0) &&
		    (mac_srs->srs_size != 0) &&
		    !(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
			/*
			 * Seems like the configured B/W doesn't
			 * even allow processing of 1 packet
			 * per tick.
			 *
			 * XXX: raise the limit to processing
			 * at least 1 packet per tick.
			 */
			mac_srs->srs_bw->mac_bw_limit +=
			    mac_srs->srs_bw->mac_bw_limit;
			mac_srs->srs_bw->mac_bw_drop_threshold +=
			    mac_srs->srs_bw->mac_bw_drop_threshold;
			cmn_err(CE_NOTE, "mac_rx_srs_drain: srs(%p) "
			    "raised B/W limit to %d since not even a "
			    "single packet can be processed per "
			    "tick %d\n", (void *)mac_srs,
			    (int)mac_srs->srs_bw->mac_bw_limit,
			    (int)msgdsize(mac_srs->srs_first));
		}
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		goto done;
	}

	ASSERT(head != NULL);
	ASSERT(tail != NULL);

	/* zero bandwidth: drop all and return to interrupt mode */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (mac_srs->srs_bw->mac_bw_limit == 0) {
		srs_rx->sr_drop_count += cnt;
		ASSERT(mac_srs->srs_bw->mac_bw_sz >= sz);
		mac_srs->srs_bw->mac_bw_sz -= sz;
		mac_srs->srs_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		mac_pkt_drop(NULL, NULL, head, B_FALSE);
		goto leave_poll;
	} else {
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
	}

	/*
	 * We can continue processing the queue.
	 * We need to figure out if there is a fanout needed or
	 * we can just process this here.
	 */

	if ((tid = mac_srs->srs_tid) != 0)
		mac_srs->srs_tid = 0;

	mac_srs->srs_state |= (SRS_PROC|proc_type);
	MAC_SRS_WORKER_POLLING_ON(mac_srs);

	/*
	 * mcip is NULL for broadcast and multicast flows. The promisc
	 * callbacks for broadcast and multicast packets are delivered from
	 * mac_rx() and we don't need to worry about that case in this path
	 */
	if (mcip != NULL && mcip->mci_promisc_list != NULL) {
		mutex_exit(&mac_srs->srs_lock);
		mac_promisc_client_dispatch(mcip, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Check if the SRS itself is doing the processing.
	 * This direct path does not apply when subflows are present. In this
	 * case, packets need to be dispatched to a soft ring according to the
	 * flow's bandwidth and other resource constraints.
	 */
	if (mac_srs->srs_type & SRST_NO_SOFT_RINGS) {
		mac_direct_rx_t		proc;
		void			*arg1;
		mac_resource_handle_t	arg2;

		/*
		 * This is the case when a Rx ring is directly
		 * assigned and we have a fully classified
		 * protocol chain. We can deal with it in
		 * one shot.
		 */
		proc = srs_rx->sr_func;
		arg1 = srs_rx->sr_arg1;
		arg2 = srs_rx->sr_arg2;

		mac_srs->srs_state |= SRS_CLIENT_PROC;
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		proc(arg1, arg2, head, NULL);
		/*
		 * Decrement the size and count here itself
		 * since the packet has been processed.
		 */
		mutex_enter(&mac_srs->srs_lock);
		MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt);
		MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz);

		if (mac_srs->srs_state & SRS_CLIENT_WAIT)
			cv_signal(&mac_srs->srs_client_cv);
		mac_srs->srs_state &= ~SRS_CLIENT_PROC;
	} else {
		/* Some kind of softrings based fanout is required */
		mutex_exit(&mac_srs->srs_lock);
		if (tid != 0) {
			(void) untimeout(tid);
			tid = 0;
		}

		/*
		 * Since the fanout routines can deal with chains,
		 * shoot the entire chain up.
		 */
		if (mac_srs->srs_type & SRST_FANOUT_SRC_IP)
			mac_rx_srs_fanout(mac_srs, head);
		else
			mac_rx_srs_proto_fanout(mac_srs, head);
		mutex_enter(&mac_srs->srs_lock);
	}

	/*
	 * Send the poll thread to pick up any packets arrived
	 * so far. This also serves as the last check in case
	 * nothing else is queued in the SRS. The poll thread
	 * is signalled only in the case the drain was done
	 * by the worker thread and SRS_WORKER is set. The
	 * worker thread can run in parallel as long as the
	 * SRS_WORKER flag is set. If we have nothing else to
	 * process, we can exit while leaving SRS_PROC set
	 * which gives the poll thread control to process and
	 * cleanup once it returns from the NIC.
	 *
	 * If we have nothing else to process, we need to
	 * ensure that we keep holding the srs_lock till
	 * all the checks below are done and control is
	 * handed to the poll thread if it was running.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if (!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) {
		if (mac_srs->srs_first != NULL) {
			if (proc_type == SRS_WORKER) {
				mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
				if (srs_rx->sr_poll_pkt_cnt <=
				    srs_rx->sr_lowat)
					MAC_SRS_POLL_RING(mac_srs);
				goto again;
			} else {
				cv_signal(&mac_srs->srs_async);
			}
		}
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

done:

	if (mac_srs->srs_state & SRS_GET_PKTS) {
		/*
		 * Poll thread is already running. Leave
		 * SRS_PROC set and hand over control to the
		 * poll thread.
		 */
		mac_srs->srs_state &= ~proc_type;
		return;
	}

	/*
	 * If we can't process packets because we have exceeded
	 * the B/W limit for this tick, just set the timeout
	 * and leave.
	 *
	 * Even if there are no packets queued in the SRS, we
	 * need to make sure that the shared counter is
	 * clear and any associated softrings have cleared
	 * all the backlog. Otherwise, leave the interface
	 * in polling mode and the poll thread will get
	 * signalled once the count goes down to zero.
	 *
	 * If someone is already draining the queue (SRS_PROC is
	 * set) when the srs_poll_pkt_cnt goes down to zero,
	 * then it means that drain is already running and we
	 * will turn off polling at that time if there is
	 * no backlog. As long as there are packets queued either
	 * in the soft ring set or its soft rings, we will leave
	 * the interface in polling mode.
	 */
	mutex_enter(&mac_srs->srs_bw->mac_bw_lock);
	if ((mac_srs->srs_state & SRS_POLLING_CAPAB) &&
	    ((mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED) ||
	    (srs_rx->sr_poll_pkt_cnt > 0))) {
		MAC_SRS_POLLING_ON(mac_srs);
		mac_srs->srs_state &= ~(SRS_PROC|proc_type);
		if ((mac_srs->srs_first != NULL) &&
		    (mac_srs->srs_tid == NULL))
			mac_srs->srs_tid = timeout(mac_srs_fire,
			    mac_srs, 1);
		mutex_exit(&mac_srs->srs_bw->mac_bw_lock);
		return;
	}
	mutex_exit(&mac_srs->srs_bw->mac_bw_lock);

leave_poll:
	/* Nothing else to do. Get out of poll mode */
	MAC_SRS_POLLING_OFF(mac_srs);
	mac_srs->srs_state &= ~(SRS_PROC|proc_type);
}

/*
 * mac_srs_worker
 *
 * The SRS worker routine. Drains the queue when no one else is
 * processing it.
 */
void
mac_srs_worker(mac_soft_ring_set_t *mac_srs)
{
	kmutex_t	*lock = &mac_srs->srs_lock;
	kcondvar_t	*async = &mac_srs->srs_async;
	callb_cpr_t	cprinfo;
	boolean_t	bw_ctl_flag;

	CALLB_CPR_INIT(&cprinfo, lock, callb_generic_cpr, "srs_worker");
	mutex_enter(lock);

start:
	for (;;) {
		bw_ctl_flag = B_FALSE;
		if (mac_srs->srs_type & SRST_BW_CONTROL) {
			MAC_SRS_BW_LOCK(mac_srs);
			MAC_SRS_CHECK_BW_CONTROL(mac_srs);
			if (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)
				bw_ctl_flag = B_TRUE;
			MAC_SRS_BW_UNLOCK(mac_srs);
		}
		/*
		 * The SRS_BW_ENFORCED flag may change since we have dropped
		 * the mac_bw_lock. However the drain function can handle both
		 * a drainable SRS or a bandwidth controlled SRS, and the
		 * effect of scheduling a timeout is to wake up the worker
		 * thread which in turn will call the drain function. Since
		 * we release the srs_lock atomically only in the cv_wait there
		 * isn't a fear of waiting forever.
		 */
		while (((mac_srs->srs_state & SRS_PROC) ||
		    (mac_srs->srs_first == NULL) || bw_ctl_flag ||
		    (mac_srs->srs_state & SRS_TX_BLOCKED)) &&
		    !(mac_srs->srs_state & SRS_PAUSE)) {
			/*
			 * If we have packets queued and we are here
			 * because B/W control is in place, we better
			 * schedule the worker wakeup after 1 tick
			 * to see if bandwidth control can be relaxed.
			 */
			if (bw_ctl_flag && mac_srs->srs_tid == NULL) {
				/*
				 * We need to ensure that a timer is already
				 * scheduled or we force schedule one for
				 * later so that we can continue processing
				 * after this quanta is over.
				 */
				mac_srs->srs_tid = timeout(mac_srs_fire,
				    mac_srs, 1);
			}
wait:
			CALLB_CPR_SAFE_BEGIN(&cprinfo);
			cv_wait(async, lock);
			CALLB_CPR_SAFE_END(&cprinfo, lock);

			if (mac_srs->srs_state & SRS_PAUSE)
				goto done;
			if (mac_srs->srs_state & SRS_PROC)
				goto wait;

			if (mac_srs->srs_first != NULL &&
			    mac_srs->srs_type & SRST_BW_CONTROL) {
				MAC_SRS_BW_LOCK(mac_srs);
				if (mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED) {
					MAC_SRS_CHECK_BW_CONTROL(mac_srs);
				}
				bw_ctl_flag = mac_srs->srs_bw->mac_bw_state &
				    SRS_BW_ENFORCED;
				MAC_SRS_BW_UNLOCK(mac_srs);
			}
		}

		if (mac_srs->srs_state & SRS_PAUSE)
			goto done;
		mac_srs->srs_drain_func(mac_srs, SRS_WORKER);
	}
done:
	/*
	 * The Rx SRS quiesce logic first cuts off packet supply to the SRS
	 * from both hard and soft classifications and waits for such threads
	 * to finish before signaling the worker. So at this point the only
	 * thread left that could be competing with the worker is the poll
	 * thread. In the case of Tx, there shouldn't be any thread holding
	 * SRS_PROC at this point.
	 */
	if (!(mac_srs->srs_state & SRS_PROC)) {
		mac_srs->srs_state |= SRS_PROC;
	} else {
		ASSERT((mac_srs->srs_type & SRST_TX) == 0);
		/*
		 * Poll thread still owns the SRS and is still running
		 */
		ASSERT((mac_srs->srs_poll_thr == NULL) ||
		    ((mac_srs->srs_state & SRS_POLL_THR_OWNER) ==
		    SRS_POLL_THR_OWNER));
	}
	mac_srs_worker_quiesce(mac_srs);
	/*
	 * Wait for the SRS_RESTART or SRS_CONDEMNED signal from the initiator
	 * of the quiesce operation
	 */
	while (!(mac_srs->srs_state & (SRS_CONDEMNED | SRS_RESTART)))
		cv_wait(&mac_srs->srs_async, &mac_srs->srs_lock);

	if (mac_srs->srs_state & SRS_RESTART) {
		ASSERT(!(mac_srs->srs_state & SRS_CONDEMNED));
		mac_srs_worker_restart(mac_srs);
		mac_srs->srs_state &= ~SRS_PROC;
		goto start;
	}

	if (!(mac_srs->srs_state & SRS_CONDEMNED_DONE))
		mac_srs_worker_quiesce(mac_srs);

	mac_srs->srs_state &= ~SRS_PROC;
	/* The macro drops the srs_lock */
	CALLB_CPR_EXIT(&cprinfo);
	thread_exit();
}
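/*
 * Illustration only (not compiled): a minimal user-level sketch of the
 * worker wait loop above, using POSIX threads instead of kernel
 * cv_wait/kmutex. The predicate is reduced to "someone is processing
 * or there is nothing to drain".
 */
#if 0
#include <pthread.h>
#include <stdbool.h>

struct example_srs {
	pthread_mutex_t	lock;
	pthread_cond_t	async;
	bool		proc;		/* someone else is draining */
	bool		has_work;	/* queue is non-empty */
};

/* Sleep until this thread may drain, then mark the SRS busy. */
static void
example_worker_wait(struct example_srs *srs)
{
	pthread_mutex_lock(&srs->lock);
	while (srs->proc || !srs->has_work)
		pthread_cond_wait(&srs->async, &srs->lock);
	srs->proc = true;	/* we own the drain now */
	pthread_mutex_unlock(&srs->lock);
}
#endif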
/*
 * mac_rx_srs_subflow_process
 *
 * Receive side routine called from the interrupt path when there are
 * sub flows present on this SRS.
 */
/* ARGSUSED */
void
mac_rx_srs_subflow_process(void *arg, mac_resource_handle_t srs,
    mblk_t *mp_chain, boolean_t loopback)
{
	flow_entry_t		*flent = NULL;
	flow_entry_t		*prev_flent = NULL;
	mblk_t			*mp = NULL;
	mblk_t			*tail = NULL;
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mac_client_impl_t	*mcip;

	mcip = mac_srs->srs_mcip;
	ASSERT(mcip != NULL);

	/*
	 * We need to determine the SRS for every packet
	 * by walking the flow table; if we don't get any,
	 * then we proceed using the SRS we came with.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {

		/*
		 * We will increment the stats for the matching subflow
		 * when we get the bytes/pkt count for the classified packets
		 * later in mac_rx_srs_process.
		 */
		(void) mac_flow_lookup(mcip->mci_subflow_tab, mp,
		    FLOW_INBOUND, &flent);

		if (mp == mp_chain || flent == prev_flent) {
			if (prev_flent != NULL)
				FLOW_REFRELE(prev_flent);
			prev_flent = flent;
			flent = NULL;
			tail = mp;
			mp = mp->b_next;
			continue;
		}
		tail->b_next = NULL;
		/*
		 * A NULL flent indicates this is for the mac_srs itself.
		 * XXX-venu: probably assert for fe_rx_srs_cnt == 0.
		 */
		if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
			mac_rx_srs_process(arg,
			    (mac_resource_handle_t)mac_srs, mp_chain,
			    loopback);
		} else {
			(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
			    prev_flent->fe_cb_arg2, mp_chain, loopback);
			FLOW_REFRELE(prev_flent);
		}
		prev_flent = flent;
		flent = NULL;
		mp_chain = mp;
		tail = mp;
		mp = mp->b_next;
	}
	/* Last chain */
	ASSERT(mp_chain != NULL);
	if (prev_flent == NULL || prev_flent->fe_rx_srs_cnt == 0) {
		mac_rx_srs_process(arg,
		    (mac_resource_handle_t)mac_srs, mp_chain, loopback);
	} else {
		(prev_flent->fe_cb_fn)(prev_flent->fe_cb_arg1,
		    prev_flent->fe_cb_arg2, mp_chain, loopback);
		FLOW_REFRELE(prev_flent);
	}
}
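/*
 * Illustration only (not compiled): a minimal user-level sketch of the
 * run batching done by mac_rx_srs_subflow_process() above. Consecutive
 * packets that classify to the same flow are delivered as one chain
 * rather than one call per packet. The struct and 'deliver' callback
 * are simplified stand-ins for mblk chains, mac_flow_lookup() and the
 * flow callbacks.
 */
#if 0
#include <stddef.h>

struct example_pkt {
	struct example_pkt	*next;
	int			flow;	/* classification result */
};

typedef void (*example_deliver_fn)(int flow, struct example_pkt *run);

/* Split the chain into maximal same-flow runs and deliver each run. */
static void
example_batch_by_flow(struct example_pkt *chain, example_deliver_fn deliver)
{
	while (chain != NULL) {
		struct example_pkt *run = chain, *tail = chain;

		while (tail->next != NULL && tail->next->flow == run->flow)
			tail = tail->next;
		chain = tail->next;
		tail->next = NULL;	/* terminate the run */
		deliver(run->flow, run);
	}
}
#endif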
/*
 * mac_rx_srs_process
 *
 * Receive side routine called from the interrupt path.
 *
 * loopback is set to force a context switch on the loopback
 * path between MAC clients.
 */
/* ARGSUSED */
void
mac_rx_srs_process(void *arg, mac_resource_handle_t srs, mblk_t *mp_chain,
    boolean_t loopback)
{
	mac_soft_ring_set_t	*mac_srs = (mac_soft_ring_set_t *)srs;
	mblk_t			*mp, *tail, *head;
	int			count = 0;
	int			count1;
	size_t			sz = 0;
	size_t			chain_sz, sz1;
	mac_bw_ctl_t		*mac_bw;
	mac_client_impl_t	*smcip;
	mac_srs_rx_t		*srs_rx = &mac_srs->srs_rx;

	/*
	 * Set the tail, count and sz. We set the sz irrespective
	 * of whether we are doing B/W control or not for the
	 * purpose of updating the stats.
	 */
	mp = tail = mp_chain;
	while (mp != NULL) {
		tail = mp;
		count++;
		sz += msgdsize(mp);
		mp = mp->b_next;
	}

	mutex_enter(&mac_srs->srs_lock);
	smcip = mac_srs->srs_mcip;

	if (mac_srs->srs_type & SRST_FLOW || smcip == NULL) {
		FLOW_STAT_UPDATE(mac_srs->srs_flent, rbytes, sz);
		FLOW_STAT_UPDATE(mac_srs->srs_flent, ipackets, count);
	}
	if (smcip != NULL) {
		smcip->mci_stat_ibytes += sz;
		smcip->mci_stat_ipackets += count;
	}

	/*
	 * If the SRS is already being processed; has been blanked;
	 * can be processed by the worker thread only; or the B/W limit
	 * has been reached, then queue the chain and check if the
	 * worker thread needs to be awakened.
	 */
	if (mac_srs->srs_type & SRST_BW_CONTROL) {
		mac_bw = mac_srs->srs_bw;
		ASSERT(mac_bw != NULL);
		mutex_enter(&mac_bw->mac_bw_lock);
		/* Count the packets and bytes via interrupt */
		srs_rx->sr_intr_count += count;
		mac_bw->mac_bw_intr += sz;
		if (mac_bw->mac_bw_limit == 0) {
			/* zero bandwidth: drop all */
			srs_rx->sr_drop_count += count;
			mac_bw->mac_bw_drop_bytes += sz;
			mutex_exit(&mac_bw->mac_bw_lock);
			mutex_exit(&mac_srs->srs_lock);
			mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
			return;
		} else {
			if ((mac_bw->mac_bw_sz + sz) <=
			    mac_bw->mac_bw_drop_threshold) {
				mutex_exit(&mac_bw->mac_bw_lock);
				MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain,
				    tail, count, sz);
			} else {
				mp = mp_chain;
				chain_sz = 0;
				count1 = 0;
				tail = NULL;
				head = NULL;
				while (mp != NULL) {
					sz1 = msgdsize(mp);
					if (mac_bw->mac_bw_sz + chain_sz + sz1 >
					    mac_bw->mac_bw_drop_threshold)
						break;
					chain_sz += sz1;
					count1++;
					tail = mp;
					mp = mp->b_next;
				}
				mutex_exit(&mac_bw->mac_bw_lock);
				if (tail != NULL) {
					head = tail->b_next;
					tail->b_next = NULL;
					MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs,
					    mp_chain, tail, count1, chain_sz);
					sz -= chain_sz;
					count -= count1;
				} else {
					/* Can't pick up any */
					head = mp_chain;
				}
				if (head != NULL) {
					/* Drop any packet over the threshold */
					srs_rx->sr_drop_count += count;
					mutex_enter(&mac_bw->mac_bw_lock);
					mac_bw->mac_bw_drop_bytes += sz;
					mutex_exit(&mac_bw->mac_bw_lock);
					freemsgchain(head);
				}
			}
			MAC_SRS_WORKER_WAKEUP(mac_srs);
			mutex_exit(&mac_srs->srs_lock);
			return;
		}
	}

	/*
	 * If the total number of packets queued in the SRS and
	 * its associated soft rings exceeds the max allowed,
	 * then drop the chain. If we are polling capable, this
	 * shouldn't be happening.
	 */
	if (!(mac_srs->srs_type & SRST_BW_CONTROL) &&
	    (srs_rx->sr_poll_pkt_cnt > srs_rx->sr_hiwat)) {
		mac_bw = mac_srs->srs_bw;
		srs_rx->sr_drop_count += count;
		mutex_enter(&mac_bw->mac_bw_lock);
		mac_bw->mac_bw_drop_bytes += sz;
		mutex_exit(&mac_bw->mac_bw_lock);
		freemsgchain(mp_chain);
		mutex_exit(&mac_srs->srs_lock);
		return;
	}

	MAC_RX_SRS_ENQUEUE_CHAIN(mac_srs, mp_chain, tail, count, sz);
	/* Count the packets entering via interrupt path */
	srs_rx->sr_intr_count += count;

	if (!(mac_srs->srs_state & SRS_PROC)) {
		/*
		 * If we are coming via loopback or if we are not
		 * optimizing for latency, we should signal the
		 * worker thread.
		 */
		if (loopback || ((count > 1) &&
		    !(mac_srs->srs_state & SRS_LATENCY_OPT))) {
			/*
			 * For loopback, we need to let the worker take
			 * over as we don't want to continue in the same
			 * thread even if we can. This could lead to stack
			 * overflows and may also end up using
			 * resources (cpu) incorrectly.
			 */
			cv_signal(&mac_srs->srs_async);
		} else {
			/*
			 * Seems like no one is processing the SRS and
			 * there is no backlog. We also inline process
			 * our packet if it's a single packet in the non
			 * latency optimized case (in the latency optimized
			 * case, we inline process chains of any size).
			 */
			mac_srs->srs_drain_func(mac_srs, SRS_PROC_FAST);
		}
	}
	mutex_exit(&mac_srs->srs_lock);
}
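/*
 * Illustration only (not compiled): a minimal user-level sketch of the
 * inline-vs-worker decision mac_rx_srs_process() makes above. The
 * booleans are simplified stand-ins for the loopback argument and the
 * SRS_LATENCY_OPT state bit.
 */
#if 0
#include <stdbool.h>

/*
 * Returns true when the chain should be handed to the worker thread:
 * always for loopback (to avoid deep stacks), and for multi-packet
 * chains when we are optimizing for throughput rather than latency.
 */
static bool
example_signal_worker(bool loopback, int count, bool latency_opt)
{
	return (loopback || (count > 1 && !latency_opt));
}
#endif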
2439*8275SEric Cheng */ 2440*8275SEric Cheng mac_tx_cookie_t 2441*8275SEric Cheng mac_tx_srs_no_desc(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2442*8275SEric Cheng uint16_t flag, mblk_t **ret_mp) 2443*8275SEric Cheng { 2444*8275SEric Cheng mac_tx_cookie_t cookie = NULL; 2445*8275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2446*8275SEric Cheng boolean_t wakeup_worker = B_TRUE; 2447*8275SEric Cheng uint32_t tx_mode = srs_tx->st_mode; 2448*8275SEric Cheng int cnt, sz; 2449*8275SEric Cheng mblk_t *tail; 2450*8275SEric Cheng 2451*8275SEric Cheng ASSERT(tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_BW); 2452*8275SEric Cheng if (flag & MAC_DROP_ON_NO_DESC) { 2453*8275SEric Cheng MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2454*8275SEric Cheng } else { 2455*8275SEric Cheng if (mac_srs->srs_first != NULL) 2456*8275SEric Cheng wakeup_worker = B_FALSE; 2457*8275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2458*8275SEric Cheng if (flag & MAC_TX_NO_ENQUEUE) { 2459*8275SEric Cheng /* 2460*8275SEric Cheng * If TX_QUEUED is not set, queue the 2461*8275SEric Cheng * packet and let mac_tx_srs_drain() 2462*8275SEric Cheng * set the TX_BLOCKED bit for the 2463*8275SEric Cheng * reasons explained above. Otherwise, 2464*8275SEric Cheng * return the mblks. 2465*8275SEric Cheng */ 2466*8275SEric Cheng if (wakeup_worker) { 2467*8275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2468*8275SEric Cheng mp_chain, tail, cnt, sz); 2469*8275SEric Cheng } else { 2470*8275SEric Cheng MAC_TX_SET_NO_ENQUEUE(mac_srs, 2471*8275SEric Cheng mp_chain, ret_mp, cookie); 2472*8275SEric Cheng } 2473*8275SEric Cheng } else { 2474*8275SEric Cheng MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2475*8275SEric Cheng tail, cnt, sz, cookie); 2476*8275SEric Cheng } 2477*8275SEric Cheng if (wakeup_worker) 2478*8275SEric Cheng cv_signal(&mac_srs->srs_async); 2479*8275SEric Cheng } 2480*8275SEric Cheng return (cookie); 2481*8275SEric Cheng } 2482*8275SEric Cheng 2483*8275SEric Cheng /* 2484*8275SEric Cheng * mac_tx_srs_enqueue 2485*8275SEric Cheng * 2486*8275SEric Cheng * This routine is called when Tx SRS is operating in either serializer 2487*8275SEric Cheng * or bandwidth mode. In serializer mode, a packet will get enqueued 2488*8275SEric Cheng * when a thread cannot enter SRS exclusively. In bandwidth mode, 2489*8275SEric Cheng * packets get queued if the allowed byte-count limit for a tick is 2490*8275SEric Cheng * exceeded. The action that gets taken when MAC_DROP_ON_NO_DESC or 2491*8275SEric Cheng * MAC_TX_NO_ENQUEUE is set differs from when operating in either 2492*8275SEric Cheng * the default mode or fanout mode. Here packets get dropped or 2493*8275SEric Cheng * returned back to the caller only after hi-watermark worth of data 2494*8275SEric Cheng * is queued.
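 *
 * For example (illustrative numbers): with st_hiwat set to 1000
 * packets, a MAC_DROP_ON_NO_DESC sender whose chain arrives while
 * srs_count is 900 still gets queued by this routine, whereas in the
 * default or fanout mode the same chain would have been freed
 * immediately; only once srs_count exceeds 1000 does it start
 * dropping.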
2495*8275SEric Cheng */ 2496*8275SEric Cheng static mac_tx_cookie_t 2497*8275SEric Cheng mac_tx_srs_enqueue(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2498*8275SEric Cheng uint16_t flag, uintptr_t fanout_hint, mblk_t **ret_mp) 2499*8275SEric Cheng { 2500*8275SEric Cheng mac_tx_cookie_t cookie = NULL; 2501*8275SEric Cheng int cnt, sz; 2502*8275SEric Cheng mblk_t *tail; 2503*8275SEric Cheng boolean_t wakeup_worker = B_TRUE; 2504*8275SEric Cheng 2505*8275SEric Cheng if (mac_srs->srs_first != NULL) 2506*8275SEric Cheng wakeup_worker = B_FALSE; 2507*8275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2508*8275SEric Cheng if (flag & MAC_DROP_ON_NO_DESC) { 2509*8275SEric Cheng if (mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) { 2510*8275SEric Cheng MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2511*8275SEric Cheng } else { 2512*8275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2513*8275SEric Cheng mp_chain, tail, cnt, sz); 2514*8275SEric Cheng } 2515*8275SEric Cheng } else if (flag & MAC_TX_NO_ENQUEUE) { 2516*8275SEric Cheng if ((mac_srs->srs_count > mac_srs->srs_tx.st_hiwat) || 2517*8275SEric Cheng (mac_srs->srs_state & SRS_TX_WAKEUP_CLIENT)) { 2518*8275SEric Cheng MAC_TX_SET_NO_ENQUEUE(mac_srs, mp_chain, 2519*8275SEric Cheng ret_mp, cookie); 2520*8275SEric Cheng } else { 2521*8275SEric Cheng mp_chain->b_prev = (mblk_t *)fanout_hint; 2522*8275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2523*8275SEric Cheng mp_chain, tail, cnt, sz); 2524*8275SEric Cheng } 2525*8275SEric Cheng } else { 2526*8275SEric Cheng /* 2527*8275SEric Cheng * If you are BW_ENFORCED, just enqueue the 2528*8275SEric Cheng * packet. srs_worker will drain it at the 2529*8275SEric Cheng * prescribed rate. Before enqueueing, save 2530*8275SEric Cheng * the fanout hint. 2531*8275SEric Cheng */ 2532*8275SEric Cheng mp_chain->b_prev = (mblk_t *)fanout_hint; 2533*8275SEric Cheng MAC_TX_SRS_TEST_HIWAT(mac_srs, mp_chain, 2534*8275SEric Cheng tail, cnt, sz, cookie); 2535*8275SEric Cheng } 2536*8275SEric Cheng if (wakeup_worker) 2537*8275SEric Cheng cv_signal(&mac_srs->srs_async); 2538*8275SEric Cheng return (cookie); 2539*8275SEric Cheng } 2540*8275SEric Cheng 2541*8275SEric Cheng /* 2542*8275SEric Cheng * There are five tx modes: 2543*8275SEric Cheng * 2544*8275SEric Cheng * 1) Default mode (SRS_TX_DEFAULT) 2545*8275SEric Cheng * 2) Serialization mode (SRS_TX_SERIALIZE) 2546*8275SEric Cheng * 3) Fanout mode (SRS_TX_FANOUT) 2547*8275SEric Cheng * 4) Bandwidth mode (SRS_TX_BW) 2548*8275SEric Cheng * 5) Fanout and Bandwidth mode (SRS_TX_BW_FANOUT) 2549*8275SEric Cheng * 2550*8275SEric Cheng * The tx mode in which an SRS operates is decided in mac_tx_srs_setup() 2551*8275SEric Cheng * based on the number of Tx rings requested for an SRS and whether 2552*8275SEric Cheng * bandwidth control is requested or not. 2553*8275SEric Cheng * 2554*8275SEric Cheng * In the default mode (i.e., no fanout/no bandwidth), the SRS acts as a 2555*8275SEric Cheng * pass-thru. Packets will go directly to mac_tx_send(). When the underlying 2556*8275SEric Cheng * Tx ring runs out of Tx descs, it starts queueing up packets in SRS. 2557*8275SEric Cheng * When flow-control is relieved, the srs_worker drains the queued 2558*8275SEric Cheng * packets and informs blocked clients to restart sending packets. 2559*8275SEric Cheng * 2560*8275SEric Cheng * In the SRS_TX_SERIALIZE mode, all calls to mac_tx() are serialized.
2561*8275SEric Cheng * 2562*8275SEric Cheng * In the SRS_TX_FANOUT mode, packets will be fanned out to multiple 2563*8275SEric Cheng * Tx rings. Each Tx ring will have a soft ring associated with it. 2564*8275SEric Cheng * These soft rings will be hung off the Tx SRS. Queueing, if it happens 2565*8275SEric Cheng * due to lack of Tx descs, will be in the individual soft ring (and not srs) 2566*8275SEric Cheng * associated with the Tx ring. 2567*8275SEric Cheng * 2568*8275SEric Cheng * In the TX_BW mode, tx srs will allow packets to go down to Tx ring 2569*8275SEric Cheng * only if bw is available. Otherwise the packets will be queued in 2570*8275SEric Cheng * SRS. If fanout to multiple Tx rings is configured, the packets will 2571*8275SEric Cheng * be fanned out among the soft rings associated with the Tx rings. 2572*8275SEric Cheng * 2573*8275SEric Cheng * Four flags are used in srs_state for indicating flow control 2574*8275SEric Cheng * conditions : SRS_TX_BLOCKED, SRS_TX_HIWAT, SRS_TX_WAKEUP_CLIENT, SRS_ENQUEUED. 2575*8275SEric Cheng * SRS_TX_BLOCKED indicates out of Tx descs. SRS expects a wakeup from the 2576*8275SEric Cheng * driver below. 2577*8275SEric Cheng * SRS_TX_HIWAT indicates packet count enqueued in Tx SRS exceeded Tx hiwat 2578*8275SEric Cheng * and flow-control pressure is applied back to clients. The clients expect 2579*8275SEric Cheng * wakeup when flow-control is relieved. 2580*8275SEric Cheng * SRS_TX_WAKEUP_CLIENT gets set when (flag == MAC_TX_NO_ENQUEUE) and mblk 2581*8275SEric Cheng * got returned back to client either due to lack of Tx descs or due to bw 2582*8275SEric Cheng * control reasons. The clients expect a wakeup when condition is relieved. 2583*8275SEric Cheng * 2584*8275SEric Cheng * The fourth argument to mac_tx() is the flag. Normally it will be 0 but 2585*8275SEric Cheng * some clients set the following values too: MAC_DROP_ON_NO_DESC, 2586*8275SEric Cheng * MAC_TX_NO_ENQUEUE 2587*8275SEric Cheng * Mac clients that do not want packets to be enqueued in the mac layer set 2588*8275SEric Cheng * MAC_DROP_ON_NO_DESC value. The packets won't be queued in the Tx SRS or 2589*8275SEric Cheng * Tx soft rings but instead get dropped when the NIC runs out of desc. The 2590*8275SEric Cheng * behaviour of this flag is different when the Tx is running in serializer 2591*8275SEric Cheng * or bandwidth mode. Under these (Serializer, bandwidth) modes, the packets 2592*8275SEric Cheng * get dropped when Tx high watermark is reached. 2593*8275SEric Cheng * There are some mac clients like vsw, aggr that want the mblks to be 2594*8275SEric Cheng * returned back to clients instead of being queued in Tx SRS (or Tx soft 2595*8275SEric Cheng * rings) under flow-control (i.e., out of desc or exceeding bw limits) 2596*8275SEric Cheng * conditions. These clients call mac_tx() with MAC_TX_NO_ENQUEUE flag set. 2597*8275SEric Cheng * In the default and Tx fanout mode, the un-transmitted mblks will be 2598*8275SEric Cheng * returned back to the clients when the driver runs out of Tx descs. 2599*8275SEric Cheng * SRS_TX_WAKEUP_CLIENT (or S_RING_WAKEUP_CLIENT) will be set in SRS (or 2600*8275SEric Cheng * soft ring) so that the clients can be woken up when Tx desc become 2601*8275SEric Cheng * available. When running in serializer or bandwidth mode, 2602*8275SEric Cheng * SRS_TX_WAKEUP_CLIENT will be set when tx hi-watermark is reached.
2603*8275SEric Cheng */ 2604*8275SEric Cheng 2605*8275SEric Cheng mac_tx_func_t 2606*8275SEric Cheng mac_tx_get_func(uint32_t mode) 2607*8275SEric Cheng { 2608*8275SEric Cheng return (mac_tx_mode_list[mode].mac_tx_func); 2609*8275SEric Cheng } 2610*8275SEric Cheng 2611*8275SEric Cheng /* ARGSUSED */ 2612*8275SEric Cheng static mac_tx_cookie_t 2613*8275SEric Cheng mac_tx_single_ring_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2614*8275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2615*8275SEric Cheng { 2616*8275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2617*8275SEric Cheng boolean_t is_subflow; 2618*8275SEric Cheng mac_tx_stats_t stats; 2619*8275SEric Cheng mac_tx_cookie_t cookie = NULL; 2620*8275SEric Cheng 2621*8275SEric Cheng ASSERT(srs_tx->st_mode == SRS_TX_DEFAULT); 2622*8275SEric Cheng 2623*8275SEric Cheng /* Regular case with a single Tx ring */ 2624*8275SEric Cheng /* 2625*8275SEric Cheng * SRS_TX_BLOCKED is set when underlying NIC runs 2626*8275SEric Cheng * out of Tx descs and messages start getting 2627*8275SEric Cheng * queued. It won't get reset until 2628*8275SEric Cheng * mac_tx_srs_drain() completely drains out the 2629*8275SEric Cheng * messages. 2630*8275SEric Cheng */ 2631*8275SEric Cheng if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2632*8275SEric Cheng /* Tx descs/resources not available */ 2633*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2634*8275SEric Cheng if ((mac_srs->srs_state & SRS_ENQUEUED) != 0) { 2635*8275SEric Cheng cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, 2636*8275SEric Cheng flag, ret_mp); 2637*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2638*8275SEric Cheng return (cookie); 2639*8275SEric Cheng } 2640*8275SEric Cheng /* 2641*8275SEric Cheng * While we were computing mblk count, the 2642*8275SEric Cheng * flow control condition got relieved. 2643*8275SEric Cheng * Continue with the transmission. 2644*8275SEric Cheng */ 2645*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2646*8275SEric Cheng } 2647*8275SEric Cheng 2648*8275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2649*8275SEric Cheng 2650*8275SEric Cheng mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2651*8275SEric Cheng mp_chain, (is_subflow ? &stats : NULL)); 2652*8275SEric Cheng 2653*8275SEric Cheng /* 2654*8275SEric Cheng * Multiple threads could be here sending packets. 2655*8275SEric Cheng * Under such conditions, it is not possible to 2656*8275SEric Cheng * atomically set the SRS_TX_BLOCKED bit to indicate 2657*8275SEric Cheng * out of tx desc condition. To atomically set 2658*8275SEric Cheng * this, we queue the returned packet and do 2659*8275SEric Cheng * the setting of SRS_TX_BLOCKED in 2660*8275SEric Cheng * mac_tx_srs_drain(). 2661*8275SEric Cheng */ 2662*8275SEric Cheng if (mp_chain != NULL) { 2663*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2664*8275SEric Cheng cookie = mac_tx_srs_no_desc(mac_srs, mp_chain, flag, ret_mp); 2665*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2666*8275SEric Cheng return (cookie); 2667*8275SEric Cheng } 2668*8275SEric Cheng 2669*8275SEric Cheng if (is_subflow) 2670*8275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2671*8275SEric Cheng 2672*8275SEric Cheng return (NULL); 2673*8275SEric Cheng } 2674*8275SEric Cheng 2675*8275SEric Cheng /* 2676*8275SEric Cheng * mac_tx_serializer_mode 2677*8275SEric Cheng * 2678*8275SEric Cheng * This is an experimental mode implemented as per the request of PAE.
2679*8275SEric Cheng * In this mode, all callers attempting to send a packet to the NIC 2680*8275SEric Cheng * will get serialized. Only one thread at any time will access the 2681*8275SEric Cheng * NIC to send the packet out. 2682*8275SEric Cheng */ 2683*8275SEric Cheng /* ARGSUSED */ 2684*8275SEric Cheng static mac_tx_cookie_t 2685*8275SEric Cheng mac_tx_serializer_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2686*8275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2687*8275SEric Cheng { 2688*8275SEric Cheng boolean_t is_subflow; 2689*8275SEric Cheng mac_tx_stats_t stats; 2690*8275SEric Cheng mac_tx_cookie_t cookie = NULL; 2691*8275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2692*8275SEric Cheng 2693*8275SEric Cheng /* Single ring, serialize below */ 2694*8275SEric Cheng ASSERT(srs_tx->st_mode == SRS_TX_SERIALIZE); 2695*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2696*8275SEric Cheng if ((mac_srs->srs_first != NULL) || 2697*8275SEric Cheng (mac_srs->srs_state & SRS_PROC)) { 2698*8275SEric Cheng /* 2699*8275SEric Cheng * In serialization mode, queue all packets until 2700*8275SEric Cheng * TX_HIWAT is set. 2701*8275SEric Cheng * If drop bit is set, drop if TX_HIWAT is set. 2702*8275SEric Cheng * If no_enqueue is set, still enqueue until hiwat 2703*8275SEric Cheng * is set and return mblks after TX_HIWAT is set. 2704*8275SEric Cheng */ 2705*8275SEric Cheng cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, 2706*8275SEric Cheng flag, NULL, ret_mp); 2707*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2708*8275SEric Cheng return (cookie); 2709*8275SEric Cheng } 2710*8275SEric Cheng /* 2711*8275SEric Cheng * No packets queued, nothing on proc and no flow 2712*8275SEric Cheng * control condition. Fast-path, ok. Do inline 2713*8275SEric Cheng * processing. 2714*8275SEric Cheng */ 2715*8275SEric Cheng mac_srs->srs_state |= SRS_PROC; 2716*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2717*8275SEric Cheng 2718*8275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2719*8275SEric Cheng 2720*8275SEric Cheng mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2721*8275SEric Cheng mp_chain, (is_subflow ? &stats : NULL)); 2722*8275SEric Cheng 2723*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2724*8275SEric Cheng mac_srs->srs_state &= ~SRS_PROC; 2725*8275SEric Cheng if (mp_chain != NULL) { 2726*8275SEric Cheng cookie = mac_tx_srs_enqueue(mac_srs, 2727*8275SEric Cheng mp_chain, flag, NULL, ret_mp); 2728*8275SEric Cheng } 2729*8275SEric Cheng if (mac_srs->srs_first != NULL) { 2730*8275SEric Cheng /* 2731*8275SEric Cheng * We processed inline our packet and a new 2732*8275SEric Cheng * packet/s got queued while we were 2733*8275SEric Cheng * processing. Wakeup srs worker 2734*8275SEric Cheng */ 2735*8275SEric Cheng cv_signal(&mac_srs->srs_async); 2736*8275SEric Cheng } 2737*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2738*8275SEric Cheng 2739*8275SEric Cheng if (is_subflow && cookie == NULL) 2740*8275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2741*8275SEric Cheng 2742*8275SEric Cheng return (cookie); 2743*8275SEric Cheng } 2744*8275SEric Cheng 2745*8275SEric Cheng /* 2746*8275SEric Cheng * mac_tx_fanout_mode 2747*8275SEric Cheng * 2748*8275SEric Cheng * In this mode, the SRS will have access to multiple Tx rings to send 2749*8275SEric Cheng * the packet out. The fanout hint that is passed as an argument is 2750*8275SEric Cheng * used to find an appropriate ring to fanout the traffic. 
Each Tx 2751*8275SEric Cheng * ring, in turn, will have a soft ring associated with it. If a Tx 2752*8275SEric Cheng * ring runs out of Tx descs, the returned packet will be queued in 2753*8275SEric Cheng * the soft ring associated with that Tx ring. The srs itself will not 2754*8275SEric Cheng * queue any packets. 2755*8275SEric Cheng */ 2756*8275SEric Cheng static mac_tx_cookie_t 2757*8275SEric Cheng mac_tx_fanout_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2758*8275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2759*8275SEric Cheng { 2760*8275SEric Cheng mac_soft_ring_t *softring; 2761*8275SEric Cheng uint_t indx, hash; 2762*8275SEric Cheng 2763*8275SEric Cheng ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT); 2764*8275SEric Cheng hash = HASH_HINT(fanout_hint); 2765*8275SEric Cheng indx = COMPUTE_INDEX(hash, mac_srs->srs_oth_ring_count); 2766*8275SEric Cheng softring = mac_srs->srs_oth_soft_rings[indx]; 2767*8275SEric Cheng return (mac_tx_soft_ring_process(softring, mp_chain, flag, ret_mp)); 2768*8275SEric Cheng } 2769*8275SEric Cheng 2770*8275SEric Cheng /* 2771*8275SEric Cheng * mac_tx_bw_mode 2772*8275SEric Cheng * 2773*8275SEric Cheng * In the bandwidth mode, Tx srs will allow packets to go down to Tx ring 2774*8275SEric Cheng * only if bw is available. Otherwise the packets will be queued in 2775*8275SEric Cheng * SRS. If the SRS has multiple Tx rings, then packets will get fanned 2776*8275SEric Cheng * out to the Tx rings. 2777*8275SEric Cheng */ 2778*8275SEric Cheng static mac_tx_cookie_t 2779*8275SEric Cheng mac_tx_bw_mode(mac_soft_ring_set_t *mac_srs, mblk_t *mp_chain, 2780*8275SEric Cheng uintptr_t fanout_hint, uint16_t flag, mblk_t **ret_mp) 2781*8275SEric Cheng { 2782*8275SEric Cheng int cnt, sz; 2783*8275SEric Cheng mblk_t *tail; 2784*8275SEric Cheng mac_tx_cookie_t cookie = NULL; 2785*8275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2786*8275SEric Cheng 2787*8275SEric Cheng ASSERT(TX_BANDWIDTH_MODE(mac_srs)); 2788*8275SEric Cheng ASSERT(mac_srs->srs_type & SRST_BW_CONTROL); 2789*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2790*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_limit == 0) { 2791*8275SEric Cheng /* zero bandwidth: drop all */ 2792*8275SEric Cheng MAC_TX_SRS_DROP_MESSAGE(mac_srs, mp_chain, cookie); 2793*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2794*8275SEric Cheng return (cookie); 2795*8275SEric Cheng } else if ((mac_srs->srs_first != NULL) || 2796*8275SEric Cheng (mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)) { 2797*8275SEric Cheng cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2798*8275SEric Cheng fanout_hint, ret_mp); 2799*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2800*8275SEric Cheng return (cookie); 2801*8275SEric Cheng } 2802*8275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2803*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 2804*8275SEric Cheng mac_srs->srs_bw->mac_bw_curr_time = lbolt; 2805*8275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 2806*8275SEric Cheng } else if (mac_srs->srs_bw->mac_bw_used > 2807*8275SEric Cheng mac_srs->srs_bw->mac_bw_limit) { 2808*8275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2809*8275SEric Cheng MAC_TX_SRS_ENQUEUE_CHAIN(mac_srs, 2810*8275SEric Cheng mp_chain, tail, cnt, sz); 2811*8275SEric Cheng /* 2812*8275SEric Cheng * Wakeup worker thread.
Note that worker 2813*8275SEric Cheng * thread has to be woken up so that it 2814*8275SEric Cheng * can fire up the timer to be woken up 2815*8275SEric Cheng * on the next tick. Also once 2816*8275SEric Cheng * BW_ENFORCED is set, it can only be 2817*8275SEric Cheng * reset by srs_worker thread. Until then 2818*8275SEric Cheng * all packets will get queued up in SRS 2819*8275SEric Cheng * and hence this code path won't be 2820*8275SEric Cheng * entered until BW_ENFORCED is reset. 2821*8275SEric Cheng */ 2822*8275SEric Cheng cv_signal(&mac_srs->srs_async); 2823*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2824*8275SEric Cheng return (cookie); 2825*8275SEric Cheng } 2826*8275SEric Cheng 2827*8275SEric Cheng mac_srs->srs_bw->mac_bw_used += sz; 2828*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2829*8275SEric Cheng 2830*8275SEric Cheng if (srs_tx->st_mode == SRS_TX_BW_FANOUT) { 2831*8275SEric Cheng mac_soft_ring_t *softring; 2832*8275SEric Cheng uint_t indx, hash; 2833*8275SEric Cheng 2834*8275SEric Cheng hash = HASH_HINT(fanout_hint); 2835*8275SEric Cheng indx = COMPUTE_INDEX(hash, 2836*8275SEric Cheng mac_srs->srs_oth_ring_count); 2837*8275SEric Cheng softring = mac_srs->srs_oth_soft_rings[indx]; 2838*8275SEric Cheng return (mac_tx_soft_ring_process(softring, mp_chain, flag, 2839*8275SEric Cheng ret_mp)); 2840*8275SEric Cheng } else { 2841*8275SEric Cheng boolean_t is_subflow; 2842*8275SEric Cheng mac_tx_stats_t stats; 2843*8275SEric Cheng 2844*8275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2845*8275SEric Cheng 2846*8275SEric Cheng mp_chain = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2847*8275SEric Cheng mp_chain, (is_subflow ? &stats : NULL)); 2848*8275SEric Cheng 2849*8275SEric Cheng if (mp_chain != NULL) { 2850*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2851*8275SEric Cheng MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz); 2852*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_used > sz) 2853*8275SEric Cheng mac_srs->srs_bw->mac_bw_used -= sz; 2854*8275SEric Cheng else 2855*8275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 2856*8275SEric Cheng cookie = mac_tx_srs_enqueue(mac_srs, mp_chain, flag, 2857*8275SEric Cheng fanout_hint, ret_mp); 2858*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2859*8275SEric Cheng return (cookie); 2860*8275SEric Cheng } 2861*8275SEric Cheng if (is_subflow) 2862*8275SEric Cheng FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats); 2863*8275SEric Cheng 2864*8275SEric Cheng return (NULL); 2865*8275SEric Cheng } 2866*8275SEric Cheng } 2867*8275SEric Cheng 2868*8275SEric Cheng /* ARGSUSED */ 2869*8275SEric Cheng void 2870*8275SEric Cheng mac_tx_srs_drain(mac_soft_ring_set_t *mac_srs, uint_t proc_type) 2871*8275SEric Cheng { 2872*8275SEric Cheng mblk_t *head, *tail; 2873*8275SEric Cheng size_t sz; 2874*8275SEric Cheng uint32_t tx_mode; 2875*8275SEric Cheng uint_t saved_pkt_count; 2876*8275SEric Cheng boolean_t is_subflow; 2877*8275SEric Cheng mac_tx_stats_t stats; 2878*8275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 2879*8275SEric Cheng 2880*8275SEric Cheng saved_pkt_count = 0; 2881*8275SEric Cheng ASSERT(mutex_owned(&mac_srs->srs_lock)); 2882*8275SEric Cheng ASSERT(!(mac_srs->srs_state & SRS_PROC)); 2883*8275SEric Cheng 2884*8275SEric Cheng mac_srs->srs_state |= SRS_PROC; 2885*8275SEric Cheng 2886*8275SEric Cheng is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0); 2887*8275SEric Cheng tx_mode = srs_tx->st_mode; 2888*8275SEric Cheng if (tx_mode == SRS_TX_DEFAULT || tx_mode == SRS_TX_SERIALIZE) { 2889*8275SEric Cheng if (mac_srs->srs_first != NULL) {
2890*8275SEric Cheng head = mac_srs->srs_first; 2891*8275SEric Cheng tail = mac_srs->srs_last; 2892*8275SEric Cheng saved_pkt_count = mac_srs->srs_count; 2893*8275SEric Cheng mac_srs->srs_first = NULL; 2894*8275SEric Cheng mac_srs->srs_last = NULL; 2895*8275SEric Cheng mac_srs->srs_count = 0; 2896*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2897*8275SEric Cheng 2898*8275SEric Cheng head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2899*8275SEric Cheng head, &stats); 2900*8275SEric Cheng 2901*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2902*8275SEric Cheng if (head != NULL) { 2903*8275SEric Cheng /* Device out of tx desc, set block */ 2904*8275SEric Cheng if (head->b_next == NULL) 2905*8275SEric Cheng VERIFY(head == tail); 2906*8275SEric Cheng tail->b_next = mac_srs->srs_first; 2907*8275SEric Cheng mac_srs->srs_first = head; 2908*8275SEric Cheng mac_srs->srs_count += 2909*8275SEric Cheng (saved_pkt_count - stats.ts_opackets); 2910*8275SEric Cheng if (mac_srs->srs_last == NULL) 2911*8275SEric Cheng mac_srs->srs_last = tail; 2912*8275SEric Cheng MAC_TX_SRS_BLOCK(mac_srs, head); 2913*8275SEric Cheng } else { 2914*8275SEric Cheng srs_tx->st_woken_up = B_FALSE; 2915*8275SEric Cheng if (is_subflow) { 2916*8275SEric Cheng FLOW_TX_STATS_UPDATE( 2917*8275SEric Cheng mac_srs->srs_flent, &stats); 2918*8275SEric Cheng } 2919*8275SEric Cheng } 2920*8275SEric Cheng } 2921*8275SEric Cheng } else if (tx_mode == SRS_TX_BW) { 2922*8275SEric Cheng /* 2923*8275SEric Cheng * We are here because the timer fired and we have some data 2924*8275SEric Cheng * to transmit. Also mac_tx_srs_worker should have reset 2925*8275SEric Cheng * the SRS_BW_ENFORCED flag. 2926*8275SEric Cheng */ 2927*8275SEric Cheng ASSERT(!(mac_srs->srs_bw->mac_bw_state & SRS_BW_ENFORCED)); 2928*8275SEric Cheng head = tail = mac_srs->srs_first; 2929*8275SEric Cheng while (mac_srs->srs_first != NULL) { 2930*8275SEric Cheng tail = mac_srs->srs_first; 2931*8275SEric Cheng tail->b_prev = NULL; 2932*8275SEric Cheng mac_srs->srs_first = tail->b_next; 2933*8275SEric Cheng if (mac_srs->srs_first == NULL) 2934*8275SEric Cheng mac_srs->srs_last = NULL; 2935*8275SEric Cheng mac_srs->srs_count--; 2936*8275SEric Cheng sz = msgdsize(tail); 2937*8275SEric Cheng mac_srs->srs_size -= sz; 2938*8275SEric Cheng saved_pkt_count++; 2939*8275SEric Cheng MAC_TX_UPDATE_BW_INFO(mac_srs, sz); 2940*8275SEric Cheng 2941*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_used < 2942*8275SEric Cheng mac_srs->srs_bw->mac_bw_limit) 2943*8275SEric Cheng continue; 2944*8275SEric Cheng 2945*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 2946*8275SEric Cheng mac_srs->srs_bw->mac_bw_curr_time = lbolt; 2947*8275SEric Cheng mac_srs->srs_bw->mac_bw_used = sz; 2948*8275SEric Cheng continue; 2949*8275SEric Cheng } 2950*8275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 2951*8275SEric Cheng break; 2952*8275SEric Cheng } 2953*8275SEric Cheng 2954*8275SEric Cheng ASSERT((head == NULL && tail == NULL) || 2955*8275SEric Cheng (head != NULL && tail != NULL)); 2956*8275SEric Cheng if (tail != NULL) { 2957*8275SEric Cheng tail->b_next = NULL; 2958*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 2959*8275SEric Cheng 2960*8275SEric Cheng head = mac_tx_send(srs_tx->st_arg1, srs_tx->st_arg2, 2961*8275SEric Cheng head, &stats); 2962*8275SEric Cheng 2963*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 2964*8275SEric Cheng if (head != NULL) { 2965*8275SEric Cheng uint_t size_sent; 2966*8275SEric Cheng 2967*8275SEric Cheng /* Device out of tx desc, set block */
2968*8275SEric Cheng if (head->b_next == NULL) 2969*8275SEric Cheng VERIFY(head == tail); 2970*8275SEric Cheng tail->b_next = mac_srs->srs_first; 2971*8275SEric Cheng mac_srs->srs_first = head; 2972*8275SEric Cheng mac_srs->srs_count += 2973*8275SEric Cheng (saved_pkt_count - stats.ts_opackets); 2974*8275SEric Cheng if (mac_srs->srs_last == NULL) 2975*8275SEric Cheng mac_srs->srs_last = tail; 2976*8275SEric Cheng size_sent = sz - stats.ts_obytes; 2977*8275SEric Cheng mac_srs->srs_size += size_sent; 2978*8275SEric Cheng mac_srs->srs_bw->mac_bw_sz += size_sent; 2979*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_used > size_sent) { 2980*8275SEric Cheng mac_srs->srs_bw->mac_bw_used -= 2981*8275SEric Cheng size_sent; 2982*8275SEric Cheng } else { 2983*8275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 2984*8275SEric Cheng } 2985*8275SEric Cheng MAC_TX_SRS_BLOCK(mac_srs, head); 2986*8275SEric Cheng } else { 2987*8275SEric Cheng srs_tx->st_woken_up = B_FALSE; 2988*8275SEric Cheng if (is_subflow) { 2989*8275SEric Cheng FLOW_TX_STATS_UPDATE( 2990*8275SEric Cheng mac_srs->srs_flent, &stats); 2991*8275SEric Cheng } 2992*8275SEric Cheng } 2993*8275SEric Cheng } 2994*8275SEric Cheng } else if (tx_mode == SRS_TX_BW_FANOUT) { 2995*8275SEric Cheng mblk_t *prev; 2996*8275SEric Cheng mac_soft_ring_t *softring; 2997*8275SEric Cheng uint64_t hint; 2998*8275SEric Cheng 2999*8275SEric Cheng /* 3000*8275SEric Cheng * We are here because the timer fired and we 3001*8275SEric Cheng * have some quota to transmit. 3002*8275SEric Cheng */ 3003*8275SEric Cheng prev = NULL; 3004*8275SEric Cheng head = tail = mac_srs->srs_first; 3005*8275SEric Cheng while (mac_srs->srs_first != NULL) { 3006*8275SEric Cheng tail = mac_srs->srs_first; 3007*8275SEric Cheng mac_srs->srs_first = tail->b_next; 3008*8275SEric Cheng if (mac_srs->srs_first == NULL) 3009*8275SEric Cheng mac_srs->srs_last = NULL; 3010*8275SEric Cheng mac_srs->srs_count--; 3011*8275SEric Cheng sz = msgdsize(tail); 3012*8275SEric Cheng mac_srs->srs_size -= sz; 3013*8275SEric Cheng mac_srs->srs_bw->mac_bw_used += sz; 3014*8275SEric Cheng if (prev == NULL) 3015*8275SEric Cheng hint = (ulong_t)tail->b_prev; 3016*8275SEric Cheng if (hint != (ulong_t)tail->b_prev) { 3017*8275SEric Cheng prev->b_next = NULL; 3018*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 3019*8275SEric Cheng TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3020*8275SEric Cheng head = tail; 3021*8275SEric Cheng hint = (ulong_t)tail->b_prev; 3022*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 3023*8275SEric Cheng } 3024*8275SEric Cheng 3025*8275SEric Cheng prev = tail; 3026*8275SEric Cheng tail->b_prev = NULL; 3027*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_used < 3028*8275SEric Cheng mac_srs->srs_bw->mac_bw_limit) 3029*8275SEric Cheng continue; 3030*8275SEric Cheng 3031*8275SEric Cheng if (mac_srs->srs_bw->mac_bw_curr_time != lbolt) { 3032*8275SEric Cheng mac_srs->srs_bw->mac_bw_curr_time = lbolt; 3033*8275SEric Cheng mac_srs->srs_bw->mac_bw_used = 0; 3034*8275SEric Cheng continue; 3035*8275SEric Cheng } 3036*8275SEric Cheng mac_srs->srs_bw->mac_bw_state |= SRS_BW_ENFORCED; 3037*8275SEric Cheng break; 3038*8275SEric Cheng } 3039*8275SEric Cheng ASSERT((head == NULL && tail == NULL) || 3040*8275SEric Cheng (head != NULL && tail != NULL)); 3041*8275SEric Cheng if (tail != NULL) { 3042*8275SEric Cheng tail->b_next = NULL; 3043*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 3044*8275SEric Cheng TX_SRS_TO_SOFT_RING(mac_srs, head, hint); 3045*8275SEric Cheng mutex_enter(&mac_srs->srs_lock);
3046*8275SEric Cheng } 3047*8275SEric Cheng } 3048*8275SEric Cheng /* 3049*8275SEric Cheng * SRS_TX_FANOUT case not considered here because packets 3050*8275SEric Cheng * won't be queued in the SRS for this case. Packets will 3051*8275SEric Cheng * be sent directly to soft rings underneath and if there 3052*8275SEric Cheng * is any queueing at all, it would be in Tx side soft 3053*8275SEric Cheng * rings. 3054*8275SEric Cheng */ 3055*8275SEric Cheng 3056*8275SEric Cheng /* 3057*8275SEric Cheng * When srs_count becomes 0, reset SRS_TX_HIWAT and 3058*8275SEric Cheng * SRS_TX_WAKEUP_CLIENT and wakeup registered clients. 3059*8275SEric Cheng */ 3060*8275SEric Cheng if (mac_srs->srs_count == 0 && (mac_srs->srs_state & 3061*8275SEric Cheng (SRS_TX_HIWAT | SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED))) { 3062*8275SEric Cheng mac_tx_notify_cb_t *mtnfp; 3063*8275SEric Cheng mac_cb_t *mcb; 3064*8275SEric Cheng mac_client_impl_t *mcip = mac_srs->srs_mcip; 3065*8275SEric Cheng boolean_t wakeup_required = B_FALSE; 3066*8275SEric Cheng 3067*8275SEric Cheng if (mac_srs->srs_state & 3068*8275SEric Cheng (SRS_TX_HIWAT|SRS_TX_WAKEUP_CLIENT)) { 3069*8275SEric Cheng wakeup_required = B_TRUE; 3070*8275SEric Cheng } 3071*8275SEric Cheng mac_srs->srs_state &= ~(SRS_TX_HIWAT | 3072*8275SEric Cheng SRS_TX_WAKEUP_CLIENT | SRS_ENQUEUED); 3073*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 3074*8275SEric Cheng if (wakeup_required) { 3075*8275SEric Cheng /* Wakeup callback registered clients */ 3076*8275SEric Cheng MAC_CALLBACK_WALKER_INC(&mcip->mci_tx_notify_cb_info); 3077*8275SEric Cheng for (mcb = mcip->mci_tx_notify_cb_list; mcb != NULL; 3078*8275SEric Cheng mcb = mcb->mcb_nextp) { 3079*8275SEric Cheng mtnfp = (mac_tx_notify_cb_t *)mcb->mcb_objp; 3080*8275SEric Cheng mtnfp->mtnf_fn(mtnfp->mtnf_arg, 3081*8275SEric Cheng (mac_tx_cookie_t)mac_srs); 3082*8275SEric Cheng } 3083*8275SEric Cheng MAC_CALLBACK_WALKER_DCR(&mcip->mci_tx_notify_cb_info, 3084*8275SEric Cheng &mcip->mci_tx_notify_cb_list); 3085*8275SEric Cheng /* 3086*8275SEric Cheng * If the client is not the primary MAC client, then we 3087*8275SEric Cheng * need to send the notification to the client's upper 3088*8275SEric Cheng * MAC, i.e. mci_upper_mip. 3089*8275SEric Cheng */ 3090*8275SEric Cheng mac_tx_notify(mcip->mci_upper_mip != NULL ? 3091*8275SEric Cheng mcip->mci_upper_mip : mcip->mci_mip); 3092*8275SEric Cheng } 3093*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 3094*8275SEric Cheng } 3095*8275SEric Cheng mac_srs->srs_state &= ~SRS_PROC; 3096*8275SEric Cheng } 3097*8275SEric Cheng 3098*8275SEric Cheng /* 3099*8275SEric Cheng * Given a packet, get the flow_entry that identifies the flow 3100*8275SEric Cheng * to which that packet belongs. The flow_entry will contain 3101*8275SEric Cheng * the transmit function to be used to send the packet. If the 3102*8275SEric Cheng * function returns NULL, the packet should be sent using the 3103*8275SEric Cheng * underlying NIC. 3104*8275SEric Cheng */ 3105*8275SEric Cheng static flow_entry_t * 3106*8275SEric Cheng mac_tx_classify(mac_impl_t *mip, mblk_t *mp) 3107*8275SEric Cheng { 3108*8275SEric Cheng flow_entry_t *flent = NULL; 3109*8275SEric Cheng mac_client_impl_t *mcip; 3110*8275SEric Cheng int err; 3111*8275SEric Cheng 3112*8275SEric Cheng /* 3113*8275SEric Cheng * Do classification on the packet.
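 * mac_flow_lookup() returns a held flow_entry_t on success; whoever
 * obtains the entry through mac_tx_classify() must FLOW_REFRELE() it
 * once the packet has been dispatched, as mac_tx_send() does below.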
3114*8275SEric Cheng */ 3115*8275SEric Cheng err = mac_flow_lookup(mip->mi_flow_tab, mp, FLOW_OUTBOUND, &flent); 3116*8275SEric Cheng if (err != 0) 3117*8275SEric Cheng return (NULL); 3118*8275SEric Cheng 3119*8275SEric Cheng /* 3120*8275SEric Cheng * This flent might just be an additional one on the MAC client, 3121*8275SEric Cheng * i.e. for classification purposes (different fdesc), however 3122*8275SEric Cheng * the resources, SRS et al., are in the mci_flent, so if 3123*8275SEric Cheng * this isn't the mci_flent, we need to get it. 3124*8275SEric Cheng */ 3125*8275SEric Cheng if ((mcip = flent->fe_mcip) != NULL && mcip->mci_flent != flent) { 3126*8275SEric Cheng FLOW_REFRELE(flent); 3127*8275SEric Cheng flent = mcip->mci_flent; 3128*8275SEric Cheng FLOW_TRY_REFHOLD(flent, err); 3129*8275SEric Cheng if (err != 0) 3130*8275SEric Cheng return (NULL); 3131*8275SEric Cheng } 3132*8275SEric Cheng 3133*8275SEric Cheng return (flent); 3134*8275SEric Cheng } 3135*8275SEric Cheng 3136*8275SEric Cheng /* 3137*8275SEric Cheng * This macro is only meant to be used by mac_tx_send(). 3138*8275SEric Cheng */ 3139*8275SEric Cheng #define CHECK_VID_AND_ADD_TAG(mp) { \ 3140*8275SEric Cheng if (vid_check) { \ 3141*8275SEric Cheng int err = 0; \ 3142*8275SEric Cheng \ 3143*8275SEric Cheng MAC_VID_CHECK(src_mcip, (mp), err); \ 3144*8275SEric Cheng if (err != 0) { \ 3145*8275SEric Cheng freemsg((mp)); \ 3146*8275SEric Cheng (mp) = next; \ 3147*8275SEric Cheng oerrors++; \ 3148*8275SEric Cheng continue; \ 3149*8275SEric Cheng } \ 3150*8275SEric Cheng } \ 3151*8275SEric Cheng if (add_tag) { \ 3152*8275SEric Cheng (mp) = mac_add_vlan_tag((mp), 0, vid); \ 3153*8275SEric Cheng if ((mp) == NULL) { \ 3154*8275SEric Cheng (mp) = next; \ 3155*8275SEric Cheng oerrors++; \ 3156*8275SEric Cheng continue; \ 3157*8275SEric Cheng } \ 3158*8275SEric Cheng } \ 3159*8275SEric Cheng } 3160*8275SEric Cheng 3161*8275SEric Cheng mblk_t * 3162*8275SEric Cheng mac_tx_send(mac_client_handle_t mch, mac_ring_handle_t ring, mblk_t *mp_chain, 3163*8275SEric Cheng mac_tx_stats_t *stats) 3164*8275SEric Cheng { 3165*8275SEric Cheng mac_client_impl_t *src_mcip = (mac_client_impl_t *)mch; 3166*8275SEric Cheng mac_impl_t *mip = src_mcip->mci_mip; 3167*8275SEric Cheng uint_t obytes = 0, opackets = 0, oerrors = 0; 3168*8275SEric Cheng mblk_t *mp = NULL, *next; 3169*8275SEric Cheng boolean_t vid_check, add_tag; 3170*8275SEric Cheng uint16_t vid = 0; 3171*8275SEric Cheng 3172*8275SEric Cheng if (mip->mi_nclients > 1) { 3173*8275SEric Cheng vid_check = MAC_VID_CHECK_NEEDED(src_mcip); 3174*8275SEric Cheng add_tag = MAC_TAG_NEEDED(src_mcip); 3175*8275SEric Cheng if (add_tag) 3176*8275SEric Cheng vid = mac_client_vid(mch); 3177*8275SEric Cheng } else { 3178*8275SEric Cheng ASSERT(mip->mi_nclients == 1); 3179*8275SEric Cheng vid_check = add_tag = B_FALSE; 3180*8275SEric Cheng } 3181*8275SEric Cheng 3182*8275SEric Cheng /* 3183*8275SEric Cheng * Fastpath: if there's only one client, and there are no 3184*8275SEric Cheng * promiscuous listeners, we simply send the packet down to the 3185*8275SEric Cheng * underlying NIC.
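 *
 * A worked example of the accounting in the loop below (illustrative
 * numbers): for a chain of three packets totalling 3000 bytes of
 * which the driver accepts only two, opackets/obytes are incremented
 * per packet as the loop advances and then wound back by one packet
 * and by the unsent packet's size when MAC_TX() hands the mblk back,
 * so the final stats reflect only what was actually transmitted.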
3186*8275SEric Cheng */ 3187*8275SEric Cheng if (mip->mi_nactiveclients == 1 && mip->mi_promisc_list == NULL) { 3188*8275SEric Cheng DTRACE_PROBE2(fastpath, 3189*8275SEric Cheng mac_client_impl_t *, src_mcip, mblk_t *, mp_chain); 3190*8275SEric Cheng 3191*8275SEric Cheng mp = mp_chain; 3192*8275SEric Cheng while (mp != NULL) { 3193*8275SEric Cheng next = mp->b_next; 3194*8275SEric Cheng mp->b_next = NULL; 3195*8275SEric Cheng opackets++; 3196*8275SEric Cheng obytes += (mp->b_cont == NULL ? MBLKL(mp) : 3197*8275SEric Cheng msgdsize(mp)); 3198*8275SEric Cheng 3199*8275SEric Cheng CHECK_VID_AND_ADD_TAG(mp); 3200*8275SEric Cheng MAC_TX(mip, ring, mp, src_mcip); 3201*8275SEric Cheng 3202*8275SEric Cheng /* 3203*8275SEric Cheng * If the driver is out of descriptors and does a 3204*8275SEric Cheng * partial send it will return a chain of unsent 3205*8275SEric Cheng * mblks. Adjust the accounting stats. 3206*8275SEric Cheng */ 3207*8275SEric Cheng if (mp != NULL) { 3208*8275SEric Cheng opackets--; 3209*8275SEric Cheng obytes -= msgdsize(mp); 3210*8275SEric Cheng mp->b_next = next; 3211*8275SEric Cheng break; 3212*8275SEric Cheng } 3213*8275SEric Cheng mp = next; 3214*8275SEric Cheng } 3215*8275SEric Cheng goto done; 3216*8275SEric Cheng } 3217*8275SEric Cheng 3218*8275SEric Cheng /* 3219*8275SEric Cheng * No fastpath, we either have more than one MAC client 3220*8275SEric Cheng * defined on top of the same MAC, or one or more MAC 3221*8275SEric Cheng * client promiscuous callbacks. 3222*8275SEric Cheng */ 3223*8275SEric Cheng DTRACE_PROBE3(slowpath, mac_client_impl_t *, 3224*8275SEric Cheng src_mcip, int, mip->mi_nclients, mblk_t *, mp_chain); 3225*8275SEric Cheng 3226*8275SEric Cheng if (mip->mi_promisc_list != NULL) 3227*8275SEric Cheng mac_promisc_dispatch(mip, mp_chain, src_mcip); 3228*8275SEric Cheng 3229*8275SEric Cheng mp = mp_chain; 3230*8275SEric Cheng while (mp != NULL) { 3231*8275SEric Cheng flow_entry_t *dst_flow_ent; 3232*8275SEric Cheng void *flow_cookie; 3233*8275SEric Cheng size_t pkt_size; 3234*8275SEric Cheng mblk_t *mp1; 3235*8275SEric Cheng 3236*8275SEric Cheng next = mp->b_next; 3237*8275SEric Cheng mp->b_next = NULL; 3238*8275SEric Cheng opackets++; 3239*8275SEric Cheng pkt_size = (mp->b_cont == NULL ? MBLKL(mp) : msgdsize(mp)); 3240*8275SEric Cheng obytes += pkt_size; 3241*8275SEric Cheng CHECK_VID_AND_ADD_TAG(mp); 3242*8275SEric Cheng 3243*8275SEric Cheng /* 3244*8275SEric Cheng * Find the destination. 3245*8275SEric Cheng */ 3246*8275SEric Cheng dst_flow_ent = mac_tx_classify(mip, mp); 3247*8275SEric Cheng 3248*8275SEric Cheng if (dst_flow_ent != NULL) { 3249*8275SEric Cheng size_t hdrsize; 3250*8275SEric Cheng int err = 0; 3251*8275SEric Cheng 3252*8275SEric Cheng if (mip->mi_info.mi_nativemedia == DL_ETHER) { 3253*8275SEric Cheng struct ether_vlan_header *evhp = 3254*8275SEric Cheng (struct ether_vlan_header *)mp->b_rptr; 3255*8275SEric Cheng 3256*8275SEric Cheng if (ntohs(evhp->ether_tpid) == ETHERTYPE_VLAN) 3257*8275SEric Cheng hdrsize = sizeof (*evhp); 3258*8275SEric Cheng else 3259*8275SEric Cheng hdrsize = sizeof (struct ether_header); 3260*8275SEric Cheng } else { 3261*8275SEric Cheng mac_header_info_t mhi; 3262*8275SEric Cheng 3263*8275SEric Cheng err = mac_header_info((mac_handle_t)mip, 3264*8275SEric Cheng mp, &mhi); 3265*8275SEric Cheng if (err == 0) 3266*8275SEric Cheng hdrsize = mhi.mhi_hdrsize; 3267*8275SEric Cheng } 3268*8275SEric Cheng 3269*8275SEric Cheng /* 3270*8275SEric Cheng * Got a matching flow. 
It's either another 3271*8275SEric Cheng * MAC client, or a broadcast/multicast flow. 3272*8275SEric Cheng * Make sure the packet size is within the 3273*8275SEric Cheng * allowed size. If not drop the packet and 3274*8275SEric Cheng * move to next packet. 3275*8275SEric Cheng */ 3276*8275SEric Cheng if (err != 0 || 3277*8275SEric Cheng (pkt_size - hdrsize) > mip->mi_sdu_max) { 3278*8275SEric Cheng oerrors++; 3279*8275SEric Cheng DTRACE_PROBE2(loopback__drop, size_t, pkt_size, 3280*8275SEric Cheng mblk_t *, mp); 3281*8275SEric Cheng freemsg(mp); 3282*8275SEric Cheng mp = next; 3283*8275SEric Cheng FLOW_REFRELE(dst_flow_ent); 3284*8275SEric Cheng continue; 3285*8275SEric Cheng } 3286*8275SEric Cheng flow_cookie = mac_flow_get_client_cookie(dst_flow_ent); 3287*8275SEric Cheng if (flow_cookie != NULL) { 3288*8275SEric Cheng /* 3289*8275SEric Cheng * The vnic_bcast_send function expects 3290*8275SEric Cheng * to receive the sender MAC client 3291*8275SEric Cheng * as value for arg2. 3292*8275SEric Cheng */ 3293*8275SEric Cheng mac_bcast_send(flow_cookie, src_mcip, mp, 3294*8275SEric Cheng B_TRUE); 3295*8275SEric Cheng } else { 3296*8275SEric Cheng /* 3297*8275SEric Cheng * loopback the packet to a 3298*8275SEric Cheng * local MAC client. We force a context 3299*8275SEric Cheng * switch if both source and destination 3300*8275SEric Cheng * MAC clients are used by IP, i.e. bypass 3301*8275SEric Cheng * is set. 3302*8275SEric Cheng */ 3303*8275SEric Cheng boolean_t do_switch; 3304*8275SEric Cheng mac_client_impl_t *dst_mcip = 3305*8275SEric Cheng dst_flow_ent->fe_mcip; 3306*8275SEric Cheng 3307*8275SEric Cheng do_switch = ((src_mcip->mci_state_flags & 3308*8275SEric Cheng dst_mcip->mci_state_flags & 3309*8275SEric Cheng MCIS_CLIENT_POLL_CAPABLE) != 0); 3310*8275SEric Cheng 3311*8275SEric Cheng if ((mp1 = mac_fix_cksum(mp)) != NULL) { 3312*8275SEric Cheng (dst_flow_ent->fe_cb_fn)( 3313*8275SEric Cheng dst_flow_ent->fe_cb_arg1, 3314*8275SEric Cheng dst_flow_ent->fe_cb_arg2, 3315*8275SEric Cheng mp1, do_switch); 3316*8275SEric Cheng } 3317*8275SEric Cheng } 3318*8275SEric Cheng FLOW_REFRELE(dst_flow_ent); 3319*8275SEric Cheng } else { 3320*8275SEric Cheng /* 3321*8275SEric Cheng * Unknown destination, send via the underlying 3322*8275SEric Cheng * NIC. 3323*8275SEric Cheng */ 3324*8275SEric Cheng MAC_TX(mip, ring, mp, src_mcip); 3325*8275SEric Cheng if (mp != NULL) { 3326*8275SEric Cheng /* 3327*8275SEric Cheng * Adjust for the last packet that 3328*8275SEric Cheng * could not be transmitted 3329*8275SEric Cheng */ 3330*8275SEric Cheng opackets--; 3331*8275SEric Cheng obytes -= pkt_size; 3332*8275SEric Cheng mp->b_next = next; 3333*8275SEric Cheng break; 3334*8275SEric Cheng } 3335*8275SEric Cheng } 3336*8275SEric Cheng mp = next; 3337*8275SEric Cheng } 3338*8275SEric Cheng 3339*8275SEric Cheng done: 3340*8275SEric Cheng src_mcip->mci_stat_obytes += obytes; 3341*8275SEric Cheng src_mcip->mci_stat_opackets += opackets; 3342*8275SEric Cheng src_mcip->mci_stat_oerrors += oerrors; 3343*8275SEric Cheng 3344*8275SEric Cheng if (stats != NULL) { 3345*8275SEric Cheng stats->ts_opackets = opackets; 3346*8275SEric Cheng stats->ts_obytes = obytes; 3347*8275SEric Cheng stats->ts_oerrors = oerrors; 3348*8275SEric Cheng } 3349*8275SEric Cheng return (mp); 3350*8275SEric Cheng } 3351*8275SEric Cheng 3352*8275SEric Cheng /* 3353*8275SEric Cheng * mac_tx_srs_ring_present 3354*8275SEric Cheng * 3355*8275SEric Cheng * Returns whether the specified ring is part of the specified SRS. 
3356*8275SEric Cheng */ 3357*8275SEric Cheng boolean_t 3358*8275SEric Cheng mac_tx_srs_ring_present(mac_soft_ring_set_t *srs, mac_ring_t *tx_ring) 3359*8275SEric Cheng { 3360*8275SEric Cheng int i; 3361*8275SEric Cheng mac_soft_ring_t *soft_ring; 3362*8275SEric Cheng 3363*8275SEric Cheng if (srs->srs_tx.st_arg2 == tx_ring) 3364*8275SEric Cheng return (B_TRUE); 3365*8275SEric Cheng 3366*8275SEric Cheng for (i = 0; i < srs->srs_oth_ring_count; i++) { 3367*8275SEric Cheng soft_ring = srs->srs_oth_soft_rings[i]; 3368*8275SEric Cheng if (soft_ring->s_ring_tx_arg2 == tx_ring) 3369*8275SEric Cheng return (B_TRUE); 3370*8275SEric Cheng } 3371*8275SEric Cheng 3372*8275SEric Cheng return (B_FALSE); 3373*8275SEric Cheng } 3374*8275SEric Cheng 3375*8275SEric Cheng /* 3376*8275SEric Cheng * mac_tx_srs_wakeup 3377*8275SEric Cheng * 3378*8275SEric Cheng * Called when Tx desc become available. Wakeup the appropriate worker 3379*8275SEric Cheng * thread after resetting the SRS_TX_BLOCKED/S_RING_BLOCK bit in the 3380*8275SEric Cheng * state field. 3381*8275SEric Cheng */ 3382*8275SEric Cheng void 3383*8275SEric Cheng mac_tx_srs_wakeup(mac_soft_ring_set_t *mac_srs, mac_ring_handle_t ring) 3384*8275SEric Cheng { 3385*8275SEric Cheng int i; 3386*8275SEric Cheng mac_soft_ring_t *sringp; 3387*8275SEric Cheng mac_srs_tx_t *srs_tx = &mac_srs->srs_tx; 3388*8275SEric Cheng 3389*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 3390*8275SEric Cheng if (TX_SINGLE_RING_MODE(mac_srs)) { 3391*8275SEric Cheng if (srs_tx->st_arg2 == ring && 3392*8275SEric Cheng mac_srs->srs_state & SRS_TX_BLOCKED) { 3393*8275SEric Cheng mac_srs->srs_state &= ~SRS_TX_BLOCKED; 3394*8275SEric Cheng srs_tx->st_unblocked_cnt++; 3395*8275SEric Cheng cv_signal(&mac_srs->srs_async); 3396*8275SEric Cheng } 3397*8275SEric Cheng /* 3398*8275SEric Cheng * A wakeup can come before tx_srs_drain() could 3399*8275SEric Cheng * grab srs lock and set SRS_TX_BLOCKED. So 3400*8275SEric Cheng * always set woken_up flag when we come here. 3401*8275SEric Cheng */ 3402*8275SEric Cheng srs_tx->st_woken_up = B_TRUE; 3403*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 3404*8275SEric Cheng return; 3405*8275SEric Cheng } 3406*8275SEric Cheng 3407*8275SEric Cheng /* If you are here, it is for FANOUT or BW_FANOUT case */ 3408*8275SEric Cheng ASSERT(TX_MULTI_RING_MODE(mac_srs)); 3409*8275SEric Cheng for (i = 0; i < mac_srs->srs_oth_ring_count; i++) { 3410*8275SEric Cheng sringp = mac_srs->srs_oth_soft_rings[i]; 3411*8275SEric Cheng mutex_enter(&sringp->s_ring_lock); 3412*8275SEric Cheng if (sringp->s_ring_tx_arg2 == ring) { 3413*8275SEric Cheng if (sringp->s_ring_state & S_RING_BLOCK) { 3414*8275SEric Cheng sringp->s_ring_state &= ~S_RING_BLOCK; 3415*8275SEric Cheng sringp->s_ring_unblocked_cnt++; 3416*8275SEric Cheng cv_signal(&sringp->s_ring_async); 3417*8275SEric Cheng } 3418*8275SEric Cheng sringp->s_ring_tx_woken_up = B_TRUE; 3419*8275SEric Cheng } 3420*8275SEric Cheng mutex_exit(&sringp->s_ring_lock); 3421*8275SEric Cheng } 3422*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 3423*8275SEric Cheng } 3424*8275SEric Cheng 3425*8275SEric Cheng /* 3426*8275SEric Cheng * Once the driver is done draining, send a MAC_NOTE_TX notification to unleash 3427*8275SEric Cheng * the blocked clients again. 
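 *
 * A driver-side sketch of how this path is typically triggered (an
 * assumption based on GLDv3 conventions, not code from this file;
 * 'xx' is a hypothetical driver): the Tx descriptor reclaim routine
 * calls mac_tx_update(), which leads to mac_tx_srs_wakeup() above and
 * finally to this notification:
 *
 *	static void
 *	xx_tx_reclaim(xx_softc_t *xsp)
 *	{
 *		... free completed Tx descriptors ...
 *		if (xsp->xx_resched_needed)
 *			mac_tx_update(xsp->xx_mac_handle);
 *	}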
3428*8275SEric Cheng */ 3429*8275SEric Cheng void 3430*8275SEric Cheng mac_tx_notify(mac_impl_t *mip) 3431*8275SEric Cheng { 3432*8275SEric Cheng i_mac_notify(mip, MAC_NOTE_TX); 3433*8275SEric Cheng } 3434*8275SEric Cheng 3435*8275SEric Cheng /* 3436*8275SEric Cheng * RX SOFTRING RELATED FUNCTIONS 3437*8275SEric Cheng * 3438*8275SEric Cheng * These functions really belong in mac_soft_ring.c and are here for 3439*8275SEric Cheng * a short period. 3440*8275SEric Cheng */ 3441*8275SEric Cheng 3442*8275SEric Cheng #define SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3443*8275SEric Cheng /* \ 3444*8275SEric Cheng * Enqueue our mblk chain. \ 3445*8275SEric Cheng */ \ 3446*8275SEric Cheng ASSERT(MUTEX_HELD(&(ringp)->s_ring_lock)); \ 3447*8275SEric Cheng \ 3448*8275SEric Cheng if ((ringp)->s_ring_last != NULL) \ 3449*8275SEric Cheng (ringp)->s_ring_last->b_next = (mp); \ 3450*8275SEric Cheng else \ 3451*8275SEric Cheng (ringp)->s_ring_first = (mp); \ 3452*8275SEric Cheng (ringp)->s_ring_last = (tail); \ 3453*8275SEric Cheng (ringp)->s_ring_count += (cnt); \ 3454*8275SEric Cheng ASSERT((ringp)->s_ring_count > 0); \ 3455*8275SEric Cheng if ((ringp)->s_ring_type & ST_RING_BW_CTL) { \ 3456*8275SEric Cheng (ringp)->s_ring_size += sz; \ 3457*8275SEric Cheng } \ 3458*8275SEric Cheng } 3459*8275SEric Cheng 3460*8275SEric Cheng /* 3461*8275SEric Cheng * Default entry point to deliver a packet chain to a MAC client. 3462*8275SEric Cheng * If the MAC client has flows, do the classification with these 3463*8275SEric Cheng * flows as well. 3464*8275SEric Cheng */ 3465*8275SEric Cheng /* ARGSUSED */ 3466*8275SEric Cheng void 3467*8275SEric Cheng mac_rx_deliver(void *arg1, mac_resource_handle_t mrh, mblk_t *mp_chain, 3468*8275SEric Cheng mac_header_info_t *arg3) 3469*8275SEric Cheng { 3470*8275SEric Cheng mac_client_impl_t *mcip = arg1; 3471*8275SEric Cheng 3472*8275SEric Cheng if (mcip->mci_nvids == 1 && 3473*8275SEric Cheng !(mcip->mci_state_flags & MCIS_TAG_DISABLE)) { 3474*8275SEric Cheng /* 3475*8275SEric Cheng * If the client has exactly one VID associated with it 3476*8275SEric Cheng * and stripping of VLAN header is not disabled, 3477*8275SEric Cheng * remove the VLAN tag from the packet before 3478*8275SEric Cheng * passing it on to the client's receive callback. 3479*8275SEric Cheng * Note that this needs to be done after we dispatch 3480*8275SEric Cheng * the packet to the promiscuous listeners of the 3481*8275SEric Cheng * client, since they expect to see the whole 3482*8275SEric Cheng * frame including the VLAN headers. 3483*8275SEric Cheng */ 3484*8275SEric Cheng mp_chain = mac_strip_vlan_tag_chain(mp_chain); 3485*8275SEric Cheng } 3486*8275SEric Cheng 3487*8275SEric Cheng mcip->mci_rx_fn(mcip->mci_rx_arg, mrh, mp_chain, B_FALSE); 3488*8275SEric Cheng } 3489*8275SEric Cheng 3490*8275SEric Cheng /* 3491*8275SEric Cheng * mac_rx_soft_ring_process 3492*8275SEric Cheng * 3493*8275SEric Cheng * Process a chain for a given soft ring. If the number of packets 3494*8275SEric Cheng * queued in the SRS and its associated soft rings (including this 3495*8275SEric Cheng * one) is very small (tracked by srs_poll_pkt_cnt), then allow the 3496*8275SEric Cheng * entering thread (interrupt or poll thread) to do inline processing. 3497*8275SEric Cheng * This helps keep the latency down under low load. 3498*8275SEric Cheng * 3499*8275SEric Cheng * The proc and arg for each mblk is already stored in the mblk in 3500*8275SEric Cheng * appropriate places.
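 *
 * A condensed sketch of the dispatch decision made below (for
 * illustration only; the function body is authoritative):
 *
 *	if (inline-eligible: ST_RING_ANY ring, or the SRS backlog
 *	    (sr_poll_pkt_cnt) is tiny and enqueue_always is not set)
 *		process inline unless the ring is blanked, already
 *		being processed, or has packets queued;
 *	else
 *		enqueue for the worker and wake it up.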
3501*8275SEric Cheng */ 3502*8275SEric Cheng /* ARGSUSED */ 3503*8275SEric Cheng void 3504*8275SEric Cheng mac_rx_soft_ring_process(mac_client_impl_t *mcip, mac_soft_ring_t *ringp, 3505*8275SEric Cheng mblk_t *mp_chain, mblk_t *tail, int cnt, size_t sz) 3506*8275SEric Cheng { 3507*8275SEric Cheng mac_direct_rx_t proc; 3508*8275SEric Cheng void *arg1; 3509*8275SEric Cheng mac_resource_handle_t arg2; 3510*8275SEric Cheng mac_soft_ring_set_t *mac_srs = ringp->s_ring_set; 3511*8275SEric Cheng 3512*8275SEric Cheng ASSERT(ringp != NULL); 3513*8275SEric Cheng ASSERT(mp_chain != NULL); 3514*8275SEric Cheng ASSERT(tail != NULL); 3515*8275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3516*8275SEric Cheng 3517*8275SEric Cheng mutex_enter(&ringp->s_ring_lock); 3518*8275SEric Cheng ringp->s_ring_total_inpkt += cnt; 3519*8275SEric Cheng if ((ringp->s_ring_type & ST_RING_ANY) || 3520*8275SEric Cheng ((mac_srs->srs_rx.sr_poll_pkt_cnt <= 1) && 3521*8275SEric Cheng !mac_srs->srs_rx.sr_enqueue_always)) { 3522*8275SEric Cheng /* If on processor or blanking on, then enqueue and return */ 3523*8275SEric Cheng if (ringp->s_ring_state & S_RING_BLANK || 3524*8275SEric Cheng ringp->s_ring_state & S_RING_PROC) { 3525*8275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3526*8275SEric Cheng mutex_exit(&ringp->s_ring_lock); 3527*8275SEric Cheng return; 3528*8275SEric Cheng } 3529*8275SEric Cheng 3530*8275SEric Cheng proc = ringp->s_ring_rx_func; 3531*8275SEric Cheng arg1 = ringp->s_ring_rx_arg1; 3532*8275SEric Cheng arg2 = ringp->s_ring_rx_arg2; 3533*8275SEric Cheng /* 3534*8275SEric Cheng * See if anything is already queued. If we are the 3535*8275SEric Cheng * first packet, do inline processing else queue the 3536*8275SEric Cheng * packet and do the drain. 3537*8275SEric Cheng */ 3538*8275SEric Cheng if (ringp->s_ring_first == NULL) { 3539*8275SEric Cheng /* 3540*8275SEric Cheng * Fast-path, ok to process and nothing queued. 3541*8275SEric Cheng */ 3542*8275SEric Cheng ringp->s_ring_run = curthread; 3543*8275SEric Cheng ringp->s_ring_state |= (S_RING_PROC); 3544*8275SEric Cheng 3545*8275SEric Cheng mutex_exit(&ringp->s_ring_lock); 3546*8275SEric Cheng 3547*8275SEric Cheng /* 3548*8275SEric Cheng * We are the chain of 1 packet so 3549*8275SEric Cheng * go through this fast path. 3550*8275SEric Cheng */ 3551*8275SEric Cheng ASSERT(mp_chain->b_next == NULL); 3552*8275SEric Cheng 3553*8275SEric Cheng (*proc)(arg1, arg2, mp_chain, NULL); 3554*8275SEric Cheng 3555*8275SEric Cheng ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock)); 3556*8275SEric Cheng /* 3557*8275SEric Cheng * If we have a soft ring set which is doing 3558*8275SEric Cheng * bandwidth control, we need to decrement 3559*8275SEric Cheng * srs_size and count so that the SRS can have an 3560*8275SEric Cheng * accurate idea of the real data 3561*8275SEric Cheng * queued between SRS and its soft rings. We 3562*8275SEric Cheng * decrement the counters only when the packet 3563*8275SEric Cheng * gets processed by both SRS and the soft ring.
3564*8275SEric Cheng */ 3565*8275SEric Cheng mutex_enter(&mac_srs->srs_lock); 3566*8275SEric Cheng MAC_UPDATE_SRS_COUNT_LOCKED(mac_srs, cnt); 3567*8275SEric Cheng MAC_UPDATE_SRS_SIZE_LOCKED(mac_srs, sz); 3568*8275SEric Cheng mutex_exit(&mac_srs->srs_lock); 3569*8275SEric Cheng 3570*8275SEric Cheng mutex_enter(&ringp->s_ring_lock); 3571*8275SEric Cheng ringp->s_ring_run = NULL; 3572*8275SEric Cheng ringp->s_ring_state &= ~S_RING_PROC; 3573*8275SEric Cheng if (ringp->s_ring_state & S_RING_CLIENT_WAIT) 3574*8275SEric Cheng cv_signal(&ringp->s_ring_client_cv); 3575*8275SEric Cheng 3576*8275SEric Cheng if ((ringp->s_ring_first == NULL) || 3577*8275SEric Cheng (ringp->s_ring_state & S_RING_BLANK)) { 3578*8275SEric Cheng /* 3579*8275SEric Cheng * We processed inline our packet and 3580*8275SEric Cheng * nothing new has arrived or our 3581*8275SEric Cheng * receiver doesn't want to receive 3582*8275SEric Cheng * any packets. We are done. 3583*8275SEric Cheng */ 3584*8275SEric Cheng mutex_exit(&ringp->s_ring_lock); 3585*8275SEric Cheng return; 3586*8275SEric Cheng } 3587*8275SEric Cheng } else { 3588*8275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, 3589*8275SEric Cheng mp_chain, tail, cnt, sz); 3590*8275SEric Cheng } 3591*8275SEric Cheng 3592*8275SEric Cheng /* 3593*8275SEric Cheng * We are here because either we couldn't do inline 3594*8275SEric Cheng * processing (because something was already 3595*8275SEric Cheng * queued), or we had a chain of more than one 3596*8275SEric Cheng * packet, or something else arrived after we were 3597*8275SEric Cheng * done with inline processing. 3598*8275SEric Cheng */ 3599*8275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); 3600*8275SEric Cheng ASSERT(ringp->s_ring_first != NULL); 3601*8275SEric Cheng 3602*8275SEric Cheng ringp->s_ring_drain_func(ringp); 3603*8275SEric Cheng mutex_exit(&ringp->s_ring_lock); 3604*8275SEric Cheng return; 3605*8275SEric Cheng } else { 3606*8275SEric Cheng /* ST_RING_WORKER_ONLY case */ 3607*8275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); 3608*8275SEric Cheng mac_soft_ring_worker_wakeup(ringp); 3609*8275SEric Cheng mutex_exit(&ringp->s_ring_lock); 3610*8275SEric Cheng } 3611*8275SEric Cheng } 3612*8275SEric Cheng 3613*8275SEric Cheng /* 3614*8275SEric Cheng * TX SOFTRING RELATED FUNCTIONS 3615*8275SEric Cheng * 3616*8275SEric Cheng * These functions really belong in mac_soft_ring.c and are here for 3617*8275SEric Cheng * a short period. 3618*8275SEric Cheng */ 3619*8275SEric Cheng 3620*8275SEric Cheng #define TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp, tail, cnt, sz) { \ 3621*8275SEric Cheng ASSERT(MUTEX_HELD(&ringp->s_ring_lock)); \ 3622*8275SEric Cheng ringp->s_ring_state |= S_RING_ENQUEUED; \ 3623*8275SEric Cheng SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz); \ 3624*8275SEric Cheng } 3625*8275SEric Cheng 3626*8275SEric Cheng /* 3627*8275SEric Cheng * mac_tx_sring_enqueue 3628*8275SEric Cheng * 3629*8275SEric Cheng * When we are out of transmit descriptors and we already have a 3630*8275SEric Cheng * queue that exceeds hiwat (or the client called us with 3631*8275SEric Cheng * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the 3632*8275SEric Cheng * soft ring pointer as the opaque cookie for the client to enable 3633*8275SEric Cheng * flow control.
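 *
 * Illustratively (an assumption, not code from this file), a caller
 * treats the returned cookie as an opaque flow control handle:
 *
 *	cookie = mac_tx_soft_ring_process(ringp, chain, flag, &unsent);
 *	if (cookie != NULL)
 *		back off: the soft ring identified by the cookie is
 *		flow controlled, and a Tx notification will indicate
 *		when transmission may resume.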

/*
 * mac_tx_sring_enqueue
 *
 * When we are out of transmit descriptors and we already have a
 * queue that exceeds hiwat (or the client called us with the
 * MAC_TX_NO_ENQUEUE or MAC_DROP_ON_NO_DESC flag), return the
 * soft ring pointer as the opaque cookie the client can use to
 * enable flow control.
 */
static mac_tx_cookie_t
mac_tx_sring_enqueue(mac_soft_ring_t *ringp, mblk_t *mp_chain, uint16_t flag,
    mblk_t **ret_mp)
{
    int cnt;
    size_t sz;
    mblk_t *tail;
    mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
    mac_tx_cookie_t cookie = NULL;
    boolean_t wakeup_worker = B_TRUE;

    ASSERT(MUTEX_HELD(&ringp->s_ring_lock));
    MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
    if (flag & MAC_DROP_ON_NO_DESC) {
        mac_pkt_drop(NULL, NULL, mp_chain, B_FALSE);
        /* increment freed stats */
        ringp->s_ring_drops += cnt;
        cookie = (mac_tx_cookie_t)ringp;
    } else {
        if (ringp->s_ring_first != NULL)
            wakeup_worker = B_FALSE;

        if (flag & MAC_TX_NO_ENQUEUE) {
            /*
             * If nothing is queued yet, queue the packet
             * and let mac_tx_soft_ring_drain() set the
             * TX_BLOCKED bit for the reasons explained
             * above. Otherwise, return the mblks to the
             * caller.
             */
            if (wakeup_worker) {
                TX_SOFT_RING_ENQUEUE_CHAIN(ringp,
                    mp_chain, tail, cnt, sz);
            } else {
                ringp->s_ring_state |= S_RING_WAKEUP_CLIENT;
                cookie = (mac_tx_cookie_t)ringp;
                *ret_mp = mp_chain;
            }
        } else {
            boolean_t enqueue = B_TRUE;

            if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
                /*
                 * Flow-controlled. Store ringp in the
                 * cookie so that it can be returned as
                 * a mac_tx_cookie_t to the client.
                 */
                ringp->s_ring_state |= S_RING_TX_HIWAT;
                cookie = (mac_tx_cookie_t)ringp;
                ringp->s_ring_hiwat_cnt++;
                if (ringp->s_ring_count >
                    ringp->s_ring_tx_max_q_cnt) {
                    /* increment freed stats */
                    ringp->s_ring_drops += cnt;
                    /*
                     * b_prev may be set to the fanout
                     * hint, hence we can't use freemsg
                     * directly.
                     */
                    mac_pkt_drop(NULL, NULL,
                        mp_chain, B_FALSE);
                    DTRACE_PROBE1(tx_queued_hiwat,
                        mac_soft_ring_t *, ringp);
                    enqueue = B_FALSE;
                }
            }
            if (enqueue) {
                TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain,
                    tail, cnt, sz);
            }
        }
        if (wakeup_worker)
            cv_signal(&ringp->s_ring_async);
    }
    return (cookie);
}
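
/*
 * Illustrative sketch (not part of the driver): the routine above
 * implements a two-threshold backpressure scheme. Once the queue
 * length crosses the high-water mark, the caller gets back a non-NULL
 * cookie (backpressure); once it crosses the hard maximum, the chain
 * is dropped outright. A minimal userland sketch of the same policy,
 * with hypothetical names (txq_t, drop_pkt, append_pkt), follows.
 */
#if 0    /* illustrative sketch only, never compiled */
typedef struct txq {
    int    q_count;    /* packets currently queued */
    int    q_hiwat;    /* backpressure threshold */
    int    q_max;        /* hard drop threshold */
    int    q_blocked;    /* backpressure flag */
} txq_t;

/*
 * Returns NULL when the packet was accepted without backpressure,
 * or an opaque cookie (here, the queue itself) when the sender
 * should stop and wait for a wakeup.
 */
static void *
txq_enqueue(txq_t *q, void *pkt)
{
    void *cookie = NULL;

    if (q->q_count > q->q_hiwat) {
        q->q_blocked = 1;        /* tell the sender to back off */
        cookie = q;
        if (q->q_count > q->q_max) {
            drop_pkt(pkt);        /* hypothetical helper */
            return (cookie);    /* dropped, still blocked */
        }
    }
    append_pkt(q, pkt);            /* hypothetical helper */
    q->q_count++;
    return (cookie);
}
#endif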


/*
 * mac_tx_soft_ring_process
 *
 * This routine is called when fanning out outgoing traffic among
 * multiple Tx rings.
 * Note that a soft ring is associated with a h/w Tx ring.
 */
mac_tx_cookie_t
mac_tx_soft_ring_process(mac_soft_ring_t *ringp, mblk_t *mp_chain,
    uint16_t flag, mblk_t **ret_mp)
{
    mac_soft_ring_set_t *mac_srs = ringp->s_ring_set;
    int cnt;
    size_t sz;
    mblk_t *tail;
    mac_tx_cookie_t cookie = NULL;

    ASSERT(ringp != NULL);
    ASSERT(mp_chain != NULL);
    ASSERT(MUTEX_NOT_HELD(&ringp->s_ring_lock));
    /*
     * Only two modes can come here: either
     * SRS_TX_BW_FANOUT or SRS_TX_FANOUT.
     */
    ASSERT(mac_srs->srs_tx.st_mode == SRS_TX_FANOUT ||
        mac_srs->srs_tx.st_mode == SRS_TX_BW_FANOUT);

    if (ringp->s_ring_type & ST_RING_WORKER_ONLY) {
        /* Serialization mode */

        mutex_enter(&ringp->s_ring_lock);
        if (ringp->s_ring_count > ringp->s_ring_tx_hiwat) {
            cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                flag, ret_mp);
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        MAC_COUNT_CHAIN(mac_srs, mp_chain, tail, cnt, sz);
        TX_SOFT_RING_ENQUEUE_CHAIN(ringp, mp_chain, tail, cnt, sz);
        if (ringp->s_ring_state & (S_RING_BLOCK | S_RING_PROC)) {
            /*
             * If the ring is blocked due to a lack of Tx
             * descriptors, just return. The
             * worker thread will get scheduled when Tx
             * descriptors become available.
             */
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        mac_soft_ring_worker_wakeup(ringp);
        mutex_exit(&ringp->s_ring_lock);
        return (cookie);
    } else {
        /* Default fanout mode */
        /*
         * S_RING_BLOCKED is set when the underlying NIC runs
         * out of Tx descriptors and messages start getting
         * queued. It won't get reset until
         * mac_tx_soft_ring_drain() completely drains out the
         * messages.
         */
        boolean_t is_subflow;
        mac_tx_stats_t stats;

        if (ringp->s_ring_state & S_RING_ENQUEUED) {
            /* Tx descs/resources not available */
            mutex_enter(&ringp->s_ring_lock);
            if (ringp->s_ring_state & S_RING_ENQUEUED) {
                cookie = mac_tx_sring_enqueue(ringp, mp_chain,
                    flag, ret_mp);
                mutex_exit(&ringp->s_ring_lock);
                return (cookie);
            }
            /*
             * While we were computing the mblk count, the
             * flow control condition got relieved.
             * Continue with the transmission.
             */
            mutex_exit(&ringp->s_ring_lock);
        }
        is_subflow = ((mac_srs->srs_type & SRST_FLOW) != 0);

        mp_chain = mac_tx_send(ringp->s_ring_tx_arg1,
            ringp->s_ring_tx_arg2, mp_chain,
            (is_subflow ? &stats : NULL));

        /*
         * Multiple threads could be here sending packets.
         * Under such conditions, it is not possible to
         * atomically set the S_RING_BLOCKED bit to indicate
         * an out-of-Tx-descriptor condition. To set it
         * atomically, we queue the returned packet and do
         * the setting of S_RING_BLOCKED in
         * mac_tx_soft_ring_drain().
         */
        if (mp_chain != NULL) {
            mutex_enter(&ringp->s_ring_lock);
            cookie =
                mac_tx_sring_enqueue(ringp, mp_chain, flag, ret_mp);
            mutex_exit(&ringp->s_ring_lock);
            return (cookie);
        }
        if (is_subflow) {
            FLOW_TX_STATS_UPDATE(mac_srs->srs_flent, &stats);
        }
        return (NULL);
    }
}
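
/*
 * Illustrative sketch (not part of the driver): the fanout path above
 * uses a check/lock/re-check sequence on S_RING_ENQUEUED. The flag is
 * tested once without the lock to keep the common (uncongested) path
 * cheap, and then tested again under s_ring_lock before acting on it,
 * since the congestion may have been relieved in between. A minimal
 * userland sketch of the same pattern, with hypothetical names
 * (try_send, enqueue_for_worker, send_direct), follows.
 */
#if 0    /* illustrative sketch only, never compiled */
#include <pthread.h>

static pthread_mutex_t ring_lock = PTHREAD_MUTEX_INITIALIZER;
static volatile int ring_congested;    /* analogous to S_RING_ENQUEUED */

static int
try_send(void *pkt)
{
    if (ring_congested) {        /* cheap unlocked hint */
        pthread_mutex_lock(&ring_lock);
        if (ring_congested) {    /* authoritative re-check */
            int rc = enqueue_for_worker(pkt);    /* hypothetical */
            pthread_mutex_unlock(&ring_lock);
            return (rc);
        }
        /* Congestion was relieved while we took the lock. */
        pthread_mutex_unlock(&ring_lock);
    }
    return (send_direct(pkt));    /* hypothetical fast path */
}
#endif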