/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2010 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * IEEE 802.3ad Link Aggregation - Send code.
 *
 * Implements the Distributor function.
 */

#include <sys/conf.h>
#include <sys/modctl.h>
#include <sys/sunddi.h>
#include <sys/callb.h>
#include <sys/vlan.h>
#include <sys/strsun.h>
#include <sys/strsubr.h>
#include <sys/dlpi.h>

#include <inet/common.h>
#include <inet/led.h>
#include <inet/ip.h>
#include <inet/ip6.h>
#include <inet/tcp.h>
#include <netinet/udp.h>

#include <sys/aggr.h>
#include <sys/aggr_impl.h>

510Sstevel@tonic-gate /*
520Sstevel@tonic-gate * Update the TX load balancing policy of the specified group.
530Sstevel@tonic-gate */
540Sstevel@tonic-gate void
aggr_send_update_policy(aggr_grp_t * grp,uint32_t policy)550Sstevel@tonic-gate aggr_send_update_policy(aggr_grp_t *grp, uint32_t policy)
560Sstevel@tonic-gate {
578833SVenu.Iyer@Sun.COM uint8_t mac_policy = 0;
588833SVenu.Iyer@Sun.COM
598275SEric Cheng ASSERT(MAC_PERIM_HELD(grp->lg_mh));
600Sstevel@tonic-gate
618833SVenu.Iyer@Sun.COM if ((policy & AGGR_POLICY_L2) != 0)
628833SVenu.Iyer@Sun.COM mac_policy |= MAC_PKT_HASH_L2;
638833SVenu.Iyer@Sun.COM if ((policy & AGGR_POLICY_L3) != 0)
648833SVenu.Iyer@Sun.COM mac_policy |= MAC_PKT_HASH_L3;
658833SVenu.Iyer@Sun.COM if ((policy & AGGR_POLICY_L4) != 0)
668833SVenu.Iyer@Sun.COM mac_policy |= MAC_PKT_HASH_L4;
678833SVenu.Iyer@Sun.COM
680Sstevel@tonic-gate grp->lg_tx_policy = policy;
698833SVenu.Iyer@Sun.COM grp->lg_mac_tx_policy = mac_policy;
700Sstevel@tonic-gate }
710Sstevel@tonic-gate
/*
 * Mix a hint value by XORing it with copies of itself shifted right by
 * 24, 16 and 8 bits, so that its upper bytes also influence the low bits
 * that a subsequent modulo operation looks at.
 */
#define	HASH_HINT(hint)	\
	((hint) ^ ((hint) >> 24) ^ ((hint) >> 16) ^ ((hint) >> 8))

75*11878SVenu.Iyer@Sun.COM /*
76*11878SVenu.Iyer@Sun.COM * Function invoked by mac layer to find a specific TX ring on a port
77*11878SVenu.Iyer@Sun.COM * to send data.
78*11878SVenu.Iyer@Sun.COM */
79*11878SVenu.Iyer@Sun.COM mblk_t *
aggr_find_tx_ring(void * arg,mblk_t * mp,uintptr_t hint,mac_ring_handle_t * rh)80*11878SVenu.Iyer@Sun.COM aggr_find_tx_ring(void *arg, mblk_t *mp, uintptr_t hint, mac_ring_handle_t *rh)
81*11878SVenu.Iyer@Sun.COM {
82*11878SVenu.Iyer@Sun.COM aggr_grp_t *grp = arg;
83*11878SVenu.Iyer@Sun.COM aggr_port_t *port;
84*11878SVenu.Iyer@Sun.COM uint64_t hash;
85*11878SVenu.Iyer@Sun.COM
86*11878SVenu.Iyer@Sun.COM rw_enter(&grp->lg_tx_lock, RW_READER);
87*11878SVenu.Iyer@Sun.COM if (grp->lg_ntx_ports == 0) {
88*11878SVenu.Iyer@Sun.COM /*
89*11878SVenu.Iyer@Sun.COM * We could have returned from aggr_m_start() before
90*11878SVenu.Iyer@Sun.COM * the ports were actually attached. Drop the chain.
91*11878SVenu.Iyer@Sun.COM */
92*11878SVenu.Iyer@Sun.COM rw_exit(&grp->lg_tx_lock);
93*11878SVenu.Iyer@Sun.COM freemsgchain(mp);
94*11878SVenu.Iyer@Sun.COM return (NULL);
95*11878SVenu.Iyer@Sun.COM }
96*11878SVenu.Iyer@Sun.COM hash = mac_pkt_hash(DL_ETHER, mp, grp->lg_mac_tx_policy, B_TRUE);
97*11878SVenu.Iyer@Sun.COM port = grp->lg_tx_ports[hash % grp->lg_ntx_ports];
98*11878SVenu.Iyer@Sun.COM
99*11878SVenu.Iyer@Sun.COM /*
100*11878SVenu.Iyer@Sun.COM * Use hash as the hint so to direct traffic to
101*11878SVenu.Iyer@Sun.COM * different TX rings. Note below bit operation
102*11878SVenu.Iyer@Sun.COM * is needed in case hint is 0 to get the most
103*11878SVenu.Iyer@Sun.COM * benefit from HASH_HINT() algorithm.
104*11878SVenu.Iyer@Sun.COM */
105*11878SVenu.Iyer@Sun.COM if (port->lp_tx_ring_cnt > 1) {
106*11878SVenu.Iyer@Sun.COM if (hint == 0) {
107*11878SVenu.Iyer@Sun.COM hash = (hash << 24 | hash << 16 | hash);
108*11878SVenu.Iyer@Sun.COM hash = (hash << 32 | hash);
109*11878SVenu.Iyer@Sun.COM } else {
110*11878SVenu.Iyer@Sun.COM hash = hint;
111*11878SVenu.Iyer@Sun.COM }
112*11878SVenu.Iyer@Sun.COM hash = HASH_HINT(hash);
113*11878SVenu.Iyer@Sun.COM *rh = port->lp_pseudo_tx_rings[hash % port->lp_tx_ring_cnt];
114*11878SVenu.Iyer@Sun.COM } else {
115*11878SVenu.Iyer@Sun.COM *rh = port->lp_pseudo_tx_rings[0];
116*11878SVenu.Iyer@Sun.COM }
117*11878SVenu.Iyer@Sun.COM rw_exit(&grp->lg_tx_lock);
118*11878SVenu.Iyer@Sun.COM
119*11878SVenu.Iyer@Sun.COM return (mp);
120*11878SVenu.Iyer@Sun.COM }
121*11878SVenu.Iyer@Sun.COM
122*11878SVenu.Iyer@Sun.COM /*
123*11878SVenu.Iyer@Sun.COM * aggr_tx_notify_thread:
124*11878SVenu.Iyer@Sun.COM *
125*11878SVenu.Iyer@Sun.COM * aggr_tx_ring_update() callback function wakes up this thread when
126*11878SVenu.Iyer@Sun.COM * it gets called. This thread will call mac_tx_ring_update() to
127*11878SVenu.Iyer@Sun.COM * notify upper mac of flow control getting relieved. Note that
128*11878SVenu.Iyer@Sun.COM * aggr_tx_ring_update() cannot call mac_tx_ring_update() directly
129*11878SVenu.Iyer@Sun.COM * because aggr_tx_ring_update() is called from lower mac with
130*11878SVenu.Iyer@Sun.COM * mi_rw_lock held.
131*11878SVenu.Iyer@Sun.COM */
132*11878SVenu.Iyer@Sun.COM void
aggr_tx_notify_thread(void * arg)133*11878SVenu.Iyer@Sun.COM aggr_tx_notify_thread(void *arg)
134*11878SVenu.Iyer@Sun.COM {
135*11878SVenu.Iyer@Sun.COM callb_cpr_t cprinfo;
136*11878SVenu.Iyer@Sun.COM aggr_grp_t *grp = (aggr_grp_t *)arg;
137*11878SVenu.Iyer@Sun.COM mac_ring_handle_t pseudo_mrh;
138*11878SVenu.Iyer@Sun.COM
139*11878SVenu.Iyer@Sun.COM CALLB_CPR_INIT(&cprinfo, &grp->lg_tx_flowctl_lock, callb_generic_cpr,
140*11878SVenu.Iyer@Sun.COM "aggr_tx_notify_thread");
141*11878SVenu.Iyer@Sun.COM
142*11878SVenu.Iyer@Sun.COM mutex_enter(&grp->lg_tx_flowctl_lock);
143*11878SVenu.Iyer@Sun.COM while (!grp->lg_tx_notify_done) {
144*11878SVenu.Iyer@Sun.COM if ((grp->lg_tx_blocked_cnt) == 0) {
145*11878SVenu.Iyer@Sun.COM CALLB_CPR_SAFE_BEGIN(&cprinfo);
146*11878SVenu.Iyer@Sun.COM cv_wait(&grp->lg_tx_flowctl_cv,
147*11878SVenu.Iyer@Sun.COM &grp->lg_tx_flowctl_lock);
148*11878SVenu.Iyer@Sun.COM CALLB_CPR_SAFE_END(&cprinfo, &grp->lg_tx_flowctl_lock);
149*11878SVenu.Iyer@Sun.COM continue;
150*11878SVenu.Iyer@Sun.COM }
151*11878SVenu.Iyer@Sun.COM while (grp->lg_tx_blocked_cnt != 0) {
152*11878SVenu.Iyer@Sun.COM grp->lg_tx_blocked_cnt--;
153*11878SVenu.Iyer@Sun.COM pseudo_mrh =
154*11878SVenu.Iyer@Sun.COM grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt];
155*11878SVenu.Iyer@Sun.COM mutex_exit(&grp->lg_tx_flowctl_lock);
156*11878SVenu.Iyer@Sun.COM mac_tx_ring_update(grp->lg_mh, pseudo_mrh);
157*11878SVenu.Iyer@Sun.COM mutex_enter(&grp->lg_tx_flowctl_lock);
158*11878SVenu.Iyer@Sun.COM }
159*11878SVenu.Iyer@Sun.COM }
160*11878SVenu.Iyer@Sun.COM /*
161*11878SVenu.Iyer@Sun.COM * The grp is being destroyed, exit the thread.
162*11878SVenu.Iyer@Sun.COM */
163*11878SVenu.Iyer@Sun.COM grp->lg_tx_notify_thread = NULL;
164*11878SVenu.Iyer@Sun.COM CALLB_CPR_EXIT(&cprinfo);
165*11878SVenu.Iyer@Sun.COM thread_exit();
166*11878SVenu.Iyer@Sun.COM }
167*11878SVenu.Iyer@Sun.COM
168*11878SVenu.Iyer@Sun.COM /*
169*11878SVenu.Iyer@Sun.COM * Callback function registered with lower mac to receive wakeups from
170*11878SVenu.Iyer@Sun.COM * drivers when flow control is relieved (i.e. Tx descriptors are
171*11878SVenu.Iyer@Sun.COM * available).
172*11878SVenu.Iyer@Sun.COM */
173*11878SVenu.Iyer@Sun.COM void
aggr_tx_ring_update(void * arg1,uintptr_t arg2)174*11878SVenu.Iyer@Sun.COM aggr_tx_ring_update(void *arg1, uintptr_t arg2)
175*11878SVenu.Iyer@Sun.COM {
176*11878SVenu.Iyer@Sun.COM aggr_port_t *port = (aggr_port_t *)arg1;
177*11878SVenu.Iyer@Sun.COM mac_ring_handle_t mrh = (mac_ring_handle_t)arg2;
178*11878SVenu.Iyer@Sun.COM mac_ring_handle_t pseudo_mrh;
179*11878SVenu.Iyer@Sun.COM aggr_grp_t *grp = port->lp_grp;
180*11878SVenu.Iyer@Sun.COM int i = 0;
181*11878SVenu.Iyer@Sun.COM
182*11878SVenu.Iyer@Sun.COM if (mrh == NULL) {
183*11878SVenu.Iyer@Sun.COM /*
184*11878SVenu.Iyer@Sun.COM * If the underlying NIC does not expose TX rings,
185*11878SVenu.Iyer@Sun.COM * still as pseudo TX ring is presented to the
186*11878SVenu.Iyer@Sun.COM * aggr mac.
187*11878SVenu.Iyer@Sun.COM */
188*11878SVenu.Iyer@Sun.COM pseudo_mrh = port->lp_pseudo_tx_rings[0];
189*11878SVenu.Iyer@Sun.COM } else {
190*11878SVenu.Iyer@Sun.COM for (i = 0; i < port->lp_tx_ring_cnt; i++) {
191*11878SVenu.Iyer@Sun.COM if (port->lp_tx_rings[i] == mrh)
192*11878SVenu.Iyer@Sun.COM break;
193*11878SVenu.Iyer@Sun.COM }
194*11878SVenu.Iyer@Sun.COM ASSERT(i < port->lp_tx_ring_cnt);
195*11878SVenu.Iyer@Sun.COM pseudo_mrh = port->lp_pseudo_tx_rings[i];
196*11878SVenu.Iyer@Sun.COM }
197*11878SVenu.Iyer@Sun.COM mutex_enter(&grp->lg_tx_flowctl_lock);
198*11878SVenu.Iyer@Sun.COM /*
199*11878SVenu.Iyer@Sun.COM * It could be possible that some (broken?) device driver
200*11878SVenu.Iyer@Sun.COM * could send more than one wakeup on the same ring. In
201*11878SVenu.Iyer@Sun.COM * such a case, multiple instances of the same pseudo TX
202*11878SVenu.Iyer@Sun.COM * ring should not be saved in lg_tx_blocked_rings[]
203*11878SVenu.Iyer@Sun.COM * array. So first check if woken up ring (pseudo_mrh) is
204*11878SVenu.Iyer@Sun.COM * already in the lg_tx_blocked_rings[] array.
205*11878SVenu.Iyer@Sun.COM */
206*11878SVenu.Iyer@Sun.COM for (i = 0; i < grp->lg_tx_blocked_cnt; i++) {
207*11878SVenu.Iyer@Sun.COM if (grp->lg_tx_blocked_rings[i] == pseudo_mrh) {
208*11878SVenu.Iyer@Sun.COM mutex_exit(&grp->lg_tx_flowctl_lock);
209*11878SVenu.Iyer@Sun.COM return;
210*11878SVenu.Iyer@Sun.COM }
211*11878SVenu.Iyer@Sun.COM }
212*11878SVenu.Iyer@Sun.COM /* A distinct mac_ring_handle. Save and increment count */
213*11878SVenu.Iyer@Sun.COM grp->lg_tx_blocked_rings[grp->lg_tx_blocked_cnt] = pseudo_mrh;
214*11878SVenu.Iyer@Sun.COM grp->lg_tx_blocked_cnt++;
215*11878SVenu.Iyer@Sun.COM cv_signal(&grp->lg_tx_flowctl_cv);
216*11878SVenu.Iyer@Sun.COM mutex_exit(&grp->lg_tx_flowctl_lock);
217*11878SVenu.Iyer@Sun.COM }
218*11878SVenu.Iyer@Sun.COM
2190Sstevel@tonic-gate /*
2200Sstevel@tonic-gate * Send function invoked by the MAC service module.
2210Sstevel@tonic-gate */
2220Sstevel@tonic-gate mblk_t *
aggr_ring_tx(void * arg,mblk_t * mp)223*11878SVenu.Iyer@Sun.COM aggr_ring_tx(void *arg, mblk_t *mp)
2240Sstevel@tonic-gate {
225*11878SVenu.Iyer@Sun.COM aggr_pseudo_tx_ring_t *pseudo_ring = (aggr_pseudo_tx_ring_t *)arg;
226*11878SVenu.Iyer@Sun.COM aggr_port_t *port = pseudo_ring->atr_port;
2278275SEric Cheng
228*11878SVenu.Iyer@Sun.COM return (mac_hwring_send_priv(port->lp_mch, pseudo_ring->atr_hw_rh, mp));
2290Sstevel@tonic-gate }
2300Sstevel@tonic-gate
2310Sstevel@tonic-gate /*
2320Sstevel@tonic-gate * Enable sending on the specified port.
2330Sstevel@tonic-gate */
2340Sstevel@tonic-gate void
aggr_send_port_enable(aggr_port_t * port)2350Sstevel@tonic-gate aggr_send_port_enable(aggr_port_t *port)
2360Sstevel@tonic-gate {
2370Sstevel@tonic-gate aggr_grp_t *grp = port->lp_grp;
2380Sstevel@tonic-gate
2398275SEric Cheng ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2408275SEric Cheng
2410Sstevel@tonic-gate if (port->lp_tx_enabled || (port->lp_state !=
2420Sstevel@tonic-gate AGGR_PORT_STATE_ATTACHED)) {
2430Sstevel@tonic-gate /* already enabled or port not yet attached */
2440Sstevel@tonic-gate return;
2450Sstevel@tonic-gate }
2460Sstevel@tonic-gate
2470Sstevel@tonic-gate /*
2480Sstevel@tonic-gate * Add to group's array of tx ports.
2490Sstevel@tonic-gate */
2508275SEric Cheng rw_enter(&grp->lg_tx_lock, RW_WRITER);
2510Sstevel@tonic-gate if (grp->lg_tx_ports_size < grp->lg_ntx_ports+1) {
2520Sstevel@tonic-gate /* current array too small */
2530Sstevel@tonic-gate aggr_port_t **new_ports;
2540Sstevel@tonic-gate uint_t new_size;
2550Sstevel@tonic-gate
2560Sstevel@tonic-gate new_size = grp->lg_ntx_ports+1;
2570Sstevel@tonic-gate new_ports = kmem_zalloc(new_size * sizeof (aggr_port_t *),
2580Sstevel@tonic-gate KM_SLEEP);
2590Sstevel@tonic-gate
2600Sstevel@tonic-gate if (grp->lg_tx_ports_size > 0) {
2610Sstevel@tonic-gate ASSERT(grp->lg_tx_ports != NULL);
2620Sstevel@tonic-gate bcopy(grp->lg_tx_ports, new_ports,
2630Sstevel@tonic-gate grp->lg_ntx_ports * sizeof (aggr_port_t *));
2640Sstevel@tonic-gate kmem_free(grp->lg_tx_ports,
2650Sstevel@tonic-gate grp->lg_tx_ports_size * sizeof (aggr_port_t *));
2660Sstevel@tonic-gate }
2670Sstevel@tonic-gate
2680Sstevel@tonic-gate grp->lg_tx_ports = new_ports;
2690Sstevel@tonic-gate grp->lg_tx_ports_size = new_size;
2700Sstevel@tonic-gate }
2710Sstevel@tonic-gate
2720Sstevel@tonic-gate grp->lg_tx_ports[grp->lg_ntx_ports++] = port;
2730Sstevel@tonic-gate port->lp_tx_idx = grp->lg_ntx_ports-1;
2748275SEric Cheng rw_exit(&grp->lg_tx_lock);
2750Sstevel@tonic-gate
2760Sstevel@tonic-gate port->lp_tx_enabled = B_TRUE;
2770Sstevel@tonic-gate }
2780Sstevel@tonic-gate
2790Sstevel@tonic-gate /*
2800Sstevel@tonic-gate * Disable sending from the specified port.
2810Sstevel@tonic-gate */
2820Sstevel@tonic-gate void
aggr_send_port_disable(aggr_port_t * port)2830Sstevel@tonic-gate aggr_send_port_disable(aggr_port_t *port)
2840Sstevel@tonic-gate {
2850Sstevel@tonic-gate uint_t idx, ntx;
2860Sstevel@tonic-gate aggr_grp_t *grp = port->lp_grp;
2870Sstevel@tonic-gate
2888275SEric Cheng ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2898275SEric Cheng ASSERT(MAC_PERIM_HELD(port->lp_mh));
2900Sstevel@tonic-gate
2910Sstevel@tonic-gate if (!port->lp_tx_enabled) {
2920Sstevel@tonic-gate /* not yet enabled */
2930Sstevel@tonic-gate return;
2940Sstevel@tonic-gate }
2950Sstevel@tonic-gate
2968275SEric Cheng rw_enter(&grp->lg_tx_lock, RW_WRITER);
2970Sstevel@tonic-gate idx = port->lp_tx_idx;
2980Sstevel@tonic-gate ntx = grp->lg_ntx_ports;
2990Sstevel@tonic-gate ASSERT(idx < ntx);
3000Sstevel@tonic-gate
3010Sstevel@tonic-gate /* remove from array of attached ports */
3020Sstevel@tonic-gate if (idx == (ntx - 1)) {
3030Sstevel@tonic-gate grp->lg_tx_ports[idx] = NULL;
3040Sstevel@tonic-gate } else {
3050Sstevel@tonic-gate /* not the last entry, replace with last one */
3060Sstevel@tonic-gate aggr_port_t *victim;
3070Sstevel@tonic-gate
3080Sstevel@tonic-gate victim = grp->lg_tx_ports[ntx - 1];
3090Sstevel@tonic-gate grp->lg_tx_ports[ntx - 1] = NULL;
3100Sstevel@tonic-gate victim->lp_tx_idx = idx;
3110Sstevel@tonic-gate grp->lg_tx_ports[idx] = victim;
3120Sstevel@tonic-gate }
3130Sstevel@tonic-gate
3140Sstevel@tonic-gate port->lp_tx_idx = 0;
3150Sstevel@tonic-gate grp->lg_ntx_ports--;
3168275SEric Cheng rw_exit(&grp->lg_tx_lock);
3170Sstevel@tonic-gate
3180Sstevel@tonic-gate port->lp_tx_enabled = B_FALSE;
3190Sstevel@tonic-gate }
320