1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21 /*
22 * Copyright (c) 2005, 2010, Oracle and/or its affiliates. All rights reserved.
23 */
24
25 /*
26 * IEEE 802.3ad Link Aggregation -- Link Aggregation Groups.
27 *
28 * An instance of the structure aggr_grp_t is allocated for each
29 * link aggregation group. When created, aggr_grp_t objects are
30 * entered into the aggr_grp_hash hash table maintained by the modhash
31 * module. The hash key is the linkid associated with the link
32 * aggregation group.
33 *
 * A set of MAC ports is associated with each aggregation group.
35 *
36 * Aggr pseudo TX rings
37 * --------------------
 * The underlying ports (NICs) in an aggregation can have TX rings. To
 * enhance aggr's performance, these TX rings are made available to the
 * aggr layer as pseudo TX rings. The concept of pseudo rings is not new:
 * it is already implemented on the RX side as pseudo RX rings. The same
 * concept is extended to the TX side, where each TX ring of an underlying
 * port is reflected in aggr as a pseudo TX ring. Thus each pseudo TX ring
 * maps to a specific hardware TX ring. Even in the case of a NIC that
 * does not have a TX ring, a pseudo TX ring is given to the aggregation
 * layer.
47 *
48 * With this change, the outgoing stack depth looks much better:
49 *
50 * mac_tx() -> mac_tx_aggr_mode() -> mac_tx_soft_ring_process() ->
 * mac_tx_send() -> aggr_ring_tx() -> <driver>_ring_tx()
52 *
53 * Two new modes are introduced to mac_tx() to handle aggr pseudo TX rings:
54 * SRS_TX_AGGR and SRS_TX_BW_AGGR.
55 *
56 * In SRS_TX_AGGR mode, mac_tx_aggr_mode() routine is called. This routine
57 * invokes an aggr function, aggr_find_tx_ring(), to find a (pseudo) TX
58 * ring belonging to a port on which the packet has to be sent.
59 * aggr_find_tx_ring() first finds the outgoing port based on L2/L3/L4
60 * policy and then uses the fanout_hint passed to it to pick a TX ring from
61 * the selected port.
62 *
63 * In SRS_TX_BW_AGGR mode, mac_tx_bw_mode() function is called where
64 * bandwidth limit is applied first on the outgoing packet and the packets
65 * allowed to go out would call mac_tx_aggr_mode() to send the packet on a
66 * particular TX ring.
67 */
68
69 #include <sys/types.h>
70 #include <sys/sysmacros.h>
71 #include <sys/conf.h>
72 #include <sys/cmn_err.h>
73 #include <sys/disp.h>
74 #include <sys/list.h>
75 #include <sys/ksynch.h>
76 #include <sys/kmem.h>
77 #include <sys/stream.h>
78 #include <sys/modctl.h>
79 #include <sys/ddi.h>
80 #include <sys/sunddi.h>
81 #include <sys/atomic.h>
82 #include <sys/stat.h>
83 #include <sys/modhash.h>
84 #include <sys/id_space.h>
85 #include <sys/strsun.h>
86 #include <sys/cred.h>
87 #include <sys/dlpi.h>
88 #include <sys/zone.h>
89 #include <sys/mac_provider.h>
90 #include <sys/dls.h>
91 #include <sys/vlan.h>
92 #include <sys/aggr.h>
93 #include <sys/aggr_impl.h>
94
95 static int aggr_m_start(void *);
96 static void aggr_m_stop(void *);
97 static int aggr_m_promisc(void *, boolean_t);
98 static int aggr_m_multicst(void *, boolean_t, const uint8_t *);
99 static int aggr_m_unicst(void *, const uint8_t *);
100 static int aggr_m_stat(void *, uint_t, uint64_t *);
101 static void aggr_m_ioctl(void *, queue_t *, mblk_t *);
102 static boolean_t aggr_m_capab_get(void *, mac_capab_t, void *);
103 static int aggr_m_setprop(void *, const char *, mac_prop_id_t, uint_t,
104 const void *);
105 static void aggr_m_propinfo(void *, const char *, mac_prop_id_t,
106 mac_prop_info_handle_t);
107
108 static aggr_port_t *aggr_grp_port_lookup(aggr_grp_t *, datalink_id_t);
109 static int aggr_grp_rem_port(aggr_grp_t *, aggr_port_t *, boolean_t *,
110 boolean_t *);
111
112 static void aggr_grp_capab_set(aggr_grp_t *);
113 static boolean_t aggr_grp_capab_check(aggr_grp_t *, aggr_port_t *);
114 static uint_t aggr_grp_max_sdu(aggr_grp_t *);
115 static uint32_t aggr_grp_max_margin(aggr_grp_t *);
116 static boolean_t aggr_grp_sdu_check(aggr_grp_t *, aggr_port_t *);
117 static boolean_t aggr_grp_margin_check(aggr_grp_t *, aggr_port_t *);
118
119 static int aggr_add_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
120 static void aggr_rem_pseudo_rx_group(aggr_port_t *, aggr_pseudo_rx_group_t *);
121 static int aggr_pseudo_disable_intr(mac_intr_handle_t);
122 static int aggr_pseudo_enable_intr(mac_intr_handle_t);
123 static int aggr_pseudo_start_ring(mac_ring_driver_t, uint64_t);
124 static void aggr_pseudo_stop_ring(mac_ring_driver_t);
125 static int aggr_addmac(void *, const uint8_t *);
126 static int aggr_remmac(void *, const uint8_t *);
127 static mblk_t *aggr_rx_poll(void *, int);
128 static void aggr_fill_ring(void *, mac_ring_type_t, const int,
129 const int, mac_ring_info_t *, mac_ring_handle_t);
130 static void aggr_fill_group(void *, mac_ring_type_t, const int,
131 mac_group_info_t *, mac_group_handle_t);
132
133 static kmem_cache_t *aggr_grp_cache;
134 static mod_hash_t *aggr_grp_hash;
135 static krwlock_t aggr_grp_lock;
136 static uint_t aggr_grp_cnt;
137 static id_space_t *key_ids;
138
/* Size passed to mod_hash_create_idhash() for aggr_grp_hash. */
#define	GRP_HASHSZ		64
/*
 * Convert a linkid into a mod_hash key.  The argument is parenthesized so
 * that the casts apply to the whole expression handed to the macro rather
 * than to its first operand only.
 */
#define	GRP_HASH_KEY(linkid)	((mod_hash_key_t)(uintptr_t)(linkid))
#define	AGGR_PORT_NAME_DELIMIT	'-'
142
143 static uchar_t aggr_zero_mac[] = {0, 0, 0, 0, 0, 0};
144
145 #define AGGR_M_CALLBACK_FLAGS \
146 (MC_IOCTL | MC_GETCAPAB | MC_SETPROP | MC_PROPINFO)
147
148 static mac_callbacks_t aggr_m_callbacks = {
149 AGGR_M_CALLBACK_FLAGS,
150 aggr_m_stat,
151 aggr_m_start,
152 aggr_m_stop,
153 aggr_m_promisc,
154 aggr_m_multicst,
155 NULL,
156 NULL,
157 NULL,
158 aggr_m_ioctl,
159 aggr_m_capab_get,
160 NULL,
161 NULL,
162 aggr_m_setprop,
163 NULL,
164 aggr_m_propinfo
165 };
166
167 /*ARGSUSED*/
168 static int
aggr_grp_constructor(void * buf,void * arg,int kmflag)169 aggr_grp_constructor(void *buf, void *arg, int kmflag)
170 {
171 aggr_grp_t *grp = buf;
172
173 bzero(grp, sizeof (*grp));
174 mutex_init(&grp->lg_lacp_lock, NULL, MUTEX_DEFAULT, NULL);
175 cv_init(&grp->lg_lacp_cv, NULL, CV_DEFAULT, NULL);
176 rw_init(&grp->lg_tx_lock, NULL, RW_DRIVER, NULL);
177 mutex_init(&grp->lg_port_lock, NULL, MUTEX_DEFAULT, NULL);
178 cv_init(&grp->lg_port_cv, NULL, CV_DEFAULT, NULL);
179 mutex_init(&grp->lg_tx_flowctl_lock, NULL, MUTEX_DEFAULT, NULL);
180 cv_init(&grp->lg_tx_flowctl_cv, NULL, CV_DEFAULT, NULL);
181 grp->lg_link_state = LINK_STATE_UNKNOWN;
182 return (0);
183 }
184
185 /*ARGSUSED*/
186 static void
aggr_grp_destructor(void * buf,void * arg)187 aggr_grp_destructor(void *buf, void *arg)
188 {
189 aggr_grp_t *grp = buf;
190
191 if (grp->lg_tx_ports != NULL) {
192 kmem_free(grp->lg_tx_ports,
193 grp->lg_tx_ports_size * sizeof (aggr_port_t *));
194 }
195
196 mutex_destroy(&grp->lg_lacp_lock);
197 cv_destroy(&grp->lg_lacp_cv);
198 mutex_destroy(&grp->lg_port_lock);
199 cv_destroy(&grp->lg_port_cv);
200 rw_destroy(&grp->lg_tx_lock);
201 mutex_destroy(&grp->lg_tx_flowctl_lock);
202 cv_destroy(&grp->lg_tx_flowctl_cv);
203 }
204
205 void
aggr_grp_init(void)206 aggr_grp_init(void)
207 {
208 aggr_grp_cache = kmem_cache_create("aggr_grp_cache",
209 sizeof (aggr_grp_t), 0, aggr_grp_constructor,
210 aggr_grp_destructor, NULL, NULL, NULL, 0);
211
212 aggr_grp_hash = mod_hash_create_idhash("aggr_grp_hash",
213 GRP_HASHSZ, mod_hash_null_valdtor);
214 rw_init(&aggr_grp_lock, NULL, RW_DEFAULT, NULL);
215 aggr_grp_cnt = 0;
216
217 /*
218 * Allocate an id space to manage key values (when key is not
219 * specified). The range of the id space will be from
220 * (AGGR_MAX_KEY + 1) to UINT16_MAX, because the LACP protocol
221 * uses a 16-bit key.
222 */
223 key_ids = id_space_create("aggr_key_ids", AGGR_MAX_KEY + 1, UINT16_MAX);
224 ASSERT(key_ids != NULL);
225 }
226
227 void
aggr_grp_fini(void)228 aggr_grp_fini(void)
229 {
230 id_space_destroy(key_ids);
231 rw_destroy(&aggr_grp_lock);
232 mod_hash_destroy_idhash(aggr_grp_hash);
233 kmem_cache_destroy(aggr_grp_cache);
234 }
235
236 uint_t
aggr_grp_count(void)237 aggr_grp_count(void)
238 {
239 uint_t count;
240
241 rw_enter(&aggr_grp_lock, RW_READER);
242 count = aggr_grp_cnt;
243 rw_exit(&aggr_grp_lock);
244 return (count);
245 }
246
247 /*
248 * Since both aggr_port_notify_cb() and aggr_port_timer_thread() functions
249 * requires the mac perimeter, this function holds a reference of the aggr
250 * and aggr won't call mac_unregister() until this reference drops to 0.
251 */
252 void
aggr_grp_port_hold(aggr_port_t * port)253 aggr_grp_port_hold(aggr_port_t *port)
254 {
255 aggr_grp_t *grp = port->lp_grp;
256
257 AGGR_PORT_REFHOLD(port);
258 mutex_enter(&grp->lg_port_lock);
259 grp->lg_port_ref++;
260 mutex_exit(&grp->lg_port_lock);
261 }
262
263 /*
264 * Release the reference of the grp and inform aggr_grp_delete() calling
265 * mac_unregister() is now safe.
266 */
267 void
aggr_grp_port_rele(aggr_port_t * port)268 aggr_grp_port_rele(aggr_port_t *port)
269 {
270 aggr_grp_t *grp = port->lp_grp;
271
272 mutex_enter(&grp->lg_port_lock);
273 if (--grp->lg_port_ref == 0)
274 cv_signal(&grp->lg_port_cv);
275 mutex_exit(&grp->lg_port_lock);
276 AGGR_PORT_REFRELE(port);
277 }
278
279 /*
280 * Wait for the port's lacp timer thread and the port's notification callback
281 * to exit.
282 */
283 void
aggr_grp_port_wait(aggr_grp_t * grp)284 aggr_grp_port_wait(aggr_grp_t *grp)
285 {
286 mutex_enter(&grp->lg_port_lock);
287 if (grp->lg_port_ref != 0)
288 cv_wait(&grp->lg_port_cv, &grp->lg_port_lock);
289 mutex_exit(&grp->lg_port_lock);
290 }
291
292 /*
293 * Attach a port to a link aggregation group.
294 *
295 * A port is attached to a link aggregation group once its speed
296 * and link state have been verified.
297 *
298 * Returns B_TRUE if the group link state or speed has changed. If
299 * it's the case, the caller must notify the MAC layer via a call
300 * to mac_link().
301 */
302 boolean_t
aggr_grp_attach_port(aggr_grp_t * grp,aggr_port_t * port)303 aggr_grp_attach_port(aggr_grp_t *grp, aggr_port_t *port)
304 {
305 boolean_t link_state_changed = B_FALSE;
306
307 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
308 ASSERT(MAC_PERIM_HELD(port->lp_mh));
309
310 if (port->lp_state == AGGR_PORT_STATE_ATTACHED)
311 return (B_FALSE);
312
313 /*
314 * Validate the MAC port link speed and update the group
315 * link speed if needed.
316 */
317 if (port->lp_ifspeed == 0 ||
318 port->lp_link_state != LINK_STATE_UP ||
319 port->lp_link_duplex != LINK_DUPLEX_FULL) {
320 /*
321 * Can't attach a MAC port with unknown link speed,
322 * down link, or not in full duplex mode.
323 */
324 return (B_FALSE);
325 }
326
327 if (grp->lg_ifspeed == 0) {
328 /*
329 * The group inherits the speed of the first link being
330 * attached.
331 */
332 grp->lg_ifspeed = port->lp_ifspeed;
333 link_state_changed = B_TRUE;
334 } else if (grp->lg_ifspeed != port->lp_ifspeed) {
335 /*
336 * The link speed of the MAC port must be the same as
337 * the group link speed, as per 802.3ad. Since it is
338 * not, the attach is cancelled.
339 */
340 return (B_FALSE);
341 }
342
343 grp->lg_nattached_ports++;
344
345 /*
346 * Update the group link state.
347 */
348 if (grp->lg_link_state != LINK_STATE_UP) {
349 grp->lg_link_state = LINK_STATE_UP;
350 grp->lg_link_duplex = LINK_DUPLEX_FULL;
351 link_state_changed = B_TRUE;
352 }
353
354 /*
355 * Update port's state.
356 */
357 port->lp_state = AGGR_PORT_STATE_ATTACHED;
358
359 aggr_grp_multicst_port(port, B_TRUE);
360
361 /*
362 * Set port's receive callback
363 */
364 mac_rx_set(port->lp_mch, aggr_recv_cb, port);
365
366 /*
367 * If LACP is OFF, the port can be used to send data as soon
368 * as its link is up and verified to be compatible with the
369 * aggregation.
370 *
371 * If LACP is active or passive, notify the LACP subsystem, which
372 * will enable sending on the port following the LACP protocol.
373 */
374 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
375 aggr_send_port_enable(port);
376 else
377 aggr_lacp_port_attached(port);
378
379 return (link_state_changed);
380 }
381
382 boolean_t
aggr_grp_detach_port(aggr_grp_t * grp,aggr_port_t * port)383 aggr_grp_detach_port(aggr_grp_t *grp, aggr_port_t *port)
384 {
385 boolean_t link_state_changed = B_FALSE;
386
387 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
388 ASSERT(MAC_PERIM_HELD(port->lp_mh));
389
390 /* update state */
391 if (port->lp_state != AGGR_PORT_STATE_ATTACHED)
392 return (B_FALSE);
393
394 mac_rx_clear(port->lp_mch);
395
396 aggr_grp_multicst_port(port, B_FALSE);
397
398 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
399 aggr_send_port_disable(port);
400 else
401 aggr_lacp_port_detached(port);
402
403 port->lp_state = AGGR_PORT_STATE_STANDBY;
404
405 grp->lg_nattached_ports--;
406 if (grp->lg_nattached_ports == 0) {
407 /* the last attached MAC port of the group is being detached */
408 grp->lg_ifspeed = 0;
409 grp->lg_link_state = LINK_STATE_DOWN;
410 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
411 link_state_changed = B_TRUE;
412 }
413
414 return (link_state_changed);
415 }
416
417 /*
418 * Update the MAC addresses of the constituent ports of the specified
419 * group. This function is invoked:
420 * - after creating a new aggregation group.
421 * - after adding new ports to an aggregation group.
422 * - after removing a port from a group when the MAC address of
423 * that port was used for the MAC address of the group.
424 * - after the MAC address of a port changed when the MAC address
425 * of that port was used for the MAC address of the group.
426 *
427 * Return true if the link state of the aggregation changed, for example
428 * as a result of a failure changing the MAC address of one of the
429 * constituent ports.
430 */
431 boolean_t
aggr_grp_update_ports_mac(aggr_grp_t * grp)432 aggr_grp_update_ports_mac(aggr_grp_t *grp)
433 {
434 aggr_port_t *cport;
435 boolean_t link_state_changed = B_FALSE;
436 mac_perim_handle_t mph;
437
438 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
439
440 for (cport = grp->lg_ports; cport != NULL;
441 cport = cport->lp_next) {
442 mac_perim_enter_by_mh(cport->lp_mh, &mph);
443 if (aggr_port_unicst(cport) != 0) {
444 if (aggr_grp_detach_port(grp, cport))
445 link_state_changed = B_TRUE;
446 } else {
447 /*
448 * If a port was detached because of a previous
449 * failure changing the MAC address, the port is
450 * reattached when it successfully changes the MAC
451 * address now, and this might cause the link state
452 * of the aggregation to change.
453 */
454 if (aggr_grp_attach_port(grp, cport))
455 link_state_changed = B_TRUE;
456 }
457 mac_perim_exit(mph);
458 }
459 return (link_state_changed);
460 }
461
462 /*
463 * Invoked when the MAC address of a port has changed. If the port's
464 * MAC address was used for the group MAC address, set mac_addr_changedp
465 * to B_TRUE to indicate to the caller that it should send a MAC_NOTE_UNICST
466 * notification. If the link state changes due to detach/attach of
467 * the constituent port, set link_state_changedp to B_TRUE to indicate
468 * to the caller that it should send a MAC_NOTE_LINK notification. In both
469 * cases, it is the responsibility of the caller to invoke notification
470 * functions after releasing the the port lock.
471 */
472 void
aggr_grp_port_mac_changed(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)473 aggr_grp_port_mac_changed(aggr_grp_t *grp, aggr_port_t *port,
474 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
475 {
476 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
477 ASSERT(MAC_PERIM_HELD(port->lp_mh));
478 ASSERT(mac_addr_changedp != NULL);
479 ASSERT(link_state_changedp != NULL);
480
481 *mac_addr_changedp = B_FALSE;
482 *link_state_changedp = B_FALSE;
483
484 if (grp->lg_addr_fixed) {
485 /*
486 * The group is using a fixed MAC address or an automatic
487 * MAC address has not been set.
488 */
489 return;
490 }
491
492 if (grp->lg_mac_addr_port == port) {
493 /*
494 * The MAC address of the port was assigned to the group
495 * MAC address. Update the group MAC address.
496 */
497 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
498 *mac_addr_changedp = B_TRUE;
499 } else {
500 /*
501 * Update the actual port MAC address to the MAC address
502 * of the group.
503 */
504 if (aggr_port_unicst(port) != 0) {
505 *link_state_changedp = aggr_grp_detach_port(grp, port);
506 } else {
507 /*
508 * If a port was detached because of a previous
509 * failure changing the MAC address, the port is
510 * reattached when it successfully changes the MAC
511 * address now, and this might cause the link state
512 * of the aggregation to change.
513 */
514 *link_state_changedp = aggr_grp_attach_port(grp, port);
515 }
516 }
517 }
518
519 /*
520 * Add a port to a link aggregation group.
521 */
522 static int
aggr_grp_add_port(aggr_grp_t * grp,datalink_id_t port_linkid,boolean_t force,aggr_port_t ** pp)523 aggr_grp_add_port(aggr_grp_t *grp, datalink_id_t port_linkid, boolean_t force,
524 aggr_port_t **pp)
525 {
526 aggr_port_t *port, **cport;
527 mac_perim_handle_t mph;
528 zoneid_t port_zoneid = ALL_ZONES;
529 int err;
530
531 /* The port must be int the same zone as the aggregation. */
532 if (zone_check_datalink(&port_zoneid, port_linkid) != 0)
533 port_zoneid = GLOBAL_ZONEID;
534 if (grp->lg_zoneid != port_zoneid)
535 return (EBUSY);
536
537 /*
538 * lg_mh could be NULL when the function is called during the creation
539 * of the aggregation.
540 */
541 ASSERT(grp->lg_mh == NULL || MAC_PERIM_HELD(grp->lg_mh));
542
543 /* create new port */
544 err = aggr_port_create(grp, port_linkid, force, &port);
545 if (err != 0)
546 return (err);
547
548 mac_perim_enter_by_mh(port->lp_mh, &mph);
549
550 /* add port to list of group constituent ports */
551 cport = &grp->lg_ports;
552 while (*cport != NULL)
553 cport = &((*cport)->lp_next);
554 *cport = port;
555
556 /*
557 * Back reference to the group it is member of. A port always
558 * holds a reference to its group to ensure that the back
559 * reference is always valid.
560 */
561 port->lp_grp = grp;
562 AGGR_GRP_REFHOLD(grp);
563 grp->lg_nports++;
564
565 aggr_lacp_init_port(port);
566 mac_perim_exit(mph);
567
568 if (pp != NULL)
569 *pp = port;
570
571 return (0);
572 }
573
574 /*
575 * Add a pseudo RX ring for the given HW ring handle.
576 */
577 static int
aggr_add_pseudo_rx_ring(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)578 aggr_add_pseudo_rx_ring(aggr_port_t *port,
579 aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
580 {
581 aggr_pseudo_rx_ring_t *ring;
582 int err;
583 int j;
584
585 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
586 ring = rx_grp->arg_rings + j;
587 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE))
588 break;
589 }
590
591 /*
592 * No slot for this new RX ring.
593 */
594 if (j == MAX_RINGS_PER_GROUP)
595 return (EIO);
596
597 ring->arr_flags |= MAC_PSEUDO_RING_INUSE;
598 ring->arr_hw_rh = hw_rh;
599 ring->arr_port = port;
600 rx_grp->arg_ring_cnt++;
601
602 /*
603 * The group is already registered, dynamically add a new ring to the
604 * mac group.
605 */
606 if ((err = mac_group_add_ring(rx_grp->arg_gh, j)) != 0) {
607 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
608 ring->arr_hw_rh = NULL;
609 ring->arr_port = NULL;
610 rx_grp->arg_ring_cnt--;
611 } else {
612 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
613 mac_find_ring(rx_grp->arg_gh, j));
614 }
615 return (err);
616 }
617
618 /*
619 * Remove the pseudo RX ring of the given HW ring handle.
620 */
621 static void
aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t * rx_grp,mac_ring_handle_t hw_rh)622 aggr_rem_pseudo_rx_ring(aggr_pseudo_rx_group_t *rx_grp, mac_ring_handle_t hw_rh)
623 {
624 aggr_pseudo_rx_ring_t *ring;
625 int j;
626
627 for (j = 0; j < MAX_RINGS_PER_GROUP; j++) {
628 ring = rx_grp->arg_rings + j;
629 if (!(ring->arr_flags & MAC_PSEUDO_RING_INUSE) ||
630 ring->arr_hw_rh != hw_rh) {
631 continue;
632 }
633
634 mac_group_rem_ring(rx_grp->arg_gh, ring->arr_rh);
635
636 ring->arr_flags &= ~MAC_PSEUDO_RING_INUSE;
637 ring->arr_hw_rh = NULL;
638 ring->arr_port = NULL;
639 rx_grp->arg_ring_cnt--;
640 mac_hwring_teardown(hw_rh);
641 break;
642 }
643 }
644
645 /*
646 * This function is called to create pseudo rings over the hardware rings of
647 * the underlying device. Note that there is a 1:1 mapping between the pseudo
648 * RX rings of the aggr and the hardware rings of the underlying port.
649 */
650 static int
aggr_add_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)651 aggr_add_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
652 {
653 aggr_grp_t *grp = port->lp_grp;
654 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
655 aggr_unicst_addr_t *addr, *a;
656 mac_perim_handle_t pmph;
657 int hw_rh_cnt, i = 0, j;
658 int err = 0;
659
660 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
661 mac_perim_enter_by_mh(port->lp_mh, &pmph);
662
663 /*
664 * This function must be called after the aggr registers its mac
665 * and its RX group has been initialized.
666 */
667 ASSERT(rx_grp->arg_gh != NULL);
668
669 /*
670 * Get the list the the underlying HW rings.
671 */
672 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
673 &port->lp_hwgh, hw_rh, MAC_RING_TYPE_RX);
674
675 if (port->lp_hwgh != NULL) {
676 /*
677 * Quiesce the HW ring and the mac srs on the ring. Note
678 * that the HW ring will be restarted when the pseudo ring
679 * is started. At that time all the packets will be
680 * directly passed up to the pseudo RX ring and handled
681 * by mac srs created over the pseudo RX ring.
682 */
683 mac_rx_client_quiesce(port->lp_mch);
684 mac_srs_perm_quiesce(port->lp_mch, B_TRUE);
685 }
686
687 /*
688 * Add all the unicast addresses to the newly added port.
689 */
690 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next) {
691 if ((err = aggr_port_addmac(port, addr->aua_addr)) != 0)
692 break;
693 }
694
695 for (i = 0; err == 0 && i < hw_rh_cnt; i++)
696 err = aggr_add_pseudo_rx_ring(port, rx_grp, hw_rh[i]);
697
698 if (err != 0) {
699 for (j = 0; j < i; j++)
700 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[j]);
701
702 for (a = rx_grp->arg_macaddr; a != addr; a = a->aua_next)
703 aggr_port_remmac(port, a->aua_addr);
704
705 if (port->lp_hwgh != NULL) {
706 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
707 mac_rx_client_restart(port->lp_mch);
708 port->lp_hwgh = NULL;
709 }
710 } else {
711 port->lp_rx_grp_added = B_TRUE;
712 }
713 done:
714 mac_perim_exit(pmph);
715 return (err);
716 }
717
718 /*
719 * This function is called by aggr to remove pseudo RX rings over the
720 * HW rings of the underlying port.
721 */
722 static void
aggr_rem_pseudo_rx_group(aggr_port_t * port,aggr_pseudo_rx_group_t * rx_grp)723 aggr_rem_pseudo_rx_group(aggr_port_t *port, aggr_pseudo_rx_group_t *rx_grp)
724 {
725 aggr_grp_t *grp = port->lp_grp;
726 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP];
727 aggr_unicst_addr_t *addr;
728 mac_group_handle_t hwgh;
729 mac_perim_handle_t pmph;
730 int hw_rh_cnt, i;
731
732 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
733 mac_perim_enter_by_mh(port->lp_mh, &pmph);
734
735 if (!port->lp_rx_grp_added)
736 goto done;
737
738 ASSERT(rx_grp->arg_gh != NULL);
739 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
740 &hwgh, hw_rh, MAC_RING_TYPE_RX);
741
742 /*
743 * If hw_rh_cnt is 0, it means that the underlying port does not
744 * support RX rings. Directly return in this case.
745 */
746 for (i = 0; i < hw_rh_cnt; i++)
747 aggr_rem_pseudo_rx_ring(rx_grp, hw_rh[i]);
748
749 for (addr = rx_grp->arg_macaddr; addr != NULL; addr = addr->aua_next)
750 aggr_port_remmac(port, addr->aua_addr);
751
752 if (port->lp_hwgh != NULL) {
753 port->lp_hwgh = NULL;
754
755 /*
756 * First clear the permanent-quiesced flag of the RX srs then
757 * restart the HW ring and the mac srs on the ring. Note that
758 * the HW ring and associated SRS will soon been removed when
759 * the port is removed from the aggr.
760 */
761 mac_srs_perm_quiesce(port->lp_mch, B_FALSE);
762 mac_rx_client_restart(port->lp_mch);
763 }
764
765 port->lp_rx_grp_added = B_FALSE;
766 done:
767 mac_perim_exit(pmph);
768 }
769
770 /*
771 * Add a pseudo TX ring for the given HW ring handle.
772 */
773 static int
aggr_add_pseudo_tx_ring(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t hw_rh,mac_ring_handle_t * pseudo_rh)774 aggr_add_pseudo_tx_ring(aggr_port_t *port,
775 aggr_pseudo_tx_group_t *tx_grp, mac_ring_handle_t hw_rh,
776 mac_ring_handle_t *pseudo_rh)
777 {
778 aggr_pseudo_tx_ring_t *ring;
779 int err;
780 int i;
781
782 ASSERT(MAC_PERIM_HELD(port->lp_mh));
783 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
784 ring = tx_grp->atg_rings + i;
785 if (!(ring->atr_flags & MAC_PSEUDO_RING_INUSE))
786 break;
787 }
788 /*
789 * No slot for this new TX ring.
790 */
791 if (i == MAX_RINGS_PER_GROUP)
792 return (EIO);
793 /*
794 * The following 4 statements needs to be done before
795 * calling mac_group_add_ring(). Otherwise it will
796 * result in an assertion failure in mac_init_ring().
797 */
798 ring->atr_flags |= MAC_PSEUDO_RING_INUSE;
799 ring->atr_hw_rh = hw_rh;
800 ring->atr_port = port;
801 tx_grp->atg_ring_cnt++;
802
803 /*
804 * The TX side has no concept of ring groups unlike RX groups.
805 * There is just a single group which stores all the TX rings.
806 * This group will be used to store aggr's pseudo TX rings.
807 */
808 if ((err = mac_group_add_ring(tx_grp->atg_gh, i)) != 0) {
809 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
810 ring->atr_hw_rh = NULL;
811 ring->atr_port = NULL;
812 tx_grp->atg_ring_cnt--;
813 } else {
814 *pseudo_rh = mac_find_ring(tx_grp->atg_gh, i);
815 if (hw_rh != NULL) {
816 mac_hwring_setup(hw_rh, (mac_resource_handle_t)ring,
817 mac_find_ring(tx_grp->atg_gh, i));
818 }
819 }
820 return (err);
821 }
822
823 /*
824 * Remove the pseudo TX ring of the given HW ring handle.
825 */
826 static void
aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t * tx_grp,mac_ring_handle_t pseudo_hw_rh)827 aggr_rem_pseudo_tx_ring(aggr_pseudo_tx_group_t *tx_grp,
828 mac_ring_handle_t pseudo_hw_rh)
829 {
830 aggr_pseudo_tx_ring_t *ring;
831 int i;
832
833 for (i = 0; i < MAX_RINGS_PER_GROUP; i++) {
834 ring = tx_grp->atg_rings + i;
835 if (ring->atr_rh != pseudo_hw_rh)
836 continue;
837
838 ASSERT(ring->atr_flags & MAC_PSEUDO_RING_INUSE);
839 mac_group_rem_ring(tx_grp->atg_gh, pseudo_hw_rh);
840 ring->atr_flags &= ~MAC_PSEUDO_RING_INUSE;
841 mac_hwring_teardown(ring->atr_hw_rh);
842 ring->atr_hw_rh = NULL;
843 ring->atr_port = NULL;
844 tx_grp->atg_ring_cnt--;
845 break;
846 }
847 }
848
849 /*
850 * This function is called to create pseudo rings over hardware rings of
851 * the underlying device. There is a 1:1 mapping between the pseudo TX
852 * rings of the aggr and the hardware rings of the underlying port.
853 */
854 static int
aggr_add_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp)855 aggr_add_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
856 {
857 aggr_grp_t *grp = port->lp_grp;
858 mac_ring_handle_t hw_rh[MAX_RINGS_PER_GROUP], pseudo_rh;
859 mac_perim_handle_t pmph;
860 int hw_rh_cnt, i = 0, j;
861 int err = 0;
862
863 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
864 mac_perim_enter_by_mh(port->lp_mh, &pmph);
865
866 /*
867 * Get the list the the underlying HW rings.
868 */
869 hw_rh_cnt = mac_hwrings_get(port->lp_mch,
870 NULL, hw_rh, MAC_RING_TYPE_TX);
871
872 /*
873 * Even if the underlying NIC does not have TX rings, we
874 * still make a psuedo TX ring for that NIC with NULL as
875 * the ring handle.
876 */
877 if (hw_rh_cnt == 0)
878 port->lp_tx_ring_cnt = 1;
879 else
880 port->lp_tx_ring_cnt = hw_rh_cnt;
881
882 port->lp_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
883 port->lp_tx_ring_cnt), KM_SLEEP);
884 port->lp_pseudo_tx_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
885 port->lp_tx_ring_cnt), KM_SLEEP);
886
887 if (hw_rh_cnt == 0) {
888 if ((err = aggr_add_pseudo_tx_ring(port, tx_grp,
889 NULL, &pseudo_rh)) == 0) {
890 port->lp_tx_rings[0] = NULL;
891 port->lp_pseudo_tx_rings[0] = pseudo_rh;
892 }
893 } else {
894 for (i = 0; err == 0 && i < hw_rh_cnt; i++) {
895 err = aggr_add_pseudo_tx_ring(port,
896 tx_grp, hw_rh[i], &pseudo_rh);
897 if (err != 0)
898 break;
899 port->lp_tx_rings[i] = hw_rh[i];
900 port->lp_pseudo_tx_rings[i] = pseudo_rh;
901 }
902 }
903
904 if (err != 0) {
905 if (hw_rh_cnt != 0) {
906 for (j = 0; j < i; j++) {
907 aggr_rem_pseudo_tx_ring(tx_grp,
908 port->lp_pseudo_tx_rings[j]);
909 }
910 }
911 kmem_free(port->lp_tx_rings,
912 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
913 kmem_free(port->lp_pseudo_tx_rings,
914 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
915 port->lp_tx_ring_cnt = 0;
916 } else {
917 port->lp_tx_grp_added = B_TRUE;
918 port->lp_tx_notify_mh = mac_client_tx_notify(port->lp_mch,
919 aggr_tx_ring_update, port);
920 }
921 mac_perim_exit(pmph);
922 return (err);
923 }
924
925 /*
926 * This function is called by aggr to remove pseudo TX rings over the
927 * HW rings of the underlying port.
928 */
929 static void
aggr_rem_pseudo_tx_group(aggr_port_t * port,aggr_pseudo_tx_group_t * tx_grp)930 aggr_rem_pseudo_tx_group(aggr_port_t *port, aggr_pseudo_tx_group_t *tx_grp)
931 {
932 aggr_grp_t *grp = port->lp_grp;
933 mac_perim_handle_t pmph;
934 int i;
935
936 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
937 mac_perim_enter_by_mh(port->lp_mh, &pmph);
938
939 if (!port->lp_tx_grp_added)
940 goto done;
941
942 ASSERT(tx_grp->atg_gh != NULL);
943
944 for (i = 0; i < port->lp_tx_ring_cnt; i++)
945 aggr_rem_pseudo_tx_ring(tx_grp, port->lp_pseudo_tx_rings[i]);
946
947 kmem_free(port->lp_tx_rings,
948 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
949 kmem_free(port->lp_pseudo_tx_rings,
950 (sizeof (mac_ring_handle_t *) * port->lp_tx_ring_cnt));
951
952 port->lp_tx_ring_cnt = 0;
953 (void) mac_client_tx_notify(port->lp_mch, NULL, port->lp_tx_notify_mh);
954 port->lp_tx_grp_added = B_FALSE;
955 done:
956 mac_perim_exit(pmph);
957 }
958
959 static int
aggr_pseudo_disable_intr(mac_intr_handle_t ih)960 aggr_pseudo_disable_intr(mac_intr_handle_t ih)
961 {
962 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
963 return (mac_hwring_disable_intr(rr_ring->arr_hw_rh));
964 }
965
966 static int
aggr_pseudo_enable_intr(mac_intr_handle_t ih)967 aggr_pseudo_enable_intr(mac_intr_handle_t ih)
968 {
969 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)ih;
970 return (mac_hwring_enable_intr(rr_ring->arr_hw_rh));
971 }
972
973 static int
aggr_pseudo_start_ring(mac_ring_driver_t arg,uint64_t mr_gen)974 aggr_pseudo_start_ring(mac_ring_driver_t arg, uint64_t mr_gen)
975 {
976 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
977 int err;
978
979 err = mac_hwring_start(rr_ring->arr_hw_rh);
980 if (err == 0)
981 rr_ring->arr_gen = mr_gen;
982 return (err);
983 }
984
985 static void
aggr_pseudo_stop_ring(mac_ring_driver_t arg)986 aggr_pseudo_stop_ring(mac_ring_driver_t arg)
987 {
988 aggr_pseudo_rx_ring_t *rr_ring = (aggr_pseudo_rx_ring_t *)arg;
989 mac_hwring_stop(rr_ring->arr_hw_rh);
990 }
991
992 /*
993 * Add one or more ports to an existing link aggregation group.
994 */
995 int
aggr_grp_add_ports(datalink_id_t linkid,uint_t nports,boolean_t force,laioc_port_t * ports)996 aggr_grp_add_ports(datalink_id_t linkid, uint_t nports, boolean_t force,
997 laioc_port_t *ports)
998 {
999 int rc, i, nadded = 0;
1000 aggr_grp_t *grp = NULL;
1001 aggr_port_t *port;
1002 boolean_t link_state_changed = B_FALSE;
1003 mac_perim_handle_t mph, pmph;
1004
1005 /* get group corresponding to linkid */
1006 rw_enter(&aggr_grp_lock, RW_READER);
1007 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1008 (mod_hash_val_t *)&grp) != 0) {
1009 rw_exit(&aggr_grp_lock);
1010 return (ENOENT);
1011 }
1012 AGGR_GRP_REFHOLD(grp);
1013
1014 /*
1015 * Hold the perimeter so that the aggregation won't be destroyed.
1016 */
1017 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1018 rw_exit(&aggr_grp_lock);
1019
1020 /* add the specified ports to group */
1021 for (i = 0; i < nports; i++) {
1022 /* add port to group */
1023 if ((rc = aggr_grp_add_port(grp, ports[i].lp_linkid,
1024 force, &port)) != 0) {
1025 goto bail;
1026 }
1027 ASSERT(port != NULL);
1028 nadded++;
1029
1030 /* check capabilities */
1031 if (!aggr_grp_capab_check(grp, port) ||
1032 !aggr_grp_sdu_check(grp, port) ||
1033 !aggr_grp_margin_check(grp, port)) {
1034 rc = ENOTSUP;
1035 goto bail;
1036 }
1037
1038 /*
1039 * Create the pseudo ring for each HW ring of the underlying
1040 * port.
1041 */
1042 rc = aggr_add_pseudo_tx_group(port, &grp->lg_tx_group);
1043 if (rc != 0)
1044 goto bail;
1045 rc = aggr_add_pseudo_rx_group(port, &grp->lg_rx_group);
1046 if (rc != 0)
1047 goto bail;
1048
1049 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1050
1051 /* set LACP mode */
1052 aggr_port_lacp_set_mode(grp, port);
1053
1054 /* start port if group has already been started */
1055 if (grp->lg_started) {
1056 rc = aggr_port_start(port);
1057 if (rc != 0) {
1058 mac_perim_exit(pmph);
1059 goto bail;
1060 }
1061
1062 /*
1063 * Turn on the promiscuous mode over the port when it
1064 * is requested to be turned on to receive the
1065 * non-primary address over a port, or the promiscous
1066 * mode is enabled over the aggr.
1067 */
1068 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1069 rc = aggr_port_promisc(port, B_TRUE);
1070 if (rc != 0) {
1071 mac_perim_exit(pmph);
1072 goto bail;
1073 }
1074 }
1075 }
1076 mac_perim_exit(pmph);
1077
1078 /*
1079 * Attach each port if necessary.
1080 */
1081 if (aggr_port_notify_link(grp, port))
1082 link_state_changed = B_TRUE;
1083
1084 /*
1085 * Initialize the callback functions for this port.
1086 */
1087 aggr_port_init_callbacks(port);
1088 }
1089
1090 /* update the MAC address of the constituent ports */
1091 if (aggr_grp_update_ports_mac(grp))
1092 link_state_changed = B_TRUE;
1093
1094 if (link_state_changed)
1095 mac_link_update(grp->lg_mh, grp->lg_link_state);
1096
1097 bail:
1098 if (rc != 0) {
1099 /* stop and remove ports that have been added */
1100 for (i = 0; i < nadded; i++) {
1101 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1102 ASSERT(port != NULL);
1103 if (grp->lg_started) {
1104 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1105 (void) aggr_port_promisc(port, B_FALSE);
1106 aggr_port_stop(port);
1107 mac_perim_exit(pmph);
1108 }
1109 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1110 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1111 (void) aggr_grp_rem_port(grp, port, NULL, NULL);
1112 }
1113 }
1114
1115 mac_perim_exit(mph);
1116 AGGR_GRP_REFRELE(grp);
1117 return (rc);
1118 }
1119
1120 static int
aggr_grp_modify_common(aggr_grp_t * grp,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1121 aggr_grp_modify_common(aggr_grp_t *grp, uint8_t update_mask, uint32_t policy,
1122 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1123 aggr_lacp_timer_t lacp_timer)
1124 {
1125 boolean_t mac_addr_changed = B_FALSE;
1126 boolean_t link_state_changed = B_FALSE;
1127 mac_perim_handle_t pmph;
1128
1129 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1130
1131 /* validate fixed address if specified */
1132 if ((update_mask & AGGR_MODIFY_MAC) && mac_fixed &&
1133 ((bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) ||
1134 (mac_addr[0] & 0x01))) {
1135 return (EINVAL);
1136 }
1137
1138 /* update policy if requested */
1139 if (update_mask & AGGR_MODIFY_POLICY)
1140 aggr_send_update_policy(grp, policy);
1141
1142 /* update unicast MAC address if requested */
1143 if (update_mask & AGGR_MODIFY_MAC) {
1144 if (mac_fixed) {
1145 /* user-supplied MAC address */
1146 grp->lg_mac_addr_port = NULL;
1147 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) != 0) {
1148 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1149 mac_addr_changed = B_TRUE;
1150 }
1151 } else if (grp->lg_addr_fixed) {
1152 /* switch from user-supplied to automatic */
1153 aggr_port_t *port = grp->lg_ports;
1154
1155 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1156 bcopy(port->lp_addr, grp->lg_addr, ETHERADDRL);
1157 grp->lg_mac_addr_port = port;
1158 mac_addr_changed = B_TRUE;
1159 mac_perim_exit(pmph);
1160 }
1161 grp->lg_addr_fixed = mac_fixed;
1162 }
1163
1164 if (mac_addr_changed)
1165 link_state_changed = aggr_grp_update_ports_mac(grp);
1166
1167 if (update_mask & AGGR_MODIFY_LACP_MODE)
1168 aggr_lacp_update_mode(grp, lacp_mode);
1169
1170 if (update_mask & AGGR_MODIFY_LACP_TIMER)
1171 aggr_lacp_update_timer(grp, lacp_timer);
1172
1173 if (link_state_changed)
1174 mac_link_update(grp->lg_mh, grp->lg_link_state);
1175
1176 if (mac_addr_changed)
1177 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1178
1179 return (0);
1180 }
1181
1182 /*
1183 * Update properties of an existing link aggregation group.
1184 */
1185 int
aggr_grp_modify(datalink_id_t linkid,uint8_t update_mask,uint32_t policy,boolean_t mac_fixed,const uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer)1186 aggr_grp_modify(datalink_id_t linkid, uint8_t update_mask, uint32_t policy,
1187 boolean_t mac_fixed, const uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode,
1188 aggr_lacp_timer_t lacp_timer)
1189 {
1190 aggr_grp_t *grp = NULL;
1191 mac_perim_handle_t mph;
1192 int err;
1193
1194 /* get group corresponding to linkid */
1195 rw_enter(&aggr_grp_lock, RW_READER);
1196 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1197 (mod_hash_val_t *)&grp) != 0) {
1198 rw_exit(&aggr_grp_lock);
1199 return (ENOENT);
1200 }
1201 AGGR_GRP_REFHOLD(grp);
1202
1203 /*
1204 * Hold the perimeter so that the aggregation won't be destroyed.
1205 */
1206 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1207 rw_exit(&aggr_grp_lock);
1208
1209 err = aggr_grp_modify_common(grp, update_mask, policy, mac_fixed,
1210 mac_addr, lacp_mode, lacp_timer);
1211
1212 mac_perim_exit(mph);
1213 AGGR_GRP_REFRELE(grp);
1214 return (err);
1215 }
1216
1217 /*
1218 * Create a new link aggregation group upon request from administrator.
1219 * Returns 0 on success, an errno on failure.
1220 */
1221 int
aggr_grp_create(datalink_id_t linkid,uint32_t key,uint_t nports,laioc_port_t * ports,uint32_t policy,boolean_t mac_fixed,boolean_t force,uchar_t * mac_addr,aggr_lacp_mode_t lacp_mode,aggr_lacp_timer_t lacp_timer,cred_t * credp)1222 aggr_grp_create(datalink_id_t linkid, uint32_t key, uint_t nports,
1223 laioc_port_t *ports, uint32_t policy, boolean_t mac_fixed, boolean_t force,
1224 uchar_t *mac_addr, aggr_lacp_mode_t lacp_mode, aggr_lacp_timer_t lacp_timer,
1225 cred_t *credp)
1226 {
1227 aggr_grp_t *grp = NULL;
1228 aggr_port_t *port;
1229 mac_register_t *mac;
1230 boolean_t link_state_changed;
1231 mac_perim_handle_t mph;
1232 int err;
1233 int i;
1234 kt_did_t tid = 0;
1235
1236 /* need at least one port */
1237 if (nports == 0)
1238 return (EINVAL);
1239
1240 rw_enter(&aggr_grp_lock, RW_WRITER);
1241
1242 /* does a group with the same linkid already exist? */
1243 err = mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1244 (mod_hash_val_t *)&grp);
1245 if (err == 0) {
1246 rw_exit(&aggr_grp_lock);
1247 return (EEXIST);
1248 }
1249
1250 grp = kmem_cache_alloc(aggr_grp_cache, KM_SLEEP);
1251
1252 grp->lg_refs = 1;
1253 grp->lg_closing = B_FALSE;
1254 grp->lg_force = force;
1255 grp->lg_linkid = linkid;
1256 grp->lg_zoneid = crgetzoneid(credp);
1257 grp->lg_ifspeed = 0;
1258 grp->lg_link_state = LINK_STATE_UNKNOWN;
1259 grp->lg_link_duplex = LINK_DUPLEX_UNKNOWN;
1260 grp->lg_started = B_FALSE;
1261 grp->lg_promisc = B_FALSE;
1262 grp->lg_lacp_done = B_FALSE;
1263 grp->lg_tx_notify_done = B_FALSE;
1264 grp->lg_lacp_head = grp->lg_lacp_tail = NULL;
1265 grp->lg_lacp_rx_thread = thread_create(NULL, 0,
1266 aggr_lacp_rx_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1267 grp->lg_tx_notify_thread = thread_create(NULL, 0,
1268 aggr_tx_notify_thread, grp, 0, &p0, TS_RUN, minclsyspri);
1269 grp->lg_tx_blocked_rings = kmem_zalloc((sizeof (mac_ring_handle_t *) *
1270 MAX_RINGS_PER_GROUP), KM_SLEEP);
1271 grp->lg_tx_blocked_cnt = 0;
1272 bzero(&grp->lg_rx_group, sizeof (aggr_pseudo_rx_group_t));
1273 bzero(&grp->lg_tx_group, sizeof (aggr_pseudo_tx_group_t));
1274 aggr_lacp_init_grp(grp);
1275
1276 /* add MAC ports to group */
1277 grp->lg_ports = NULL;
1278 grp->lg_nports = 0;
1279 grp->lg_nattached_ports = 0;
1280 grp->lg_ntx_ports = 0;
1281
1282 /*
1283 * If key is not specified by the user, allocate the key.
1284 */
1285 if ((key == 0) && ((key = (uint32_t)id_alloc(key_ids)) == 0)) {
1286 err = ENOMEM;
1287 goto bail;
1288 }
1289 grp->lg_key = key;
1290
1291 for (i = 0; i < nports; i++) {
1292 err = aggr_grp_add_port(grp, ports[i].lp_linkid, force, NULL);
1293 if (err != 0)
1294 goto bail;
1295 }
1296
1297 /*
1298 * If no explicit MAC address was specified by the administrator,
1299 * set it to the MAC address of the first port.
1300 */
1301 grp->lg_addr_fixed = mac_fixed;
1302 if (grp->lg_addr_fixed) {
1303 /* validate specified address */
1304 if (bcmp(aggr_zero_mac, mac_addr, ETHERADDRL) == 0) {
1305 err = EINVAL;
1306 goto bail;
1307 }
1308 bcopy(mac_addr, grp->lg_addr, ETHERADDRL);
1309 } else {
1310 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1311 grp->lg_mac_addr_port = grp->lg_ports;
1312 }
1313
1314 /* set the initial group capabilities */
1315 aggr_grp_capab_set(grp);
1316
1317 if ((mac = mac_alloc(MAC_VERSION)) == NULL) {
1318 err = ENOMEM;
1319 goto bail;
1320 }
1321 mac->m_type_ident = MAC_PLUGIN_IDENT_ETHER;
1322 mac->m_driver = grp;
1323 mac->m_dip = aggr_dip;
1324 mac->m_instance = grp->lg_key > AGGR_MAX_KEY ? (uint_t)-1 : grp->lg_key;
1325 mac->m_src_addr = grp->lg_addr;
1326 mac->m_callbacks = &aggr_m_callbacks;
1327 mac->m_min_sdu = 0;
1328 mac->m_max_sdu = grp->lg_max_sdu = aggr_grp_max_sdu(grp);
1329 mac->m_margin = aggr_grp_max_margin(grp);
1330 mac->m_v12n = MAC_VIRT_LEVEL1;
1331 err = mac_register(mac, &grp->lg_mh);
1332 mac_free(mac);
1333 if (err != 0)
1334 goto bail;
1335
1336 err = dls_devnet_create(grp->lg_mh, grp->lg_linkid, crgetzoneid(credp));
1337 if (err != 0) {
1338 (void) mac_unregister(grp->lg_mh);
1339 grp->lg_mh = NULL;
1340 goto bail;
1341 }
1342
1343 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1344
1345 /*
1346 * Update the MAC address of the constituent ports.
1347 * None of the port is attached at this time, the link state of the
1348 * aggregation will not change.
1349 */
1350 link_state_changed = aggr_grp_update_ports_mac(grp);
1351 ASSERT(!link_state_changed);
1352
1353 /* update outbound load balancing policy */
1354 aggr_send_update_policy(grp, policy);
1355
1356 /* set LACP mode */
1357 aggr_lacp_set_mode(grp, lacp_mode, lacp_timer);
1358
1359 /*
1360 * Attach each port if necessary.
1361 */
1362 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1363 /*
1364 * Create the pseudo ring for each HW ring of the underlying
1365 * port. Note that this is done after the aggr registers the
1366 * mac.
1367 */
1368 VERIFY(aggr_add_pseudo_tx_group(port, &grp->lg_tx_group) == 0);
1369 VERIFY(aggr_add_pseudo_rx_group(port, &grp->lg_rx_group) == 0);
1370 if (aggr_port_notify_link(grp, port))
1371 link_state_changed = B_TRUE;
1372
1373 /*
1374 * Initialize the callback functions for this port.
1375 */
1376 aggr_port_init_callbacks(port);
1377 }
1378
1379 if (link_state_changed)
1380 mac_link_update(grp->lg_mh, grp->lg_link_state);
1381
1382 /* add new group to hash table */
1383 err = mod_hash_insert(aggr_grp_hash, GRP_HASH_KEY(linkid),
1384 (mod_hash_val_t)grp);
1385 ASSERT(err == 0);
1386 aggr_grp_cnt++;
1387
1388 mac_perim_exit(mph);
1389 rw_exit(&aggr_grp_lock);
1390 return (0);
1391
1392 bail:
1393
1394 grp->lg_closing = B_TRUE;
1395
1396 port = grp->lg_ports;
1397 while (port != NULL) {
1398 aggr_port_t *cport;
1399
1400 cport = port->lp_next;
1401 aggr_port_delete(port);
1402 port = cport;
1403 }
1404
1405 /*
1406 * Inform the lacp_rx thread to exit.
1407 */
1408 mutex_enter(&grp->lg_lacp_lock);
1409 grp->lg_lacp_done = B_TRUE;
1410 cv_signal(&grp->lg_lacp_cv);
1411 while (grp->lg_lacp_rx_thread != NULL)
1412 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1413 mutex_exit(&grp->lg_lacp_lock);
1414 /*
1415 * Inform the tx_notify thread to exit.
1416 */
1417 mutex_enter(&grp->lg_tx_flowctl_lock);
1418 if (grp->lg_tx_notify_thread != NULL) {
1419 tid = grp->lg_tx_notify_thread->t_did;
1420 grp->lg_tx_notify_done = B_TRUE;
1421 cv_signal(&grp->lg_tx_flowctl_cv);
1422 }
1423 mutex_exit(&grp->lg_tx_flowctl_lock);
1424 if (tid != 0)
1425 thread_join(tid);
1426
1427 kmem_free(grp->lg_tx_blocked_rings,
1428 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1429 rw_exit(&aggr_grp_lock);
1430 AGGR_GRP_REFRELE(grp);
1431 return (err);
1432 }
1433
1434 /*
1435 * Return a pointer to the member of a group with specified linkid.
1436 */
1437 static aggr_port_t *
aggr_grp_port_lookup(aggr_grp_t * grp,datalink_id_t linkid)1438 aggr_grp_port_lookup(aggr_grp_t *grp, datalink_id_t linkid)
1439 {
1440 aggr_port_t *port;
1441
1442 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1443
1444 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1445 if (port->lp_linkid == linkid)
1446 break;
1447 }
1448
1449 return (port);
1450 }
1451
1452 /*
1453 * Stop, detach and remove a port from a link aggregation group.
1454 */
1455 static int
aggr_grp_rem_port(aggr_grp_t * grp,aggr_port_t * port,boolean_t * mac_addr_changedp,boolean_t * link_state_changedp)1456 aggr_grp_rem_port(aggr_grp_t *grp, aggr_port_t *port,
1457 boolean_t *mac_addr_changedp, boolean_t *link_state_changedp)
1458 {
1459 int rc = 0;
1460 aggr_port_t **pport;
1461 boolean_t mac_addr_changed = B_FALSE;
1462 boolean_t link_state_changed = B_FALSE;
1463 mac_perim_handle_t mph;
1464 uint64_t val;
1465 uint_t i;
1466 uint_t stat;
1467
1468 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
1469 ASSERT(grp->lg_nports > 1);
1470 ASSERT(!grp->lg_closing);
1471
1472 /* unlink port */
1473 for (pport = &grp->lg_ports; *pport != port;
1474 pport = &(*pport)->lp_next) {
1475 if (*pport == NULL) {
1476 rc = ENOENT;
1477 goto done;
1478 }
1479 }
1480 *pport = port->lp_next;
1481
1482 mac_perim_enter_by_mh(port->lp_mh, &mph);
1483
1484 /*
1485 * If the MAC address of the port being removed was assigned
1486 * to the group, update the group MAC address
1487 * using the MAC address of a different port.
1488 */
1489 if (!grp->lg_addr_fixed && grp->lg_mac_addr_port == port) {
1490 /*
1491 * Set the MAC address of the group to the
1492 * MAC address of its first port.
1493 */
1494 bcopy(grp->lg_ports->lp_addr, grp->lg_addr, ETHERADDRL);
1495 grp->lg_mac_addr_port = grp->lg_ports;
1496 mac_addr_changed = B_TRUE;
1497 }
1498
1499 link_state_changed = aggr_grp_detach_port(grp, port);
1500
1501 /*
1502 * Add the counter statistics of the ports while it was aggregated
1503 * to the group's residual statistics. This is done by obtaining
1504 * the current counter from the underlying MAC then subtracting the
1505 * value of the counter at the moment it was added to the
1506 * aggregation.
1507 */
1508 for (i = 0; i < MAC_NSTAT; i++) {
1509 stat = i + MAC_STAT_MIN;
1510 if (!MAC_STAT_ISACOUNTER(stat))
1511 continue;
1512 val = aggr_port_stat(port, stat);
1513 val -= port->lp_stat[i];
1514 grp->lg_stat[i] += val;
1515 }
1516 for (i = 0; i < ETHER_NSTAT; i++) {
1517 stat = i + MACTYPE_STAT_MIN;
1518 if (!ETHER_STAT_ISACOUNTER(stat))
1519 continue;
1520 val = aggr_port_stat(port, stat);
1521 val -= port->lp_ether_stat[i];
1522 grp->lg_ether_stat[i] += val;
1523 }
1524
1525 grp->lg_nports--;
1526 mac_perim_exit(mph);
1527
1528 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1529 aggr_port_delete(port);
1530
1531 /*
1532 * If the group MAC address has changed, update the MAC address of
1533 * the remaining constituent ports according to the new MAC
1534 * address of the group.
1535 */
1536 if (mac_addr_changed && aggr_grp_update_ports_mac(grp))
1537 link_state_changed = B_TRUE;
1538
1539 done:
1540 if (mac_addr_changedp != NULL)
1541 *mac_addr_changedp = mac_addr_changed;
1542 if (link_state_changedp != NULL)
1543 *link_state_changedp = link_state_changed;
1544
1545 return (rc);
1546 }
1547
1548 /*
1549 * Remove one or more ports from an existing link aggregation group.
1550 */
1551 int
aggr_grp_rem_ports(datalink_id_t linkid,uint_t nports,laioc_port_t * ports)1552 aggr_grp_rem_ports(datalink_id_t linkid, uint_t nports, laioc_port_t *ports)
1553 {
1554 int rc = 0, i;
1555 aggr_grp_t *grp = NULL;
1556 aggr_port_t *port;
1557 boolean_t mac_addr_update = B_FALSE, mac_addr_changed;
1558 boolean_t link_state_update = B_FALSE, link_state_changed;
1559 mac_perim_handle_t mph, pmph;
1560
1561 /* get group corresponding to linkid */
1562 rw_enter(&aggr_grp_lock, RW_READER);
1563 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1564 (mod_hash_val_t *)&grp) != 0) {
1565 rw_exit(&aggr_grp_lock);
1566 return (ENOENT);
1567 }
1568 AGGR_GRP_REFHOLD(grp);
1569
1570 /*
1571 * Hold the perimeter so that the aggregation won't be destroyed.
1572 */
1573 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1574 rw_exit(&aggr_grp_lock);
1575
1576 /* we need to keep at least one port per group */
1577 if (nports >= grp->lg_nports) {
1578 rc = EINVAL;
1579 goto bail;
1580 }
1581
1582 /* first verify that all the groups are valid */
1583 for (i = 0; i < nports; i++) {
1584 if (aggr_grp_port_lookup(grp, ports[i].lp_linkid) == NULL) {
1585 /* port not found */
1586 rc = ENOENT;
1587 goto bail;
1588 }
1589 }
1590
1591 /* clear the promiscous mode for the specified ports */
1592 for (i = 0; i < nports && rc == 0; i++) {
1593 /* lookup port */
1594 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1595 ASSERT(port != NULL);
1596
1597 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1598 rc = aggr_port_promisc(port, B_FALSE);
1599 mac_perim_exit(pmph);
1600 }
1601 if (rc != 0) {
1602 for (i = 0; i < nports; i++) {
1603 port = aggr_grp_port_lookup(grp,
1604 ports[i].lp_linkid);
1605 ASSERT(port != NULL);
1606
1607 /*
1608 * Turn the promiscuous mode back on if it is required
1609 * to receive the non-primary address over a port, or
1610 * the promiscous mode is enabled over the aggr.
1611 */
1612 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1613 if (port->lp_started && (grp->lg_promisc ||
1614 port->lp_prom_addr != NULL)) {
1615 (void) aggr_port_promisc(port, B_TRUE);
1616 }
1617 mac_perim_exit(pmph);
1618 }
1619 goto bail;
1620 }
1621
1622 /* remove the specified ports from group */
1623 for (i = 0; i < nports; i++) {
1624 /* lookup port */
1625 port = aggr_grp_port_lookup(grp, ports[i].lp_linkid);
1626 ASSERT(port != NULL);
1627
1628 /* stop port if group has already been started */
1629 if (grp->lg_started) {
1630 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1631 aggr_port_stop(port);
1632 mac_perim_exit(pmph);
1633 }
1634
1635 /*
1636 * aggr_rem_pseudo_tx_group() is not called here. Instead
1637 * it is called from inside aggr_grp_rem_port() after the
1638 * port has been detached. The reason is that
1639 * aggr_rem_pseudo_tx_group() removes one ring at a time
1640 * and if there is still traffic going on, then there
1641 * is the possibility of aggr_find_tx_ring() returning a
1642 * removed ring for transmission. Once the port has been
1643 * detached, that port will not be used and
1644 * aggr_find_tx_ring() will not return any rings
1645 * belonging to it.
1646 */
1647 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1648
1649 /* remove port from group */
1650 rc = aggr_grp_rem_port(grp, port, &mac_addr_changed,
1651 &link_state_changed);
1652 ASSERT(rc == 0);
1653 mac_addr_update = mac_addr_update || mac_addr_changed;
1654 link_state_update = link_state_update || link_state_changed;
1655 }
1656
1657 bail:
1658 if (mac_addr_update)
1659 mac_unicst_update(grp->lg_mh, grp->lg_addr);
1660 if (link_state_update)
1661 mac_link_update(grp->lg_mh, grp->lg_link_state);
1662
1663 mac_perim_exit(mph);
1664 AGGR_GRP_REFRELE(grp);
1665
1666 return (rc);
1667 }
1668
1669 int
aggr_grp_delete(datalink_id_t linkid,cred_t * cred)1670 aggr_grp_delete(datalink_id_t linkid, cred_t *cred)
1671 {
1672 aggr_grp_t *grp = NULL;
1673 aggr_port_t *port, *cport;
1674 datalink_id_t tmpid;
1675 mod_hash_val_t val;
1676 mac_perim_handle_t mph, pmph;
1677 int err;
1678 kt_did_t tid = 0;
1679
1680 rw_enter(&aggr_grp_lock, RW_WRITER);
1681
1682 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1683 (mod_hash_val_t *)&grp) != 0) {
1684 rw_exit(&aggr_grp_lock);
1685 return (ENOENT);
1686 }
1687
1688 /*
1689 * Note that dls_devnet_destroy() must be called before lg_lock is
1690 * held. Otherwise, it will deadlock if another thread is in
1691 * aggr_m_stat() and thus has a kstat_hold() on the kstats that
1692 * dls_devnet_destroy() needs to delete.
1693 */
1694 if ((err = dls_devnet_destroy(grp->lg_mh, &tmpid, B_TRUE)) != 0) {
1695 rw_exit(&aggr_grp_lock);
1696 return (err);
1697 }
1698 ASSERT(linkid == tmpid);
1699
1700 /*
1701 * Unregister from the MAC service module. Since this can
1702 * fail if a client hasn't closed the MAC port, we gracefully
1703 * fail the operation.
1704 */
1705 if ((err = mac_disable(grp->lg_mh)) != 0) {
1706 (void) dls_devnet_create(grp->lg_mh, linkid, crgetzoneid(cred));
1707 rw_exit(&aggr_grp_lock);
1708 return (err);
1709 }
1710 (void) mod_hash_remove(aggr_grp_hash, GRP_HASH_KEY(linkid), &val);
1711 ASSERT(grp == (aggr_grp_t *)val);
1712
1713 ASSERT(aggr_grp_cnt > 0);
1714 aggr_grp_cnt--;
1715 rw_exit(&aggr_grp_lock);
1716
1717 /*
1718 * Inform the lacp_rx thread to exit.
1719 */
1720 mutex_enter(&grp->lg_lacp_lock);
1721 grp->lg_lacp_done = B_TRUE;
1722 cv_signal(&grp->lg_lacp_cv);
1723 while (grp->lg_lacp_rx_thread != NULL)
1724 cv_wait(&grp->lg_lacp_cv, &grp->lg_lacp_lock);
1725 mutex_exit(&grp->lg_lacp_lock);
1726 /*
1727 * Inform the tx_notify_thread to exit.
1728 */
1729 mutex_enter(&grp->lg_tx_flowctl_lock);
1730 if (grp->lg_tx_notify_thread != NULL) {
1731 tid = grp->lg_tx_notify_thread->t_did;
1732 grp->lg_tx_notify_done = B_TRUE;
1733 cv_signal(&grp->lg_tx_flowctl_cv);
1734 }
1735 mutex_exit(&grp->lg_tx_flowctl_lock);
1736 if (tid != 0)
1737 thread_join(tid);
1738
1739 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1740
1741 grp->lg_closing = B_TRUE;
1742 /* detach and free MAC ports associated with group */
1743 port = grp->lg_ports;
1744 while (port != NULL) {
1745 cport = port->lp_next;
1746 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1747 if (grp->lg_started)
1748 aggr_port_stop(port);
1749 (void) aggr_grp_detach_port(grp, port);
1750 mac_perim_exit(pmph);
1751 aggr_rem_pseudo_tx_group(port, &grp->lg_tx_group);
1752 aggr_rem_pseudo_rx_group(port, &grp->lg_rx_group);
1753 aggr_port_delete(port);
1754 port = cport;
1755 }
1756
1757 mac_perim_exit(mph);
1758
1759 kmem_free(grp->lg_tx_blocked_rings,
1760 (sizeof (mac_ring_handle_t *) * MAX_RINGS_PER_GROUP));
1761 /*
1762 * Wait for the port's lacp timer thread and its notification callback
1763 * to exit before calling mac_unregister() since both needs to access
1764 * the mac perimeter of the grp.
1765 */
1766 aggr_grp_port_wait(grp);
1767
1768 VERIFY(mac_unregister(grp->lg_mh) == 0);
1769 grp->lg_mh = NULL;
1770
1771 AGGR_GRP_REFRELE(grp);
1772 return (0);
1773 }
1774
1775 void
aggr_grp_free(aggr_grp_t * grp)1776 aggr_grp_free(aggr_grp_t *grp)
1777 {
1778 ASSERT(grp->lg_refs == 0);
1779 ASSERT(grp->lg_port_ref == 0);
1780 if (grp->lg_key > AGGR_MAX_KEY) {
1781 id_free(key_ids, grp->lg_key);
1782 grp->lg_key = 0;
1783 }
1784 kmem_cache_free(aggr_grp_cache, grp);
1785 }
1786
1787 int
aggr_grp_info(datalink_id_t linkid,void * fn_arg,aggr_grp_info_new_grp_fn_t new_grp_fn,aggr_grp_info_new_port_fn_t new_port_fn,cred_t * cred)1788 aggr_grp_info(datalink_id_t linkid, void *fn_arg,
1789 aggr_grp_info_new_grp_fn_t new_grp_fn,
1790 aggr_grp_info_new_port_fn_t new_port_fn, cred_t *cred)
1791 {
1792 aggr_grp_t *grp;
1793 aggr_port_t *port;
1794 mac_perim_handle_t mph, pmph;
1795 int rc = 0;
1796
1797 /*
1798 * Make sure that the aggregation link is visible from the caller's
1799 * zone.
1800 */
1801 if (!dls_devnet_islinkvisible(linkid, crgetzoneid(cred)))
1802 return (ENOENT);
1803
1804 rw_enter(&aggr_grp_lock, RW_READER);
1805
1806 if (mod_hash_find(aggr_grp_hash, GRP_HASH_KEY(linkid),
1807 (mod_hash_val_t *)&grp) != 0) {
1808 rw_exit(&aggr_grp_lock);
1809 return (ENOENT);
1810 }
1811 AGGR_GRP_REFHOLD(grp);
1812
1813 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1814 rw_exit(&aggr_grp_lock);
1815
1816 rc = new_grp_fn(fn_arg, grp->lg_linkid,
1817 (grp->lg_key > AGGR_MAX_KEY) ? 0 : grp->lg_key, grp->lg_addr,
1818 grp->lg_addr_fixed, grp->lg_force, grp->lg_tx_policy,
1819 grp->lg_nports, grp->lg_lacp_mode, grp->aggr.PeriodicTimer);
1820
1821 if (rc != 0)
1822 goto bail;
1823
1824 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1825 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1826 rc = new_port_fn(fn_arg, port->lp_linkid, port->lp_addr,
1827 port->lp_state, &port->lp_lacp.ActorOperPortState);
1828 mac_perim_exit(pmph);
1829
1830 if (rc != 0)
1831 goto bail;
1832 }
1833
1834 bail:
1835 mac_perim_exit(mph);
1836 AGGR_GRP_REFRELE(grp);
1837 return (rc);
1838 }
1839
1840 /*ARGSUSED*/
1841 static void
aggr_m_ioctl(void * arg,queue_t * q,mblk_t * mp)1842 aggr_m_ioctl(void *arg, queue_t *q, mblk_t *mp)
1843 {
1844 miocnak(q, mp, 0, ENOTSUP);
1845 }
1846
1847 static int
aggr_grp_stat(aggr_grp_t * grp,uint_t stat,uint64_t * val)1848 aggr_grp_stat(aggr_grp_t *grp, uint_t stat, uint64_t *val)
1849 {
1850 aggr_port_t *port;
1851 uint_t stat_index;
1852
1853 /* We only aggregate counter statistics. */
1854 if (IS_MAC_STAT(stat) && !MAC_STAT_ISACOUNTER(stat) ||
1855 IS_MACTYPE_STAT(stat) && !ETHER_STAT_ISACOUNTER(stat)) {
1856 return (ENOTSUP);
1857 }
1858
1859 /*
1860 * Counter statistics for a group are computed by aggregating the
1861 * counters of the members MACs while they were aggregated, plus
1862 * the residual counter of the group itself, which is updated each
1863 * time a MAC is removed from the group.
1864 */
1865 *val = 0;
1866 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1867 /* actual port statistic */
1868 *val += aggr_port_stat(port, stat);
1869 /*
1870 * minus the port stat when it was added, plus any residual
1871 * amount for the group.
1872 */
1873 if (IS_MAC_STAT(stat)) {
1874 stat_index = stat - MAC_STAT_MIN;
1875 *val -= port->lp_stat[stat_index];
1876 *val += grp->lg_stat[stat_index];
1877 } else if (IS_MACTYPE_STAT(stat)) {
1878 stat_index = stat - MACTYPE_STAT_MIN;
1879 *val -= port->lp_ether_stat[stat_index];
1880 *val += grp->lg_ether_stat[stat_index];
1881 }
1882 }
1883 return (0);
1884 }
1885
1886 int
aggr_rx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)1887 aggr_rx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1888 {
1889 aggr_pseudo_rx_ring_t *rx_ring = (aggr_pseudo_rx_ring_t *)rdriver;
1890
1891 if (rx_ring->arr_hw_rh != NULL) {
1892 *val = mac_pseudo_rx_ring_stat_get(rx_ring->arr_hw_rh, stat);
1893 } else {
1894 aggr_port_t *port = rx_ring->arr_port;
1895
1896 *val = mac_stat_get(port->lp_mh, stat);
1897
1898 }
1899 return (0);
1900 }
1901
1902 int
aggr_tx_ring_stat(mac_ring_driver_t rdriver,uint_t stat,uint64_t * val)1903 aggr_tx_ring_stat(mac_ring_driver_t rdriver, uint_t stat, uint64_t *val)
1904 {
1905 aggr_pseudo_tx_ring_t *tx_ring = (aggr_pseudo_tx_ring_t *)rdriver;
1906
1907 if (tx_ring->atr_hw_rh != NULL) {
1908 *val = mac_pseudo_tx_ring_stat_get(tx_ring->atr_hw_rh, stat);
1909 } else {
1910 aggr_port_t *port = tx_ring->atr_port;
1911
1912 *val = mac_stat_get(port->lp_mh, stat);
1913 }
1914 return (0);
1915 }
1916
1917 static int
aggr_m_stat(void * arg,uint_t stat,uint64_t * val)1918 aggr_m_stat(void *arg, uint_t stat, uint64_t *val)
1919 {
1920 aggr_grp_t *grp = arg;
1921 mac_perim_handle_t mph;
1922 int rval = 0;
1923
1924 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1925
1926 switch (stat) {
1927 case MAC_STAT_IFSPEED:
1928 *val = grp->lg_ifspeed;
1929 break;
1930
1931 case ETHER_STAT_LINK_DUPLEX:
1932 *val = grp->lg_link_duplex;
1933 break;
1934
1935 default:
1936 /*
1937 * For all other statistics, we return the aggregated stat
1938 * from the underlying ports. aggr_grp_stat() will set
1939 * rval appropriately if the statistic isn't a counter.
1940 */
1941 rval = aggr_grp_stat(grp, stat, val);
1942 }
1943
1944 mac_perim_exit(mph);
1945 return (rval);
1946 }
1947
1948 static int
aggr_m_start(void * arg)1949 aggr_m_start(void *arg)
1950 {
1951 aggr_grp_t *grp = arg;
1952 aggr_port_t *port;
1953 mac_perim_handle_t mph, pmph;
1954
1955 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1956
1957 /*
1958 * Attempts to start all configured members of the group.
1959 * Group members will be attached when their link-up notification
1960 * is received.
1961 */
1962 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1963 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1964 if (aggr_port_start(port) != 0) {
1965 mac_perim_exit(pmph);
1966 continue;
1967 }
1968
1969 /*
1970 * Turn on the promiscuous mode if it is required to receive
1971 * the non-primary address over a port, or the promiscous
1972 * mode is enabled over the aggr.
1973 */
1974 if (grp->lg_promisc || port->lp_prom_addr != NULL) {
1975 if (aggr_port_promisc(port, B_TRUE) != 0)
1976 aggr_port_stop(port);
1977 }
1978 mac_perim_exit(pmph);
1979 }
1980
1981 grp->lg_started = B_TRUE;
1982
1983 mac_perim_exit(mph);
1984 return (0);
1985 }
1986
1987 static void
aggr_m_stop(void * arg)1988 aggr_m_stop(void *arg)
1989 {
1990 aggr_grp_t *grp = arg;
1991 aggr_port_t *port;
1992 mac_perim_handle_t mph, pmph;
1993
1994 mac_perim_enter_by_mh(grp->lg_mh, &mph);
1995
1996 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
1997 mac_perim_enter_by_mh(port->lp_mh, &pmph);
1998
1999 /* reset port promiscuous mode */
2000 (void) aggr_port_promisc(port, B_FALSE);
2001
2002 aggr_port_stop(port);
2003 mac_perim_exit(pmph);
2004 }
2005
2006 grp->lg_started = B_FALSE;
2007 mac_perim_exit(mph);
2008 }
2009
2010 static int
aggr_m_promisc(void * arg,boolean_t on)2011 aggr_m_promisc(void *arg, boolean_t on)
2012 {
2013 aggr_grp_t *grp = arg;
2014 aggr_port_t *port;
2015 boolean_t link_state_changed = B_FALSE;
2016 mac_perim_handle_t mph, pmph;
2017
2018 AGGR_GRP_REFHOLD(grp);
2019 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2020
2021 ASSERT(!grp->lg_closing);
2022
2023 if (on == grp->lg_promisc)
2024 goto bail;
2025
2026 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2027 int err = 0;
2028
2029 mac_perim_enter_by_mh(port->lp_mh, &pmph);
2030 AGGR_PORT_REFHOLD(port);
2031 if (!on && (port->lp_prom_addr == NULL))
2032 err = aggr_port_promisc(port, B_FALSE);
2033 else if (on && port->lp_started)
2034 err = aggr_port_promisc(port, B_TRUE);
2035
2036 if (err != 0) {
2037 if (aggr_grp_detach_port(grp, port))
2038 link_state_changed = B_TRUE;
2039 } else {
2040 /*
2041 * If a port was detached because of a previous
2042 * failure changing the promiscuity, the port
2043 * is reattached when it successfully changes
2044 * the promiscuity now, and this might cause
2045 * the link state of the aggregation to change.
2046 */
2047 if (aggr_grp_attach_port(grp, port))
2048 link_state_changed = B_TRUE;
2049 }
2050 mac_perim_exit(pmph);
2051 AGGR_PORT_REFRELE(port);
2052 }
2053
2054 grp->lg_promisc = on;
2055
2056 if (link_state_changed)
2057 mac_link_update(grp->lg_mh, grp->lg_link_state);
2058
2059 bail:
2060 mac_perim_exit(mph);
2061 AGGR_GRP_REFRELE(grp);
2062
2063 return (0);
2064 }
2065
2066 static void
aggr_grp_port_rename(const char * new_name,void * arg)2067 aggr_grp_port_rename(const char *new_name, void *arg)
2068 {
2069 /*
2070 * aggr port's mac client name is the format of "aggr link name" plus
2071 * AGGR_PORT_NAME_DELIMIT plus "underneath link name".
2072 */
2073 int aggr_len, link_len, clnt_name_len, i;
2074 char *str_end, *str_st, *str_del;
2075 char aggr_name[MAXNAMELEN];
2076 char link_name[MAXNAMELEN];
2077 char *clnt_name;
2078 aggr_grp_t *aggr_grp = arg;
2079 aggr_port_t *aggr_port = aggr_grp->lg_ports;
2080
2081 for (i = 0; i < aggr_grp->lg_nports; i++) {
2082 clnt_name = mac_client_name(aggr_port->lp_mch);
2083 clnt_name_len = strlen(clnt_name);
2084 str_st = clnt_name;
2085 str_end = &(clnt_name[clnt_name_len]);
2086 str_del = strchr(str_st, AGGR_PORT_NAME_DELIMIT);
2087 ASSERT(str_del != NULL);
2088 aggr_len = (intptr_t)((uintptr_t)str_del - (uintptr_t)str_st);
2089 link_len = (intptr_t)((uintptr_t)str_end - (uintptr_t)str_del);
2090 bzero(aggr_name, MAXNAMELEN);
2091 bzero(link_name, MAXNAMELEN);
2092 bcopy(clnt_name, aggr_name, aggr_len);
2093 bcopy(str_del, link_name, link_len + 1);
2094 bzero(clnt_name, MAXNAMELEN);
2095 (void) snprintf(clnt_name, MAXNAMELEN, "%s%s", new_name,
2096 link_name);
2097
2098 (void) mac_rename_primary(aggr_port->lp_mh, NULL);
2099 aggr_port = aggr_port->lp_next;
2100 }
2101 }
2102
2103 /*
2104 * Initialize the capabilities that are advertised for the group
2105 * according to the capabilities of the constituent ports.
2106 */
2107 static boolean_t
aggr_m_capab_get(void * arg,mac_capab_t cap,void * cap_data)2108 aggr_m_capab_get(void *arg, mac_capab_t cap, void *cap_data)
2109 {
2110 aggr_grp_t *grp = arg;
2111
2112 switch (cap) {
2113 case MAC_CAPAB_HCKSUM: {
2114 uint32_t *hcksum_txflags = cap_data;
2115 *hcksum_txflags = grp->lg_hcksum_txflags;
2116 break;
2117 }
2118 case MAC_CAPAB_LSO: {
2119 mac_capab_lso_t *cap_lso = cap_data;
2120
2121 if (grp->lg_lso) {
2122 *cap_lso = grp->lg_cap_lso;
2123 break;
2124 } else {
2125 return (B_FALSE);
2126 }
2127 }
2128 case MAC_CAPAB_NO_NATIVEVLAN:
2129 return (!grp->lg_vlan);
2130 case MAC_CAPAB_NO_ZCOPY:
2131 return (!grp->lg_zcopy);
2132 case MAC_CAPAB_RINGS: {
2133 mac_capab_rings_t *cap_rings = cap_data;
2134
2135 if (cap_rings->mr_type == MAC_RING_TYPE_RX) {
2136 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2137 cap_rings->mr_rnum = grp->lg_rx_group.arg_ring_cnt;
2138
2139 /*
2140 * An aggregation advertises only one (pseudo) RX
2141 * group, which virtualizes the main/primary group of
2142 * the underlying devices.
2143 */
2144 cap_rings->mr_gnum = 1;
2145 cap_rings->mr_gaddring = NULL;
2146 cap_rings->mr_gremring = NULL;
2147 } else {
2148 cap_rings->mr_group_type = MAC_GROUP_TYPE_STATIC;
2149 cap_rings->mr_rnum = grp->lg_tx_group.atg_ring_cnt;
2150 cap_rings->mr_gnum = 0;
2151 }
2152 cap_rings->mr_rget = aggr_fill_ring;
2153 cap_rings->mr_gget = aggr_fill_group;
2154 break;
2155 }
2156 case MAC_CAPAB_AGGR:
2157 {
2158 mac_capab_aggr_t *aggr_cap;
2159
2160 if (cap_data != NULL) {
2161 aggr_cap = cap_data;
2162 aggr_cap->mca_rename_fn = aggr_grp_port_rename;
2163 aggr_cap->mca_unicst = aggr_m_unicst;
2164 aggr_cap->mca_find_tx_ring_fn = aggr_find_tx_ring;
2165 aggr_cap->mca_arg = arg;
2166 }
2167 return (B_TRUE);
2168 }
2169 default:
2170 return (B_FALSE);
2171 }
2172 return (B_TRUE);
2173 }
2174
2175 /*
2176 * Callback funtion for MAC layer to register groups.
2177 */
2178 static void
aggr_fill_group(void * arg,mac_ring_type_t rtype,const int index,mac_group_info_t * infop,mac_group_handle_t gh)2179 aggr_fill_group(void *arg, mac_ring_type_t rtype, const int index,
2180 mac_group_info_t *infop, mac_group_handle_t gh)
2181 {
2182 aggr_grp_t *grp = arg;
2183 aggr_pseudo_rx_group_t *rx_group;
2184 aggr_pseudo_tx_group_t *tx_group;
2185
2186 ASSERT(index == 0);
2187 if (rtype == MAC_RING_TYPE_RX) {
2188 rx_group = &grp->lg_rx_group;
2189 rx_group->arg_gh = gh;
2190 rx_group->arg_grp = grp;
2191
2192 infop->mgi_driver = (mac_group_driver_t)rx_group;
2193 infop->mgi_start = NULL;
2194 infop->mgi_stop = NULL;
2195 infop->mgi_addmac = aggr_addmac;
2196 infop->mgi_remmac = aggr_remmac;
2197 infop->mgi_count = rx_group->arg_ring_cnt;
2198 } else {
2199 tx_group = &grp->lg_tx_group;
2200 tx_group->atg_gh = gh;
2201 }
2202 }
2203
2204 /*
2205 * Callback funtion for MAC layer to register all rings.
2206 */
2207 static void
aggr_fill_ring(void * arg,mac_ring_type_t rtype,const int rg_index,const int index,mac_ring_info_t * infop,mac_ring_handle_t rh)2208 aggr_fill_ring(void *arg, mac_ring_type_t rtype, const int rg_index,
2209 const int index, mac_ring_info_t *infop, mac_ring_handle_t rh)
2210 {
2211 aggr_grp_t *grp = arg;
2212
2213 switch (rtype) {
2214 case MAC_RING_TYPE_RX: {
2215 aggr_pseudo_rx_group_t *rx_group = &grp->lg_rx_group;
2216 aggr_pseudo_rx_ring_t *rx_ring;
2217 mac_intr_t aggr_mac_intr;
2218
2219 ASSERT(rg_index == 0);
2220
2221 ASSERT((index >= 0) && (index < rx_group->arg_ring_cnt));
2222 rx_ring = rx_group->arg_rings + index;
2223 rx_ring->arr_rh = rh;
2224
2225 /*
2226 * Entrypoint to enable interrupt (disable poll) and
2227 * disable interrupt (enable poll).
2228 */
2229 aggr_mac_intr.mi_handle = (mac_intr_handle_t)rx_ring;
2230 aggr_mac_intr.mi_enable = aggr_pseudo_enable_intr;
2231 aggr_mac_intr.mi_disable = aggr_pseudo_disable_intr;
2232 aggr_mac_intr.mi_ddi_handle = NULL;
2233
2234 infop->mri_driver = (mac_ring_driver_t)rx_ring;
2235 infop->mri_start = aggr_pseudo_start_ring;
2236 infop->mri_stop = aggr_pseudo_stop_ring;
2237
2238 infop->mri_intr = aggr_mac_intr;
2239 infop->mri_poll = aggr_rx_poll;
2240
2241 infop->mri_stat = aggr_rx_ring_stat;
2242 break;
2243 }
2244 case MAC_RING_TYPE_TX: {
2245 aggr_pseudo_tx_group_t *tx_group = &grp->lg_tx_group;
2246 aggr_pseudo_tx_ring_t *tx_ring;
2247
2248 ASSERT(rg_index == -1);
2249 ASSERT(index < tx_group->atg_ring_cnt);
2250
2251 tx_ring = &tx_group->atg_rings[index];
2252 tx_ring->atr_rh = rh;
2253
2254 infop->mri_driver = (mac_ring_driver_t)tx_ring;
2255 infop->mri_start = NULL;
2256 infop->mri_stop = NULL;
2257 infop->mri_tx = aggr_ring_tx;
2258 infop->mri_stat = aggr_tx_ring_stat;
2259 /*
2260 * Use the hw TX ring handle to find if the ring needs
2261 * serialization or not. For NICs that do not expose
2262 * Tx rings, atr_hw_rh will be NULL.
2263 */
2264 if (tx_ring->atr_hw_rh != NULL) {
2265 infop->mri_flags =
2266 mac_hwring_getinfo(tx_ring->atr_hw_rh);
2267 }
2268 break;
2269 }
2270 default:
2271 break;
2272 }
2273 }
2274
2275 static mblk_t *
aggr_rx_poll(void * arg,int bytes_to_pickup)2276 aggr_rx_poll(void *arg, int bytes_to_pickup)
2277 {
2278 aggr_pseudo_rx_ring_t *rr_ring = arg;
2279 aggr_port_t *port = rr_ring->arr_port;
2280 aggr_grp_t *grp = port->lp_grp;
2281 mblk_t *mp_chain, *mp, **mpp;
2282
2283 mp_chain = mac_hwring_poll(rr_ring->arr_hw_rh, bytes_to_pickup);
2284
2285 if (grp->lg_lacp_mode == AGGR_LACP_OFF)
2286 return (mp_chain);
2287
2288 mpp = &mp_chain;
2289 while ((mp = *mpp) != NULL) {
2290 if (MBLKL(mp) >= sizeof (struct ether_header)) {
2291 struct ether_header *ehp;
2292
2293 ehp = (struct ether_header *)mp->b_rptr;
2294 if (ntohs(ehp->ether_type) == ETHERTYPE_SLOW) {
2295 *mpp = mp->b_next;
2296 mp->b_next = NULL;
2297 aggr_recv_lacp(port,
2298 (mac_resource_handle_t)rr_ring, mp);
2299 continue;
2300 }
2301 }
2302
2303 if (!port->lp_collector_enabled) {
2304 *mpp = mp->b_next;
2305 mp->b_next = NULL;
2306 freemsg(mp);
2307 continue;
2308 }
2309 mpp = &mp->b_next;
2310 }
2311 return (mp_chain);
2312 }
2313
2314 static int
aggr_addmac(void * arg,const uint8_t * mac_addr)2315 aggr_addmac(void *arg, const uint8_t *mac_addr)
2316 {
2317 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2318 aggr_unicst_addr_t *addr, **pprev;
2319 aggr_grp_t *grp = rx_group->arg_grp;
2320 aggr_port_t *port, *p;
2321 mac_perim_handle_t mph;
2322 int err = 0;
2323
2324 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2325
2326 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2327 mac_perim_exit(mph);
2328 return (0);
2329 }
2330
2331 /*
2332 * Insert this mac address into the list of mac addresses owned by
2333 * the aggregation pseudo group.
2334 */
2335 pprev = &rx_group->arg_macaddr;
2336 while ((addr = *pprev) != NULL) {
2337 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) == 0) {
2338 mac_perim_exit(mph);
2339 return (EEXIST);
2340 }
2341 pprev = &addr->aua_next;
2342 }
2343 addr = kmem_alloc(sizeof (aggr_unicst_addr_t), KM_SLEEP);
2344 bcopy(mac_addr, addr->aua_addr, ETHERADDRL);
2345 addr->aua_next = NULL;
2346 *pprev = addr;
2347
2348 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2349 if ((err = aggr_port_addmac(port, mac_addr)) != 0)
2350 break;
2351
2352 if (err != 0) {
2353 for (p = grp->lg_ports; p != port; p = p->lp_next)
2354 aggr_port_remmac(p, mac_addr);
2355
2356 *pprev = NULL;
2357 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2358 }
2359
2360 mac_perim_exit(mph);
2361 return (err);
2362 }
2363
2364 static int
aggr_remmac(void * arg,const uint8_t * mac_addr)2365 aggr_remmac(void *arg, const uint8_t *mac_addr)
2366 {
2367 aggr_pseudo_rx_group_t *rx_group = (aggr_pseudo_rx_group_t *)arg;
2368 aggr_unicst_addr_t *addr, **pprev;
2369 aggr_grp_t *grp = rx_group->arg_grp;
2370 aggr_port_t *port;
2371 mac_perim_handle_t mph;
2372 int err = 0;
2373
2374 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2375
2376 if (bcmp(mac_addr, grp->lg_addr, ETHERADDRL) == 0) {
2377 mac_perim_exit(mph);
2378 return (0);
2379 }
2380
2381 /*
2382 * Insert this mac address into the list of mac addresses owned by
2383 * the aggregation pseudo group.
2384 */
2385 pprev = &rx_group->arg_macaddr;
2386 while ((addr = *pprev) != NULL) {
2387 if (bcmp(mac_addr, addr->aua_addr, ETHERADDRL) != 0) {
2388 pprev = &addr->aua_next;
2389 continue;
2390 }
2391 break;
2392 }
2393 if (addr == NULL) {
2394 mac_perim_exit(mph);
2395 return (EINVAL);
2396 }
2397
2398 for (port = grp->lg_ports; port != NULL; port = port->lp_next)
2399 aggr_port_remmac(port, mac_addr);
2400
2401 *pprev = addr->aua_next;
2402 kmem_free(addr, sizeof (aggr_unicst_addr_t));
2403
2404 mac_perim_exit(mph);
2405 return (err);
2406 }
2407
2408 /*
2409 * Add or remove the multicast addresses that are defined for the group
2410 * to or from the specified port.
2411 *
2412 * Note that aggr_grp_multicst_port(..., B_TRUE) is called when the port
2413 * is started and attached, and aggr_grp_multicst_port(..., B_FALSE) is
2414 * called when the port is either stopped or detached.
2415 */
2416 void
aggr_grp_multicst_port(aggr_port_t * port,boolean_t add)2417 aggr_grp_multicst_port(aggr_port_t *port, boolean_t add)
2418 {
2419 aggr_grp_t *grp = port->lp_grp;
2420
2421 ASSERT(MAC_PERIM_HELD(port->lp_mh));
2422 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2423
2424 if (!port->lp_started || port->lp_state != AGGR_PORT_STATE_ATTACHED)
2425 return;
2426
2427 mac_multicast_refresh(grp->lg_mh, aggr_port_multicst, port, add);
2428 }
2429
2430 static int
aggr_m_multicst(void * arg,boolean_t add,const uint8_t * addrp)2431 aggr_m_multicst(void *arg, boolean_t add, const uint8_t *addrp)
2432 {
2433 aggr_grp_t *grp = arg;
2434 aggr_port_t *port = NULL, *errport = NULL;
2435 mac_perim_handle_t mph;
2436 int err = 0;
2437
2438 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2439 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2440 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2441 !port->lp_started) {
2442 continue;
2443 }
2444 err = aggr_port_multicst(port, add, addrp);
2445 if (err != 0) {
2446 errport = port;
2447 break;
2448 }
2449 }
2450
2451 /*
2452 * At least one port caused error return and this error is returned to
2453 * mac, eventually a NAK would be sent upwards.
2454 * Some ports have this multicast address listed now, and some don't.
2455 * Treat this error as a whole aggr failure not individual port failure.
2456 * Therefore remove this multicast address from other ports.
2457 */
2458 if ((err != 0) && add) {
2459 for (port = grp->lg_ports; port != errport;
2460 port = port->lp_next) {
2461 if (port->lp_state != AGGR_PORT_STATE_ATTACHED ||
2462 !port->lp_started) {
2463 continue;
2464 }
2465 (void) aggr_port_multicst(port, B_FALSE, addrp);
2466 }
2467 }
2468 mac_perim_exit(mph);
2469 return (err);
2470 }
2471
2472 static int
aggr_m_unicst(void * arg,const uint8_t * macaddr)2473 aggr_m_unicst(void *arg, const uint8_t *macaddr)
2474 {
2475 aggr_grp_t *grp = arg;
2476 mac_perim_handle_t mph;
2477 int err;
2478
2479 mac_perim_enter_by_mh(grp->lg_mh, &mph);
2480 err = aggr_grp_modify_common(grp, AGGR_MODIFY_MAC, 0, B_TRUE, macaddr,
2481 0, 0);
2482 mac_perim_exit(mph);
2483 return (err);
2484 }
2485
2486 /*
2487 * Initialize the capabilities that are advertised for the group
2488 * according to the capabilities of the constituent ports.
2489 */
2490 static void
aggr_grp_capab_set(aggr_grp_t * grp)2491 aggr_grp_capab_set(aggr_grp_t *grp)
2492 {
2493 uint32_t cksum;
2494 aggr_port_t *port;
2495 mac_capab_lso_t cap_lso;
2496
2497 ASSERT(grp->lg_mh == NULL);
2498 ASSERT(grp->lg_ports != NULL);
2499
2500 grp->lg_hcksum_txflags = (uint32_t)-1;
2501 grp->lg_zcopy = B_TRUE;
2502 grp->lg_vlan = B_TRUE;
2503
2504 grp->lg_lso = B_TRUE;
2505 grp->lg_cap_lso.lso_flags = (t_uscalar_t)-1;
2506 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max = (t_uscalar_t)-1;
2507
2508 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2509 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &cksum))
2510 cksum = 0;
2511 grp->lg_hcksum_txflags &= cksum;
2512
2513 grp->lg_vlan &=
2514 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL);
2515
2516 grp->lg_zcopy &=
2517 !mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL);
2518
2519 grp->lg_lso &=
2520 mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso);
2521 if (grp->lg_lso) {
2522 grp->lg_cap_lso.lso_flags &= cap_lso.lso_flags;
2523 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2524 cap_lso.lso_basic_tcp_ipv4.lso_max)
2525 grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max =
2526 cap_lso.lso_basic_tcp_ipv4.lso_max;
2527 }
2528 }
2529 }
2530
2531 /*
2532 * Checks whether the capabilities of the port being added are compatible
2533 * with the current capabilities of the aggregation.
2534 */
2535 static boolean_t
aggr_grp_capab_check(aggr_grp_t * grp,aggr_port_t * port)2536 aggr_grp_capab_check(aggr_grp_t *grp, aggr_port_t *port)
2537 {
2538 uint32_t hcksum_txflags;
2539
2540 ASSERT(grp->lg_ports != NULL);
2541
2542 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_NATIVEVLAN, NULL)) &
2543 grp->lg_vlan) != grp->lg_vlan) {
2544 return (B_FALSE);
2545 }
2546
2547 if (((!mac_capab_get(port->lp_mh, MAC_CAPAB_NO_ZCOPY, NULL)) &
2548 grp->lg_zcopy) != grp->lg_zcopy) {
2549 return (B_FALSE);
2550 }
2551
2552 if (!mac_capab_get(port->lp_mh, MAC_CAPAB_HCKSUM, &hcksum_txflags)) {
2553 if (grp->lg_hcksum_txflags != 0)
2554 return (B_FALSE);
2555 } else if ((hcksum_txflags & grp->lg_hcksum_txflags) !=
2556 grp->lg_hcksum_txflags) {
2557 return (B_FALSE);
2558 }
2559
2560 if (grp->lg_lso) {
2561 mac_capab_lso_t cap_lso;
2562
2563 if (mac_capab_get(port->lp_mh, MAC_CAPAB_LSO, &cap_lso)) {
2564 if ((grp->lg_cap_lso.lso_flags & cap_lso.lso_flags) !=
2565 grp->lg_cap_lso.lso_flags)
2566 return (B_FALSE);
2567 if (grp->lg_cap_lso.lso_basic_tcp_ipv4.lso_max >
2568 cap_lso.lso_basic_tcp_ipv4.lso_max)
2569 return (B_FALSE);
2570 } else {
2571 return (B_FALSE);
2572 }
2573 }
2574
2575 return (B_TRUE);
2576 }
2577
2578 /*
2579 * Returns the maximum SDU according to the SDU of the constituent ports.
2580 */
2581 static uint_t
aggr_grp_max_sdu(aggr_grp_t * grp)2582 aggr_grp_max_sdu(aggr_grp_t *grp)
2583 {
2584 uint_t max_sdu = (uint_t)-1;
2585 aggr_port_t *port;
2586
2587 ASSERT(grp->lg_ports != NULL);
2588
2589 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2590 uint_t port_sdu_max;
2591
2592 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2593 if (max_sdu > port_sdu_max)
2594 max_sdu = port_sdu_max;
2595 }
2596
2597 return (max_sdu);
2598 }
2599
2600 /*
2601 * Checks if the maximum SDU of the specified port is compatible
2602 * with the maximum SDU of the specified aggregation group, returns
2603 * B_TRUE if it is, B_FALSE otherwise.
2604 */
2605 static boolean_t
aggr_grp_sdu_check(aggr_grp_t * grp,aggr_port_t * port)2606 aggr_grp_sdu_check(aggr_grp_t *grp, aggr_port_t *port)
2607 {
2608 uint_t port_sdu_max;
2609
2610 mac_sdu_get(port->lp_mh, NULL, &port_sdu_max);
2611 return (port_sdu_max >= grp->lg_max_sdu);
2612 }
2613
2614 /*
2615 * Returns the maximum margin according to the margin of the constituent ports.
2616 */
2617 static uint32_t
aggr_grp_max_margin(aggr_grp_t * grp)2618 aggr_grp_max_margin(aggr_grp_t *grp)
2619 {
2620 uint32_t margin = UINT32_MAX;
2621 aggr_port_t *port;
2622
2623 ASSERT(grp->lg_mh == NULL);
2624 ASSERT(grp->lg_ports != NULL);
2625
2626 for (port = grp->lg_ports; port != NULL; port = port->lp_next) {
2627 if (margin > port->lp_margin)
2628 margin = port->lp_margin;
2629 }
2630
2631 grp->lg_margin = margin;
2632 return (margin);
2633 }
2634
2635 /*
2636 * Checks if the maximum margin of the specified port is compatible
2637 * with the maximum margin of the specified aggregation group, returns
2638 * B_TRUE if it is, B_FALSE otherwise.
2639 */
2640 static boolean_t
aggr_grp_margin_check(aggr_grp_t * grp,aggr_port_t * port)2641 aggr_grp_margin_check(aggr_grp_t *grp, aggr_port_t *port)
2642 {
2643 if (port->lp_margin >= grp->lg_margin)
2644 return (B_TRUE);
2645
2646 /*
2647 * See whether the current margin value is allowed to be changed to
2648 * the new value.
2649 */
2650 if (!mac_margin_update(grp->lg_mh, port->lp_margin))
2651 return (B_FALSE);
2652
2653 grp->lg_margin = port->lp_margin;
2654 return (B_TRUE);
2655 }
2656
2657 /*
2658 * Set MTU on individual ports of an aggregation group
2659 */
2660 static int
aggr_set_port_sdu(aggr_grp_t * grp,aggr_port_t * port,uint32_t sdu,uint32_t * old_mtu)2661 aggr_set_port_sdu(aggr_grp_t *grp, aggr_port_t *port, uint32_t sdu,
2662 uint32_t *old_mtu)
2663 {
2664 boolean_t removed = B_FALSE;
2665 mac_perim_handle_t mph;
2666 mac_diag_t diag;
2667 int err, rv, retry = 0;
2668
2669 if (port->lp_mah != NULL) {
2670 (void) mac_unicast_remove(port->lp_mch, port->lp_mah);
2671 port->lp_mah = NULL;
2672 removed = B_TRUE;
2673 }
2674 err = mac_set_mtu(port->lp_mh, sdu, old_mtu);
2675 try_again:
2676 if (removed && (rv = mac_unicast_add(port->lp_mch, NULL,
2677 MAC_UNICAST_PRIMARY | MAC_UNICAST_DISABLE_TX_VID_CHECK,
2678 &port->lp_mah, 0, &diag)) != 0) {
2679 /*
2680 * following is a workaround for a bug in 'bge' driver.
2681 * See CR 6794654 for more information and this work around
2682 * will be removed once the CR is fixed.
2683 */
2684 if (rv == EIO && retry++ < 3) {
2685 delay(2 * hz);
2686 goto try_again;
2687 }
2688 /*
2689 * if mac_unicast_add() failed while setting the MTU,
2690 * detach the port from the group.
2691 */
2692 mac_perim_enter_by_mh(port->lp_mh, &mph);
2693 (void) aggr_grp_detach_port(grp, port);
2694 mac_perim_exit(mph);
2695 cmn_err(CE_WARN, "Unable to restart the port %s while "
2696 "setting MTU. Detaching the port from the aggregation.",
2697 mac_client_name(port->lp_mch));
2698 }
2699 return (err);
2700 }
2701
2702 static int
aggr_sdu_update(aggr_grp_t * grp,uint32_t sdu)2703 aggr_sdu_update(aggr_grp_t *grp, uint32_t sdu)
2704 {
2705 int err = 0, i, rv;
2706 aggr_port_t *port;
2707 uint32_t *mtu;
2708
2709 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2710
2711 /*
2712 * If the MTU being set is equal to aggr group's maximum
2713 * allowable value, then there is nothing to change
2714 */
2715 if (sdu == grp->lg_max_sdu)
2716 return (0);
2717
2718 /* 0 is aggr group's min sdu */
2719 if (sdu == 0)
2720 return (EINVAL);
2721
2722 mtu = kmem_alloc(sizeof (uint32_t) * grp->lg_nports, KM_SLEEP);
2723 for (port = grp->lg_ports, i = 0; port != NULL && err == 0;
2724 port = port->lp_next, i++) {
2725 err = aggr_set_port_sdu(grp, port, sdu, mtu + i);
2726 }
2727 if (err != 0) {
2728 /* recover from error: reset the mtus of the ports */
2729 aggr_port_t *tmp;
2730
2731 for (tmp = grp->lg_ports, i = 0; tmp != port;
2732 tmp = tmp->lp_next, i++) {
2733 (void) aggr_set_port_sdu(grp, tmp, *(mtu + i), NULL);
2734 }
2735 goto bail;
2736 }
2737 grp->lg_max_sdu = aggr_grp_max_sdu(grp);
2738 rv = mac_maxsdu_update(grp->lg_mh, grp->lg_max_sdu);
2739 ASSERT(rv == 0);
2740 bail:
2741 kmem_free(mtu, sizeof (uint32_t) * grp->lg_nports);
2742 return (err);
2743 }
2744
2745 /*
2746 * Callback functions for set/get of properties
2747 */
2748 /*ARGSUSED*/
2749 static int
aggr_m_setprop(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,uint_t pr_valsize,const void * pr_val)2750 aggr_m_setprop(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2751 uint_t pr_valsize, const void *pr_val)
2752 {
2753 int err = ENOTSUP;
2754 aggr_grp_t *grp = m_driver;
2755
2756 switch (pr_num) {
2757 case MAC_PROP_MTU: {
2758 uint32_t mtu;
2759
2760 if (pr_valsize < sizeof (mtu)) {
2761 err = EINVAL;
2762 break;
2763 }
2764 bcopy(pr_val, &mtu, sizeof (mtu));
2765 err = aggr_sdu_update(grp, mtu);
2766 break;
2767 }
2768 default:
2769 break;
2770 }
2771 return (err);
2772 }
2773
/*
 * One boundary (endpoint) of an MTU range, as stored in the temporary
 * array built by aggr_mtu_range_intersection().
 */
typedef struct rboundary {
	uint32_t bval;	/* boundary value: a range's min or max */
	int btype;	/* +1 for a min boundary, -1 for a max boundary */
} rboundary_t;
2778
2779 /*
2780 * This function finds the intersection of mtu ranges stored in arrays -
2781 * mrange[0] ... mrange[mcount -1]. It returns the intersection in rval.
2782 * Individual arrays are assumed to contain non-overlapping ranges.
2783 * Algorithm:
2784 * A range has two boundaries - min and max. We scan all arrays and store
2785 * each boundary as a separate element in a temporary array. We also store
2786 * the boundary types, min or max, as +1 or -1 respectively in the temporary
2787 * array. Then we sort the temporary array in ascending order. We scan the
2788 * sorted array from lower to higher values and keep a cumulative sum of
2789 * boundary types. Element in the temporary array for which the sum reaches
2790 * mcount is a min boundary of a range in the result and next element will be
2791 * max boundary.
2792 *
2793 * Example for mcount = 3,
2794 *
2795 * ----|_________|-------|_______|----|__|------ mrange[0]
2796 *
2797 * -------|________|--|____________|-----|___|-- mrange[1]
2798 *
2799 * --------|________________|-------|____|------ mrange[2]
2800 *
2801 * 3 2 1
2802 * \|/
2803 * 1 23 2 1 2 3 2 1 01 2 V 0 <- the sum
2804 * ----|--||-----|-|--|--|--|----|-||-|--|---|-- sorted array
2805 *
2806 * same min and max
2807 * V
2808 * --------|_____|-------|__|------------|------ intersecting ranges
2809 */
2810 void
aggr_mtu_range_intersection(mac_propval_range_t ** mrange,int mcount,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)2811 aggr_mtu_range_intersection(mac_propval_range_t **mrange, int mcount,
2812 mac_propval_uint32_range_t **prval, int *prmaxcnt, int *prcount)
2813 {
2814 mac_propval_uint32_range_t *rval, *ur;
2815 int rmaxcnt, rcount;
2816 size_t sz_range32;
2817 rboundary_t *ta; /* temporary array */
2818 rboundary_t temp;
2819 boolean_t range_started = B_FALSE;
2820 int i, j, m, sum;
2821
2822 sz_range32 = sizeof (mac_propval_uint32_range_t);
2823
2824 for (i = 0, rmaxcnt = 0; i < mcount; i++)
2825 rmaxcnt += mrange[i]->mpr_count;
2826
2827 /* Allocate enough space to store the results */
2828 rval = kmem_alloc(rmaxcnt * sz_range32, KM_SLEEP);
2829
2830 /* Number of boundaries are twice as many as ranges */
2831 ta = kmem_alloc(2 * rmaxcnt * sizeof (rboundary_t), KM_SLEEP);
2832
2833 for (i = 0, m = 0; i < mcount; i++) {
2834 ur = &(mrange[i]->mpr_range_uint32[0]);
2835 for (j = 0; j < mrange[i]->mpr_count; j++) {
2836 ta[m].bval = ur[j].mpur_min;
2837 ta[m++].btype = 1;
2838 ta[m].bval = ur[j].mpur_max;
2839 ta[m++].btype = -1;
2840 }
2841 }
2842
2843 /*
2844 * Sort the temporary array in ascending order of bval;
2845 * if boundary values are same then sort on btype.
2846 */
2847 for (i = 0; i < m-1; i++) {
2848 for (j = i+1; j < m; j++) {
2849 if ((ta[i].bval > ta[j].bval) ||
2850 ((ta[i].bval == ta[j].bval) &&
2851 (ta[i].btype < ta[j].btype))) {
2852 temp = ta[i];
2853 ta[i] = ta[j];
2854 ta[j] = temp;
2855 }
2856 }
2857 }
2858
2859 /* Walk through temporary array to find all ranges in the results */
2860 for (i = 0, sum = 0, rcount = 0; i < m; i++) {
2861 sum += ta[i].btype;
2862 if (sum == mcount) {
2863 rval[rcount].mpur_min = ta[i].bval;
2864 range_started = B_TRUE;
2865 } else if (sum < mcount && range_started) {
2866 rval[rcount++].mpur_max = ta[i].bval;
2867 range_started = B_FALSE;
2868 }
2869 }
2870
2871 *prval = rval;
2872 *prmaxcnt = rmaxcnt;
2873 *prcount = rcount;
2874
2875 kmem_free(ta, 2 * rmaxcnt * sizeof (rboundary_t));
2876 }
2877
2878 /*
2879 * Returns the mtu ranges which could be supported by aggr group.
2880 * prmaxcnt returns the size of the buffer prval, prcount returns
2881 * the number of valid entries in prval. Caller is responsible
2882 * for freeing up prval.
2883 */
2884 int
aggr_grp_possible_mtu_range(aggr_grp_t * grp,mac_propval_uint32_range_t ** prval,int * prmaxcnt,int * prcount)2885 aggr_grp_possible_mtu_range(aggr_grp_t *grp, mac_propval_uint32_range_t **prval,
2886 int *prmaxcnt, int *prcount)
2887 {
2888 mac_propval_range_t **vals;
2889 aggr_port_t *port;
2890 mac_perim_handle_t mph;
2891 uint_t i, numr;
2892 int err = 0;
2893 size_t sz_propval, sz_range32;
2894 size_t size;
2895
2896 sz_propval = sizeof (mac_propval_range_t);
2897 sz_range32 = sizeof (mac_propval_uint32_range_t);
2898
2899 ASSERT(MAC_PERIM_HELD(grp->lg_mh));
2900
2901 vals = kmem_zalloc(sizeof (mac_propval_range_t *) * grp->lg_nports,
2902 KM_SLEEP);
2903
2904 for (port = grp->lg_ports, i = 0; port != NULL;
2905 port = port->lp_next, i++) {
2906
2907 size = sz_propval;
2908 vals[i] = kmem_alloc(size, KM_SLEEP);
2909 vals[i]->mpr_count = 1;
2910
2911 mac_perim_enter_by_mh(port->lp_mh, &mph);
2912
2913 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2914 NULL, 0, vals[i], NULL);
2915 if (err == ENOSPC) {
2916 /*
2917 * Not enough space to hold all ranges.
2918 * Allocate extra space as indicated and retry.
2919 */
2920 numr = vals[i]->mpr_count;
2921 kmem_free(vals[i], sz_propval);
2922 size = sz_propval + (numr - 1) * sz_range32;
2923 vals[i] = kmem_alloc(size, KM_SLEEP);
2924 vals[i]->mpr_count = numr;
2925 err = mac_prop_info(port->lp_mh, MAC_PROP_MTU, NULL,
2926 NULL, 0, vals[i], NULL);
2927 ASSERT(err != ENOSPC);
2928 }
2929 mac_perim_exit(mph);
2930 if (err != 0) {
2931 kmem_free(vals[i], size);
2932 vals[i] = NULL;
2933 break;
2934 }
2935 }
2936
2937 /*
2938 * if any of the underlying ports does not support changing MTU then
2939 * just return ENOTSUP
2940 */
2941 if (port != NULL) {
2942 ASSERT(err != 0);
2943 goto done;
2944 }
2945
2946 aggr_mtu_range_intersection(vals, grp->lg_nports, prval, prmaxcnt,
2947 prcount);
2948
2949 done:
2950 for (i = 0; i < grp->lg_nports; i++) {
2951 if (vals[i] != NULL) {
2952 numr = vals[i]->mpr_count;
2953 size = sz_propval + (numr - 1) * sz_range32;
2954 kmem_free(vals[i], size);
2955 }
2956 }
2957
2958 kmem_free(vals, sizeof (mac_propval_range_t *) * grp->lg_nports);
2959 return (err);
2960 }
2961
2962 static void
aggr_m_propinfo(void * m_driver,const char * pr_name,mac_prop_id_t pr_num,mac_prop_info_handle_t prh)2963 aggr_m_propinfo(void *m_driver, const char *pr_name, mac_prop_id_t pr_num,
2964 mac_prop_info_handle_t prh)
2965 {
2966 aggr_grp_t *grp = m_driver;
2967 mac_propval_uint32_range_t *rval = NULL;
2968 int i, rcount, rmaxcnt;
2969 int err = 0;
2970
2971 _NOTE(ARGUNUSED(pr_name));
2972
2973 switch (pr_num) {
2974 case MAC_PROP_MTU:
2975
2976 err = aggr_grp_possible_mtu_range(grp, &rval, &rmaxcnt,
2977 &rcount);
2978 if (err != 0) {
2979 ASSERT(rval == NULL);
2980 return;
2981 }
2982 for (i = 0; i < rcount; i++) {
2983 mac_prop_info_set_range_uint32(prh,
2984 rval[i].mpur_min, rval[i].mpur_max);
2985 }
2986 kmem_free(rval, sizeof (mac_propval_uint32_range_t) * rmaxcnt);
2987 break;
2988 }
2989 }
2990