xref: /dflybsd-src/sys/net/if.c (revision e586f31ca9899b49a4fc156613d9ecd853defcec)
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)if.c	8.3 (Berkeley) 1/4/94
30  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
31  */
32 
33 #include "opt_inet6.h"
34 #include "opt_inet.h"
35 #include "opt_ifpoll.h"
36 
37 #include <sys/param.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/priv.h>
43 #include <sys/protosw.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/socketops.h>
47 #include <sys/kernel.h>
48 #include <sys/ktr.h>
49 #include <sys/mutex.h>
50 #include <sys/sockio.h>
51 #include <sys/syslog.h>
52 #include <sys/sysctl.h>
53 #include <sys/domain.h>
54 #include <sys/thread.h>
55 #include <sys/serialize.h>
56 #include <sys/bus.h>
57 
58 #include <sys/thread2.h>
59 #include <sys/msgport2.h>
60 #include <sys/mutex2.h>
61 
62 #include <net/if.h>
63 #include <net/if_arp.h>
64 #include <net/if_dl.h>
65 #include <net/if_types.h>
66 #include <net/if_var.h>
67 #include <net/ifq_var.h>
68 #include <net/radix.h>
69 #include <net/route.h>
70 #include <net/if_clone.h>
71 #include <net/netisr2.h>
72 #include <net/netmsg2.h>
73 
74 #include <machine/atomic.h>
75 #include <machine/stdarg.h>
76 #include <machine/smp.h>
77 
78 #if defined(INET) || defined(INET6)
79 /*XXX*/
80 #include <netinet/in.h>
81 #include <netinet/in_var.h>
82 #include <netinet/if_ether.h>
83 #ifdef INET6
84 #include <netinet6/in6_var.h>
85 #include <netinet6/in6_ifattach.h>
86 #endif
87 #endif
88 
89 struct netmsg_ifaddr {
90 	struct netmsg_base base;
91 	struct ifaddr	*ifa;
92 	struct ifnet	*ifp;
93 	int		tail;
94 };
95 
96 struct ifsubq_stage_head {
97 	TAILQ_HEAD(, ifsubq_stage)	stg_head;
98 } __cachealign;
99 
100 struct if_ringmap {
101 	int		rm_cnt;
102 	int		rm_grid;
103 	int		rm_cpumap[];
104 };
105 
106 /*
107  * System initialization
108  */
109 static void	if_attachdomain(void *);
110 static void	if_attachdomain1(struct ifnet *);
111 static int	ifconf(u_long, caddr_t, struct ucred *);
112 static void	ifinit(void *);
113 static void	ifnetinit(void *);
114 static void	if_slowtimo(void *);
115 static void	link_rtrequest(int, struct rtentry *);
116 static int	if_rtdel(struct radix_node *, void *);
117 static void	if_slowtimo_dispatch(netmsg_t);
118 
119 /* Helper functions */
120 static void	ifsq_watchdog_reset(struct ifsubq_watchdog *);
121 static int	if_delmulti_serialized(struct ifnet *, struct sockaddr *);
122 static struct ifnet_array *ifnet_array_alloc(int);
123 static void	ifnet_array_free(struct ifnet_array *);
124 static struct ifnet_array *ifnet_array_add(struct ifnet *,
125 		    const struct ifnet_array *);
126 static struct ifnet_array *ifnet_array_del(struct ifnet *,
127 		    const struct ifnet_array *);
128 
129 #ifdef INET6
130 /*
131  * XXX: declare here to avoid to include many inet6 related files..
132  * should be more generalized?
133  */
134 extern void	nd6_setmtu(struct ifnet *);
135 #endif
136 
137 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
138 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
139 SYSCTL_NODE(_net_link, OID_AUTO, ringmap, CTLFLAG_RW, 0, "link ringmap");
140 
141 static int ifsq_stage_cntmax = 4;
142 TUNABLE_INT("net.link.stage_cntmax", &ifsq_stage_cntmax);
143 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
144     &ifsq_stage_cntmax, 0, "ifq staging packet count max");
145 
146 static int if_stats_compat = 0;
147 SYSCTL_INT(_net_link, OID_AUTO, stats_compat, CTLFLAG_RW,
148     &if_stats_compat, 0, "Compat the old ifnet stats");
149 
150 static int if_ringmap_dumprdr = 0;
151 SYSCTL_INT(_net_link_ringmap, OID_AUTO, dump_rdr, CTLFLAG_RW,
152     &if_ringmap_dumprdr, 0, "dump redirect table");
153 
154 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL);
155 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, ifnetinit, NULL);
156 
157 static  if_com_alloc_t *if_com_alloc[256];
158 static  if_com_free_t *if_com_free[256];
159 
160 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
161 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
162 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
163 
164 int			ifqmaxlen = IFQ_MAXLEN;
165 struct ifnethead	ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
166 
167 static struct ifnet_array	ifnet_array0;
168 static struct ifnet_array	*ifnet_array = &ifnet_array0;
169 
170 static struct callout		if_slowtimo_timer;
171 static struct netmsg_base	if_slowtimo_netmsg;
172 
173 int			if_index = 0;
174 struct ifnet		**ifindex2ifnet = NULL;
175 static struct mtx	ifnet_mtx = MTX_INITIALIZER("ifnet");
176 
177 static struct ifsubq_stage_head	ifsubq_stage_heads[MAXCPU];
178 
179 #ifdef notyet
180 #define IFQ_KTR_STRING		"ifq=%p"
181 #define IFQ_KTR_ARGS	struct ifaltq *ifq
182 #ifndef KTR_IFQ
183 #define KTR_IFQ			KTR_ALL
184 #endif
185 KTR_INFO_MASTER(ifq);
186 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
187 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
188 #define logifq(name, arg)	KTR_LOG(ifq_ ## name, arg)
189 
190 #define IF_START_KTR_STRING	"ifp=%p"
191 #define IF_START_KTR_ARGS	struct ifnet *ifp
192 #ifndef KTR_IF_START
193 #define KTR_IF_START		KTR_ALL
194 #endif
195 KTR_INFO_MASTER(if_start);
196 KTR_INFO(KTR_IF_START, if_start, run, 0,
197 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
198 KTR_INFO(KTR_IF_START, if_start, sched, 1,
199 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
200 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
201 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
202 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
203 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
204 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
205 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
206 #define logifstart(name, arg)	KTR_LOG(if_start_ ## name, arg)
207 #endif
208 
209 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
210 
211 /*
212  * Network interface utility routines.
213  *
214  * Routines with ifa_ifwith* names take sockaddr *'s as
215  * parameters.
216  */
217 /* ARGSUSED*/
218 static void
219 ifinit(void *dummy)
220 {
221 	struct ifnet *ifp;
222 
223 	callout_init_mp(&if_slowtimo_timer);
224 	netmsg_init(&if_slowtimo_netmsg, NULL, &netisr_adone_rport,
225 	    MSGF_PRIORITY, if_slowtimo_dispatch);
226 
227 	/* XXX is this necessary? */
228 	ifnet_lock();
229 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
230 		if (ifp->if_snd.altq_maxlen == 0) {
231 			if_printf(ifp, "XXX: driver didn't set altq_maxlen\n");
232 			ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
233 		}
234 	}
235 	ifnet_unlock();
236 
237 	/* Start if_slowtimo */
238 	lwkt_sendmsg(netisr_cpuport(0), &if_slowtimo_netmsg.lmsg);
239 }
240 
241 static void
242 ifsq_ifstart_ipifunc(void *arg)
243 {
244 	struct ifaltq_subque *ifsq = arg;
245 	struct lwkt_msg *lmsg = ifsq_get_ifstart_lmsg(ifsq, mycpuid);
246 
247 	crit_enter();
248 	if (lmsg->ms_flags & MSGF_DONE)
249 		lwkt_sendmsg_oncpu(netisr_cpuport(mycpuid), lmsg);
250 	crit_exit();
251 }
252 
253 static __inline void
254 ifsq_stage_remove(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
255 {
256 	KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
257 	TAILQ_REMOVE(&head->stg_head, stage, stg_link);
258 	stage->stg_flags &= ~(IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED);
259 	stage->stg_cnt = 0;
260 	stage->stg_len = 0;
261 }
262 
263 static __inline void
264 ifsq_stage_insert(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
265 {
266 	KKASSERT((stage->stg_flags &
267 	    (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
268 	stage->stg_flags |= IFSQ_STAGE_FLAG_QUED;
269 	TAILQ_INSERT_TAIL(&head->stg_head, stage, stg_link);
270 }
271 
272 /*
273  * Schedule ifnet.if_start on the subqueue owner CPU
274  */
275 static void
276 ifsq_ifstart_schedule(struct ifaltq_subque *ifsq, int force)
277 {
278 	int cpu;
279 
280 	if (!force && curthread->td_type == TD_TYPE_NETISR &&
281 	    ifsq_stage_cntmax > 0) {
282 		struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
283 
284 		stage->stg_cnt = 0;
285 		stage->stg_len = 0;
286 		if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
287 			ifsq_stage_insert(&ifsubq_stage_heads[mycpuid], stage);
288 		stage->stg_flags |= IFSQ_STAGE_FLAG_SCHED;
289 		return;
290 	}
291 
292 	cpu = ifsq_get_cpuid(ifsq);
293 	if (cpu != mycpuid)
294 		lwkt_send_ipiq(globaldata_find(cpu), ifsq_ifstart_ipifunc, ifsq);
295 	else
296 		ifsq_ifstart_ipifunc(ifsq);
297 }
298 
299 /*
300  * NOTE:
301  * This function will release ifnet.if_start subqueue interlock,
302  * if ifnet.if_start for the subqueue does not need to be scheduled
303  */
304 static __inline int
305 ifsq_ifstart_need_schedule(struct ifaltq_subque *ifsq, int running)
306 {
307 	if (!running || ifsq_is_empty(ifsq)
308 #ifdef ALTQ
309 	    || ifsq->ifsq_altq->altq_tbr != NULL
310 #endif
311 	) {
312 		ALTQ_SQ_LOCK(ifsq);
313 		/*
314 		 * ifnet.if_start subqueue interlock is released, if:
315 		 * 1) Hardware can not take any packets, due to
316 		 *    o  interface is marked down
317 		 *    o  hardware queue is full (ifsq_is_oactive)
318 		 *    Under the second situation, hardware interrupt
319 		 *    or polling(4) will call/schedule ifnet.if_start
320 		 *    on the subqueue when hardware queue is ready
321 		 * 2) There is no packet in the subqueue.
322 		 *    Further ifq_dispatch or ifq_handoff will call/
323 		 *    schedule ifnet.if_start on the subqueue.
324 		 * 3) TBR is used and it does not allow further
325 		 *    dequeueing.
326 		 *    TBR callout will call ifnet.if_start on the
327 		 *    subqueue.
328 		 */
329 		if (!running || !ifsq_data_ready(ifsq)) {
330 			ifsq_clr_started(ifsq);
331 			ALTQ_SQ_UNLOCK(ifsq);
332 			return 0;
333 		}
334 		ALTQ_SQ_UNLOCK(ifsq);
335 	}
336 	return 1;
337 }
338 
339 static void
340 ifsq_ifstart_dispatch(netmsg_t msg)
341 {
342 	struct lwkt_msg *lmsg = &msg->base.lmsg;
343 	struct ifaltq_subque *ifsq = lmsg->u.ms_resultp;
344 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
345 	struct globaldata *gd = mycpu;
346 	int running = 0, need_sched;
347 
348 	crit_enter_gd(gd);
349 
350 	lwkt_replymsg(lmsg, 0);	/* reply ASAP */
351 
352 	if (gd->gd_cpuid != ifsq_get_cpuid(ifsq)) {
353 		/*
354 		 * We need to chase the subqueue owner CPU change.
355 		 */
356 		ifsq_ifstart_schedule(ifsq, 1);
357 		crit_exit_gd(gd);
358 		return;
359 	}
360 
361 	ifsq_serialize_hw(ifsq);
362 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
363 		ifp->if_start(ifp, ifsq);
364 		if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
365 			running = 1;
366 	}
367 	need_sched = ifsq_ifstart_need_schedule(ifsq, running);
368 	ifsq_deserialize_hw(ifsq);
369 
370 	if (need_sched) {
371 		/*
372 		 * More data need to be transmitted, ifnet.if_start is
373 		 * scheduled on the subqueue owner CPU, and we keep going.
374 		 * NOTE: ifnet.if_start subqueue interlock is not released.
375 		 */
376 		ifsq_ifstart_schedule(ifsq, 0);
377 	}
378 
379 	crit_exit_gd(gd);
380 }
381 
382 /* Device driver ifnet.if_start helper function */
383 void
384 ifsq_devstart(struct ifaltq_subque *ifsq)
385 {
386 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
387 	int running = 0;
388 
389 	ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq);
390 
391 	ALTQ_SQ_LOCK(ifsq);
392 	if (ifsq_is_started(ifsq) || !ifsq_data_ready(ifsq)) {
393 		ALTQ_SQ_UNLOCK(ifsq);
394 		return;
395 	}
396 	ifsq_set_started(ifsq);
397 	ALTQ_SQ_UNLOCK(ifsq);
398 
399 	ifp->if_start(ifp, ifsq);
400 
401 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
402 		running = 1;
403 
404 	if (ifsq_ifstart_need_schedule(ifsq, running)) {
405 		/*
406 		 * More data need to be transmitted, ifnet.if_start is
407 		 * scheduled on ifnet's CPU, and we keep going.
408 		 * NOTE: ifnet.if_start interlock is not released.
409 		 */
410 		ifsq_ifstart_schedule(ifsq, 0);
411 	}
412 }
413 
414 void
415 if_devstart(struct ifnet *ifp)
416 {
417 	ifsq_devstart(ifq_get_subq_default(&ifp->if_snd));
418 }
419 
420 /* Device driver ifnet.if_start schedule helper function */
421 void
422 ifsq_devstart_sched(struct ifaltq_subque *ifsq)
423 {
424 	ifsq_ifstart_schedule(ifsq, 1);
425 }
426 
427 void
428 if_devstart_sched(struct ifnet *ifp)
429 {
430 	ifsq_devstart_sched(ifq_get_subq_default(&ifp->if_snd));
431 }
432 
433 static void
434 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
435 {
436 	lwkt_serialize_enter(ifp->if_serializer);
437 }
438 
439 static void
440 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
441 {
442 	lwkt_serialize_exit(ifp->if_serializer);
443 }
444 
445 static int
446 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
447 {
448 	return lwkt_serialize_try(ifp->if_serializer);
449 }
450 
451 #ifdef INVARIANTS
452 static void
453 if_default_serialize_assert(struct ifnet *ifp,
454 			    enum ifnet_serialize slz __unused,
455 			    boolean_t serialized)
456 {
457 	if (serialized)
458 		ASSERT_SERIALIZED(ifp->if_serializer);
459 	else
460 		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
461 }
462 #endif
463 
464 /*
465  * Attach an interface to the list of "active" interfaces.
466  *
467  * The serializer is optional.
468  */
469 void
470 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
471 {
472 	unsigned socksize;
473 	int namelen, masklen;
474 	struct sockaddr_dl *sdl, *sdl_addr;
475 	struct ifaddr *ifa;
476 	struct ifaltq *ifq;
477 	struct ifnet **old_ifindex2ifnet = NULL;
478 	struct ifnet_array *old_ifnet_array;
479 	int i, q;
480 
481 	static int if_indexlim = 8;
482 
483 	if (ifp->if_serialize != NULL) {
484 		KASSERT(ifp->if_deserialize != NULL &&
485 			ifp->if_tryserialize != NULL &&
486 			ifp->if_serialize_assert != NULL,
487 			("serialize functions are partially setup"));
488 
489 		/*
490 		 * If the device supplies serialize functions,
491 		 * then clear if_serializer to catch any invalid
492 		 * usage of this field.
493 		 */
494 		KASSERT(serializer == NULL,
495 			("both serialize functions and default serializer "
496 			 "are supplied"));
497 		ifp->if_serializer = NULL;
498 	} else {
499 		KASSERT(ifp->if_deserialize == NULL &&
500 			ifp->if_tryserialize == NULL &&
501 			ifp->if_serialize_assert == NULL,
502 			("serialize functions are partially setup"));
503 		ifp->if_serialize = if_default_serialize;
504 		ifp->if_deserialize = if_default_deserialize;
505 		ifp->if_tryserialize = if_default_tryserialize;
506 #ifdef INVARIANTS
507 		ifp->if_serialize_assert = if_default_serialize_assert;
508 #endif
509 
510 		/*
511 		 * The serializer can be passed in from the device,
512 		 * allowing the same serializer to be used for both
513 		 * the interrupt interlock and the device queue.
514 		 * If not specified, the netif structure will use an
515 		 * embedded serializer.
516 		 */
517 		if (serializer == NULL) {
518 			serializer = &ifp->if_default_serializer;
519 			lwkt_serialize_init(serializer);
520 		}
521 		ifp->if_serializer = serializer;
522 	}
523 
524 	/*
525 	 * XXX -
526 	 * The old code would work if the interface passed a pre-existing
527 	 * chain of ifaddrs to this code.  We don't trust our callers to
528 	 * properly initialize the tailq, however, so we no longer allow
529 	 * this unlikely case.
530 	 */
531 	ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
532 				    M_IFADDR, M_WAITOK | M_ZERO);
533 	for (i = 0; i < ncpus; ++i)
534 		TAILQ_INIT(&ifp->if_addrheads[i]);
535 
536 	TAILQ_INIT(&ifp->if_multiaddrs);
537 	TAILQ_INIT(&ifp->if_groups);
538 	getmicrotime(&ifp->if_lastchange);
539 
540 	/*
541 	 * create a Link Level name for this device
542 	 */
543 	namelen = strlen(ifp->if_xname);
544 	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
545 	socksize = masklen + ifp->if_addrlen;
546 	if (socksize < sizeof(*sdl))
547 		socksize = sizeof(*sdl);
548 	socksize = RT_ROUNDUP(socksize);
549 	ifa = ifa_create(sizeof(struct ifaddr) + 2 * socksize);
550 	sdl = sdl_addr = (struct sockaddr_dl *)(ifa + 1);
551 	sdl->sdl_len = socksize;
552 	sdl->sdl_family = AF_LINK;
553 	bcopy(ifp->if_xname, sdl->sdl_data, namelen);
554 	sdl->sdl_nlen = namelen;
555 	sdl->sdl_type = ifp->if_type;
556 	ifp->if_lladdr = ifa;
557 	ifa->ifa_ifp = ifp;
558 	ifa->ifa_rtrequest = link_rtrequest;
559 	ifa->ifa_addr = (struct sockaddr *)sdl;
560 	sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
561 	ifa->ifa_netmask = (struct sockaddr *)sdl;
562 	sdl->sdl_len = masklen;
563 	while (namelen != 0)
564 		sdl->sdl_data[--namelen] = 0xff;
565 	ifa_iflink(ifa, ifp, 0 /* Insert head */);
566 
567 	ifp->if_data_pcpu = kmalloc_cachealign(
568 	    ncpus * sizeof(struct ifdata_pcpu), M_DEVBUF, M_WAITOK | M_ZERO);
569 
570 	if (ifp->if_mapsubq == NULL)
571 		ifp->if_mapsubq = ifq_mapsubq_default;
572 
573 	ifq = &ifp->if_snd;
574 	ifq->altq_type = 0;
575 	ifq->altq_disc = NULL;
576 	ifq->altq_flags &= ALTQF_CANTCHANGE;
577 	ifq->altq_tbr = NULL;
578 	ifq->altq_ifp = ifp;
579 
580 	if (ifq->altq_subq_cnt <= 0)
581 		ifq->altq_subq_cnt = 1;
582 	ifq->altq_subq = kmalloc_cachealign(
583 	    ifq->altq_subq_cnt * sizeof(struct ifaltq_subque),
584 	    M_DEVBUF, M_WAITOK | M_ZERO);
585 
586 	if (ifq->altq_maxlen == 0) {
587 		if_printf(ifp, "driver didn't set altq_maxlen\n");
588 		ifq_set_maxlen(ifq, ifqmaxlen);
589 	}
590 
591 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
592 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
593 
594 		ALTQ_SQ_LOCK_INIT(ifsq);
595 		ifsq->ifsq_index = q;
596 
597 		ifsq->ifsq_altq = ifq;
598 		ifsq->ifsq_ifp = ifp;
599 
600 		ifsq->ifsq_maxlen = ifq->altq_maxlen;
601 		ifsq->ifsq_maxbcnt = ifsq->ifsq_maxlen * MCLBYTES;
602 		ifsq->ifsq_prepended = NULL;
603 		ifsq->ifsq_started = 0;
604 		ifsq->ifsq_hw_oactive = 0;
605 		ifsq_set_cpuid(ifsq, 0);
606 		if (ifp->if_serializer != NULL)
607 			ifsq_set_hw_serialize(ifsq, ifp->if_serializer);
608 
609 		ifsq->ifsq_stage =
610 		    kmalloc_cachealign(ncpus * sizeof(struct ifsubq_stage),
611 		    M_DEVBUF, M_WAITOK | M_ZERO);
612 		for (i = 0; i < ncpus; ++i)
613 			ifsq->ifsq_stage[i].stg_subq = ifsq;
614 
615 		ifsq->ifsq_ifstart_nmsg =
616 		    kmalloc(ncpus * sizeof(struct netmsg_base),
617 		    M_LWKTMSG, M_WAITOK);
618 		for (i = 0; i < ncpus; ++i) {
619 			netmsg_init(&ifsq->ifsq_ifstart_nmsg[i], NULL,
620 			    &netisr_adone_rport, 0, ifsq_ifstart_dispatch);
621 			ifsq->ifsq_ifstart_nmsg[i].lmsg.u.ms_resultp = ifsq;
622 		}
623 	}
624 	ifq_set_classic(ifq);
625 
626 	/*
627 	 * Increase mbuf cluster/jcluster limits for the mbufs that
628 	 * could sit on the device queues for quite some time.
629 	 */
630 	if (ifp->if_nmbclusters > 0)
631 		mcl_inclimit(ifp->if_nmbclusters);
632 	if (ifp->if_nmbjclusters > 0)
633 		mjcl_inclimit(ifp->if_nmbjclusters);
634 
635 	/*
636 	 * Install this ifp into ifindex2inet, ifnet queue and ifnet
637 	 * array after it is setup.
638 	 *
639 	 * Protect ifindex2ifnet, ifnet queue and ifnet array changes
640 	 * by ifnet lock, so that non-netisr threads could get a
641 	 * consistent view.
642 	 */
643 	ifnet_lock();
644 
645 	/* Don't update if_index until ifindex2ifnet is setup */
646 	ifp->if_index = if_index + 1;
647 	sdl_addr->sdl_index = ifp->if_index;
648 
649 	/*
650 	 * Install this ifp into ifindex2ifnet
651 	 */
652 	if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) {
653 		unsigned int n;
654 		struct ifnet **q;
655 
656 		/*
657 		 * Grow ifindex2ifnet
658 		 */
659 		if_indexlim <<= 1;
660 		n = if_indexlim * sizeof(*q);
661 		q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
662 		if (ifindex2ifnet != NULL) {
663 			bcopy(ifindex2ifnet, q, n/2);
664 			/* Free old ifindex2ifnet after sync all netisrs */
665 			old_ifindex2ifnet = ifindex2ifnet;
666 		}
667 		ifindex2ifnet = q;
668 	}
669 	ifindex2ifnet[ifp->if_index] = ifp;
670 	/*
671 	 * Update if_index after this ifp is installed into ifindex2ifnet,
672 	 * so that netisrs could get a consistent view of ifindex2ifnet.
673 	 */
674 	cpu_sfence();
675 	if_index = ifp->if_index;
676 
677 	/*
678 	 * Install this ifp into ifnet array.
679 	 */
680 	/* Free old ifnet array after sync all netisrs */
681 	old_ifnet_array = ifnet_array;
682 	ifnet_array = ifnet_array_add(ifp, old_ifnet_array);
683 
684 	/*
685 	 * Install this ifp into ifnet queue.
686 	 */
687 	TAILQ_INSERT_TAIL(&ifnetlist, ifp, if_link);
688 
689 	ifnet_unlock();
690 
691 	/*
692 	 * Sync all netisrs so that the old ifindex2ifnet and ifnet array
693 	 * are no longer accessed and we can free them safely later on.
694 	 */
695 	netmsg_service_sync();
696 	if (old_ifindex2ifnet != NULL)
697 		kfree(old_ifindex2ifnet, M_IFADDR);
698 	ifnet_array_free(old_ifnet_array);
699 
700 	if (!SLIST_EMPTY(&domains))
701 		if_attachdomain1(ifp);
702 
703 	/* Announce the interface. */
704 	EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
705 	devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
706 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
707 }
708 
709 static void
710 if_attachdomain(void *dummy)
711 {
712 	struct ifnet *ifp;
713 
714 	ifnet_lock();
715 	TAILQ_FOREACH(ifp, &ifnetlist, if_list)
716 		if_attachdomain1(ifp);
717 	ifnet_unlock();
718 }
719 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
720 	if_attachdomain, NULL);
721 
722 static void
723 if_attachdomain1(struct ifnet *ifp)
724 {
725 	struct domain *dp;
726 
727 	crit_enter();
728 
729 	/* address family dependent data region */
730 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
731 	SLIST_FOREACH(dp, &domains, dom_next)
732 		if (dp->dom_ifattach)
733 			ifp->if_afdata[dp->dom_family] =
734 				(*dp->dom_ifattach)(ifp);
735 	crit_exit();
736 }
737 
738 /*
739  * Purge all addresses whose type is _not_ AF_LINK
740  */
741 static void
742 if_purgeaddrs_nolink_dispatch(netmsg_t nmsg)
743 {
744 	struct lwkt_msg *lmsg = &nmsg->lmsg;
745 	struct ifnet *ifp = lmsg->u.ms_resultp;
746 	struct ifaddr_container *ifac, *next;
747 
748 	ASSERT_IN_NETISR(0);
749 
750 	/*
751 	 * The ifaddr processing in the following loop will block,
752 	 * however, this function is called in netisr0, in which
753 	 * ifaddr list changes happen, so we don't care about the
754 	 * blockness of the ifaddr processing here.
755 	 */
756 	TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
757 			      ifa_link, next) {
758 		struct ifaddr *ifa = ifac->ifa;
759 
760 		/* Ignore marker */
761 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
762 			continue;
763 
764 		/* Leave link ifaddr as it is */
765 		if (ifa->ifa_addr->sa_family == AF_LINK)
766 			continue;
767 #ifdef INET
768 		/* XXX: Ugly!! ad hoc just for INET */
769 		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
770 			struct ifaliasreq ifr;
771 #ifdef IFADDR_DEBUG_VERBOSE
772 			int i;
773 
774 			kprintf("purge in4 addr %p: ", ifa);
775 			for (i = 0; i < ncpus; ++i)
776 				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
777 			kprintf("\n");
778 #endif
779 
780 			bzero(&ifr, sizeof ifr);
781 			ifr.ifra_addr = *ifa->ifa_addr;
782 			if (ifa->ifa_dstaddr)
783 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
784 			if (in_control(SIOCDIFADDR, (caddr_t)&ifr, ifp,
785 				       NULL) == 0)
786 				continue;
787 		}
788 #endif /* INET */
789 #ifdef INET6
790 		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
791 #ifdef IFADDR_DEBUG_VERBOSE
792 			int i;
793 
794 			kprintf("purge in6 addr %p: ", ifa);
795 			for (i = 0; i < ncpus; ++i)
796 				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
797 			kprintf("\n");
798 #endif
799 
800 			in6_purgeaddr(ifa);
801 			/* ifp_addrhead is already updated */
802 			continue;
803 		}
804 #endif /* INET6 */
805 		ifa_ifunlink(ifa, ifp);
806 		ifa_destroy(ifa);
807 	}
808 
809 	lwkt_replymsg(lmsg, 0);
810 }
811 
812 void
813 if_purgeaddrs_nolink(struct ifnet *ifp)
814 {
815 	struct netmsg_base nmsg;
816 	struct lwkt_msg *lmsg = &nmsg.lmsg;
817 
818 	ASSERT_CANDOMSG_NETISR0(curthread);
819 
820 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, 0,
821 	    if_purgeaddrs_nolink_dispatch);
822 	lmsg->u.ms_resultp = ifp;
823 	lwkt_domsg(netisr_cpuport(0), lmsg, 0);
824 }
825 
826 static void
827 ifq_stage_detach_handler(netmsg_t nmsg)
828 {
829 	struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
830 	int q;
831 
832 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
833 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
834 		struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
835 
836 		if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED)
837 			ifsq_stage_remove(&ifsubq_stage_heads[mycpuid], stage);
838 	}
839 	lwkt_replymsg(&nmsg->lmsg, 0);
840 }
841 
842 static void
843 ifq_stage_detach(struct ifaltq *ifq)
844 {
845 	struct netmsg_base base;
846 	int cpu;
847 
848 	netmsg_init(&base, NULL, &curthread->td_msgport, 0,
849 	    ifq_stage_detach_handler);
850 	base.lmsg.u.ms_resultp = ifq;
851 
852 	for (cpu = 0; cpu < ncpus; ++cpu)
853 		lwkt_domsg(netisr_cpuport(cpu), &base.lmsg, 0);
854 }
855 
856 struct netmsg_if_rtdel {
857 	struct netmsg_base	base;
858 	struct ifnet		*ifp;
859 };
860 
861 static void
862 if_rtdel_dispatch(netmsg_t msg)
863 {
864 	struct netmsg_if_rtdel *rmsg = (void *)msg;
865 	int i, nextcpu, cpu;
866 
867 	cpu = mycpuid;
868 	for (i = 1; i <= AF_MAX; i++) {
869 		struct radix_node_head	*rnh;
870 
871 		if ((rnh = rt_tables[cpu][i]) == NULL)
872 			continue;
873 		rnh->rnh_walktree(rnh, if_rtdel, rmsg->ifp);
874 	}
875 
876 	nextcpu = cpu + 1;
877 	if (nextcpu < ncpus)
878 		lwkt_forwardmsg(netisr_cpuport(nextcpu), &rmsg->base.lmsg);
879 	else
880 		lwkt_replymsg(&rmsg->base.lmsg, 0);
881 }
882 
883 /*
884  * Detach an interface, removing it from the
885  * list of "active" interfaces.
886  */
887 void
888 if_detach(struct ifnet *ifp)
889 {
890 	struct ifnet_array *old_ifnet_array;
891 	struct netmsg_if_rtdel msg;
892 	struct domain *dp;
893 	int q;
894 
895 	/* Announce that the interface is gone. */
896 	EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
897 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
898 	devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
899 
900 	/*
901 	 * Remove this ifp from ifindex2inet, ifnet queue and ifnet
902 	 * array before it is whacked.
903 	 *
904 	 * Protect ifindex2ifnet, ifnet queue and ifnet array changes
905 	 * by ifnet lock, so that non-netisr threads could get a
906 	 * consistent view.
907 	 */
908 	ifnet_lock();
909 
910 	/*
911 	 * Remove this ifp from ifindex2ifnet and maybe decrement if_index.
912 	 */
913 	ifindex2ifnet[ifp->if_index] = NULL;
914 	while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
915 		if_index--;
916 
917 	/*
918 	 * Remove this ifp from ifnet queue.
919 	 */
920 	TAILQ_REMOVE(&ifnetlist, ifp, if_link);
921 
922 	/*
923 	 * Remove this ifp from ifnet array.
924 	 */
925 	/* Free old ifnet array after sync all netisrs */
926 	old_ifnet_array = ifnet_array;
927 	ifnet_array = ifnet_array_del(ifp, old_ifnet_array);
928 
929 	ifnet_unlock();
930 
931 	/*
932 	 * Sync all netisrs so that the old ifnet array is no longer
933 	 * accessed and we can free it safely later on.
934 	 */
935 	netmsg_service_sync();
936 	ifnet_array_free(old_ifnet_array);
937 
938 	/*
939 	 * Remove routes and flush queues.
940 	 */
941 	crit_enter();
942 #ifdef IFPOLL_ENABLE
943 	if (ifp->if_flags & IFF_NPOLLING)
944 		ifpoll_deregister(ifp);
945 #endif
946 	if_down(ifp);
947 
948 	/* Decrease the mbuf clusters/jclusters limits increased by us */
949 	if (ifp->if_nmbclusters > 0)
950 		mcl_inclimit(-ifp->if_nmbclusters);
951 	if (ifp->if_nmbjclusters > 0)
952 		mjcl_inclimit(-ifp->if_nmbjclusters);
953 
954 #ifdef ALTQ
955 	if (ifq_is_enabled(&ifp->if_snd))
956 		altq_disable(&ifp->if_snd);
957 	if (ifq_is_attached(&ifp->if_snd))
958 		altq_detach(&ifp->if_snd);
959 #endif
960 
961 	/*
962 	 * Clean up all addresses.
963 	 */
964 	ifp->if_lladdr = NULL;
965 
966 	if_purgeaddrs_nolink(ifp);
967 	if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
968 		struct ifaddr *ifa;
969 
970 		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
971 		KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
972 			("non-link ifaddr is left on if_addrheads"));
973 
974 		ifa_ifunlink(ifa, ifp);
975 		ifa_destroy(ifa);
976 		KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
977 			("there are still ifaddrs left on if_addrheads"));
978 	}
979 
980 #ifdef INET
981 	/*
982 	 * Remove all IPv4 kernel structures related to ifp.
983 	 */
984 	in_ifdetach(ifp);
985 #endif
986 
987 #ifdef INET6
988 	/*
989 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
990 	 * before removing routing entries below, since IPv6 interface direct
991 	 * routes are expected to be removed by the IPv6-specific kernel API.
992 	 * Otherwise, the kernel will detect some inconsistency and bark it.
993 	 */
994 	in6_ifdetach(ifp);
995 #endif
996 
997 	/*
998 	 * Delete all remaining routes using this interface
999 	 */
1000 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
1001 	    if_rtdel_dispatch);
1002 	msg.ifp = ifp;
1003 	rt_domsg_global(&msg.base);
1004 
1005 	SLIST_FOREACH(dp, &domains, dom_next)
1006 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
1007 			(*dp->dom_ifdetach)(ifp,
1008 				ifp->if_afdata[dp->dom_family]);
1009 
1010 	kfree(ifp->if_addrheads, M_IFADDR);
1011 
1012 	lwkt_synchronize_ipiqs("if_detach");
1013 	ifq_stage_detach(&ifp->if_snd);
1014 
1015 	for (q = 0; q < ifp->if_snd.altq_subq_cnt; ++q) {
1016 		struct ifaltq_subque *ifsq = &ifp->if_snd.altq_subq[q];
1017 
1018 		kfree(ifsq->ifsq_ifstart_nmsg, M_LWKTMSG);
1019 		kfree(ifsq->ifsq_stage, M_DEVBUF);
1020 	}
1021 	kfree(ifp->if_snd.altq_subq, M_DEVBUF);
1022 
1023 	kfree(ifp->if_data_pcpu, M_DEVBUF);
1024 
1025 	crit_exit();
1026 }
1027 
1028 /*
1029  * Create interface group without members
1030  */
1031 struct ifg_group *
1032 if_creategroup(const char *groupname)
1033 {
1034         struct ifg_group        *ifg = NULL;
1035 
1036         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
1037             M_TEMP, M_NOWAIT)) == NULL)
1038                 return (NULL);
1039 
1040         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
1041         ifg->ifg_refcnt = 0;
1042         ifg->ifg_carp_demoted = 0;
1043         TAILQ_INIT(&ifg->ifg_members);
1044 #if NPF > 0
1045         pfi_attach_ifgroup(ifg);
1046 #endif
1047         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
1048 
1049         return (ifg);
1050 }
1051 
1052 /*
1053  * Add a group to an interface
1054  */
1055 int
1056 if_addgroup(struct ifnet *ifp, const char *groupname)
1057 {
1058 	struct ifg_list		*ifgl;
1059 	struct ifg_group	*ifg = NULL;
1060 	struct ifg_member	*ifgm;
1061 
1062 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
1063 	    groupname[strlen(groupname) - 1] <= '9')
1064 		return (EINVAL);
1065 
1066 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1067 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
1068 			return (EEXIST);
1069 
1070 	if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
1071 		return (ENOMEM);
1072 
1073 	if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
1074 		kfree(ifgl, M_TEMP);
1075 		return (ENOMEM);
1076 	}
1077 
1078 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1079 		if (!strcmp(ifg->ifg_group, groupname))
1080 			break;
1081 
1082 	if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
1083 		kfree(ifgl, M_TEMP);
1084 		kfree(ifgm, M_TEMP);
1085 		return (ENOMEM);
1086 	}
1087 
1088 	ifg->ifg_refcnt++;
1089 	ifgl->ifgl_group = ifg;
1090 	ifgm->ifgm_ifp = ifp;
1091 
1092 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
1093 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
1094 
1095 #if NPF > 0
1096 	pfi_group_change(groupname);
1097 #endif
1098 
1099 	return (0);
1100 }
1101 
1102 /*
1103  * Remove a group from an interface
1104  */
1105 int
1106 if_delgroup(struct ifnet *ifp, const char *groupname)
1107 {
1108 	struct ifg_list		*ifgl;
1109 	struct ifg_member	*ifgm;
1110 
1111 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1112 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
1113 			break;
1114 	if (ifgl == NULL)
1115 		return (ENOENT);
1116 
1117 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
1118 
1119 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
1120 		if (ifgm->ifgm_ifp == ifp)
1121 			break;
1122 
1123 	if (ifgm != NULL) {
1124 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
1125 		kfree(ifgm, M_TEMP);
1126 	}
1127 
1128 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
1129 		TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
1130 #if NPF > 0
1131 		pfi_detach_ifgroup(ifgl->ifgl_group);
1132 #endif
1133 		kfree(ifgl->ifgl_group, M_TEMP);
1134 	}
1135 
1136 	kfree(ifgl, M_TEMP);
1137 
1138 #if NPF > 0
1139 	pfi_group_change(groupname);
1140 #endif
1141 
1142 	return (0);
1143 }
1144 
1145 /*
1146  * Stores all groups from an interface in memory pointed
1147  * to by data
1148  */
1149 int
1150 if_getgroup(caddr_t data, struct ifnet *ifp)
1151 {
1152 	int			 len, error;
1153 	struct ifg_list		*ifgl;
1154 	struct ifg_req		 ifgrq, *ifgp;
1155 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1156 
1157 	if (ifgr->ifgr_len == 0) {
1158 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1159 			ifgr->ifgr_len += sizeof(struct ifg_req);
1160 		return (0);
1161 	}
1162 
1163 	len = ifgr->ifgr_len;
1164 	ifgp = ifgr->ifgr_groups;
1165 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1166 		if (len < sizeof(ifgrq))
1167 			return (EINVAL);
1168 		bzero(&ifgrq, sizeof ifgrq);
1169 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1170 		    sizeof(ifgrq.ifgrq_group));
1171 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1172 		    sizeof(struct ifg_req))))
1173 			return (error);
1174 		len -= sizeof(ifgrq);
1175 		ifgp++;
1176 	}
1177 
1178 	return (0);
1179 }
1180 
1181 /*
1182  * Stores all members of a group in memory pointed to by data
1183  */
1184 int
1185 if_getgroupmembers(caddr_t data)
1186 {
1187 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1188 	struct ifg_group	*ifg;
1189 	struct ifg_member	*ifgm;
1190 	struct ifg_req		 ifgrq, *ifgp;
1191 	int			 len, error;
1192 
1193 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1194 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1195 			break;
1196 	if (ifg == NULL)
1197 		return (ENOENT);
1198 
1199 	if (ifgr->ifgr_len == 0) {
1200 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1201 			ifgr->ifgr_len += sizeof(ifgrq);
1202 		return (0);
1203 	}
1204 
1205 	len = ifgr->ifgr_len;
1206 	ifgp = ifgr->ifgr_groups;
1207 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1208 		if (len < sizeof(ifgrq))
1209 			return (EINVAL);
1210 		bzero(&ifgrq, sizeof ifgrq);
1211 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1212 		    sizeof(ifgrq.ifgrq_member));
1213 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1214 		    sizeof(struct ifg_req))))
1215 			return (error);
1216 		len -= sizeof(ifgrq);
1217 		ifgp++;
1218 	}
1219 
1220 	return (0);
1221 }
1222 
1223 /*
1224  * Delete Routes for a Network Interface
1225  *
1226  * Called for each routing entry via the rnh->rnh_walktree() call above
1227  * to delete all route entries referencing a detaching network interface.
1228  *
1229  * Arguments:
1230  *	rn	pointer to node in the routing table
1231  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
1232  *
1233  * Returns:
1234  *	0	successful
1235  *	errno	failed - reason indicated
1236  *
1237  */
1238 static int
1239 if_rtdel(struct radix_node *rn, void *arg)
1240 {
1241 	struct rtentry	*rt = (struct rtentry *)rn;
1242 	struct ifnet	*ifp = arg;
1243 	int		err;
1244 
1245 	if (rt->rt_ifp == ifp) {
1246 
1247 		/*
1248 		 * Protect (sorta) against walktree recursion problems
1249 		 * with cloned routes
1250 		 */
1251 		if (!(rt->rt_flags & RTF_UP))
1252 			return (0);
1253 
1254 		err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1255 				rt_mask(rt), rt->rt_flags,
1256 				NULL);
1257 		if (err) {
1258 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
1259 		}
1260 	}
1261 
1262 	return (0);
1263 }
1264 
1265 static __inline boolean_t
1266 ifa_prefer(const struct ifaddr *cur_ifa, const struct ifaddr *old_ifa)
1267 {
1268 	if (old_ifa == NULL)
1269 		return TRUE;
1270 
1271 	if ((old_ifa->ifa_ifp->if_flags & IFF_UP) == 0 &&
1272 	    (cur_ifa->ifa_ifp->if_flags & IFF_UP))
1273 		return TRUE;
1274 	if ((old_ifa->ifa_flags & IFA_ROUTE) == 0 &&
1275 	    (cur_ifa->ifa_flags & IFA_ROUTE))
1276 		return TRUE;
1277 	return FALSE;
1278 }
1279 
1280 /*
1281  * Locate an interface based on a complete address.
1282  */
1283 struct ifaddr *
1284 ifa_ifwithaddr(struct sockaddr *addr)
1285 {
1286 	const struct ifnet_array *arr;
1287 	int i;
1288 
1289 	arr = ifnet_array_get();
1290 	for (i = 0; i < arr->ifnet_count; ++i) {
1291 		struct ifnet *ifp = arr->ifnet_arr[i];
1292 		struct ifaddr_container *ifac;
1293 
1294 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1295 			struct ifaddr *ifa = ifac->ifa;
1296 
1297 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1298 				continue;
1299 			if (sa_equal(addr, ifa->ifa_addr))
1300 				return (ifa);
1301 			if ((ifp->if_flags & IFF_BROADCAST) &&
1302 			    ifa->ifa_broadaddr &&
1303 			    /* IPv6 doesn't have broadcast */
1304 			    ifa->ifa_broadaddr->sa_len != 0 &&
1305 			    sa_equal(ifa->ifa_broadaddr, addr))
1306 				return (ifa);
1307 		}
1308 	}
1309 	return (NULL);
1310 }
1311 
1312 /*
1313  * Locate the point to point interface with a given destination address.
1314  */
1315 struct ifaddr *
1316 ifa_ifwithdstaddr(struct sockaddr *addr)
1317 {
1318 	const struct ifnet_array *arr;
1319 	int i;
1320 
1321 	arr = ifnet_array_get();
1322 	for (i = 0; i < arr->ifnet_count; ++i) {
1323 		struct ifnet *ifp = arr->ifnet_arr[i];
1324 		struct ifaddr_container *ifac;
1325 
1326 		if (!(ifp->if_flags & IFF_POINTOPOINT))
1327 			continue;
1328 
1329 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1330 			struct ifaddr *ifa = ifac->ifa;
1331 
1332 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1333 				continue;
1334 			if (ifa->ifa_dstaddr &&
1335 			    sa_equal(addr, ifa->ifa_dstaddr))
1336 				return (ifa);
1337 		}
1338 	}
1339 	return (NULL);
1340 }
1341 
1342 /*
1343  * Find an interface on a specific network.  If many, choice
1344  * is most specific found.
1345  */
1346 struct ifaddr *
1347 ifa_ifwithnet(struct sockaddr *addr)
1348 {
1349 	struct ifaddr *ifa_maybe = NULL;
1350 	u_int af = addr->sa_family;
1351 	char *addr_data = addr->sa_data, *cplim;
1352 	const struct ifnet_array *arr;
1353 	int i;
1354 
1355 	/*
1356 	 * AF_LINK addresses can be looked up directly by their index number,
1357 	 * so do that if we can.
1358 	 */
1359 	if (af == AF_LINK) {
1360 		struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1361 
1362 		if (sdl->sdl_index && sdl->sdl_index <= if_index)
1363 			return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1364 	}
1365 
1366 	/*
1367 	 * Scan though each interface, looking for ones that have
1368 	 * addresses in this address family.
1369 	 */
1370 	arr = ifnet_array_get();
1371 	for (i = 0; i < arr->ifnet_count; ++i) {
1372 		struct ifnet *ifp = arr->ifnet_arr[i];
1373 		struct ifaddr_container *ifac;
1374 
1375 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1376 			struct ifaddr *ifa = ifac->ifa;
1377 			char *cp, *cp2, *cp3;
1378 
1379 			if (ifa->ifa_addr->sa_family != af)
1380 next:				continue;
1381 			if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1382 				/*
1383 				 * This is a bit broken as it doesn't
1384 				 * take into account that the remote end may
1385 				 * be a single node in the network we are
1386 				 * looking for.
1387 				 * The trouble is that we don't know the
1388 				 * netmask for the remote end.
1389 				 */
1390 				if (ifa->ifa_dstaddr != NULL &&
1391 				    sa_equal(addr, ifa->ifa_dstaddr))
1392 					return (ifa);
1393 			} else {
1394 				/*
1395 				 * if we have a special address handler,
1396 				 * then use it instead of the generic one.
1397 				 */
1398 				if (ifa->ifa_claim_addr) {
1399 					if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1400 						return (ifa);
1401 					} else {
1402 						continue;
1403 					}
1404 				}
1405 
1406 				/*
1407 				 * Scan all the bits in the ifa's address.
1408 				 * If a bit dissagrees with what we are
1409 				 * looking for, mask it with the netmask
1410 				 * to see if it really matters.
1411 				 * (A byte at a time)
1412 				 */
1413 				if (ifa->ifa_netmask == 0)
1414 					continue;
1415 				cp = addr_data;
1416 				cp2 = ifa->ifa_addr->sa_data;
1417 				cp3 = ifa->ifa_netmask->sa_data;
1418 				cplim = ifa->ifa_netmask->sa_len +
1419 					(char *)ifa->ifa_netmask;
1420 				while (cp3 < cplim)
1421 					if ((*cp++ ^ *cp2++) & *cp3++)
1422 						goto next; /* next address! */
1423 				/*
1424 				 * If the netmask of what we just found
1425 				 * is more specific than what we had before
1426 				 * (if we had one) then remember the new one
1427 				 * before continuing to search for an even
1428 				 * better one.  If the netmasks are equal,
1429 				 * we prefer the this ifa based on the result
1430 				 * of ifa_prefer().
1431 				 */
1432 				if (ifa_maybe == NULL ||
1433 				    rn_refines((char *)ifa->ifa_netmask,
1434 				        (char *)ifa_maybe->ifa_netmask) ||
1435 				    (sa_equal(ifa_maybe->ifa_netmask,
1436 				        ifa->ifa_netmask) &&
1437 				     ifa_prefer(ifa, ifa_maybe)))
1438 					ifa_maybe = ifa;
1439 			}
1440 		}
1441 	}
1442 	return (ifa_maybe);
1443 }
1444 
1445 /*
1446  * Find an interface address specific to an interface best matching
1447  * a given address.
1448  */
1449 struct ifaddr *
1450 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1451 {
1452 	struct ifaddr_container *ifac;
1453 	char *cp, *cp2, *cp3;
1454 	char *cplim;
1455 	struct ifaddr *ifa_maybe = NULL;
1456 	u_int af = addr->sa_family;
1457 
1458 	if (af >= AF_MAX)
1459 		return (0);
1460 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1461 		struct ifaddr *ifa = ifac->ifa;
1462 
1463 		if (ifa->ifa_addr->sa_family != af)
1464 			continue;
1465 		if (ifa_maybe == NULL)
1466 			ifa_maybe = ifa;
1467 		if (ifa->ifa_netmask == NULL) {
1468 			if (sa_equal(addr, ifa->ifa_addr) ||
1469 			    (ifa->ifa_dstaddr != NULL &&
1470 			     sa_equal(addr, ifa->ifa_dstaddr)))
1471 				return (ifa);
1472 			continue;
1473 		}
1474 		if (ifp->if_flags & IFF_POINTOPOINT) {
1475 			if (sa_equal(addr, ifa->ifa_dstaddr))
1476 				return (ifa);
1477 		} else {
1478 			cp = addr->sa_data;
1479 			cp2 = ifa->ifa_addr->sa_data;
1480 			cp3 = ifa->ifa_netmask->sa_data;
1481 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1482 			for (; cp3 < cplim; cp3++)
1483 				if ((*cp++ ^ *cp2++) & *cp3)
1484 					break;
1485 			if (cp3 == cplim)
1486 				return (ifa);
1487 		}
1488 	}
1489 	return (ifa_maybe);
1490 }
1491 
1492 /*
1493  * Default action when installing a route with a Link Level gateway.
1494  * Lookup an appropriate real ifa to point to.
1495  * This should be moved to /sys/net/link.c eventually.
1496  */
1497 static void
1498 link_rtrequest(int cmd, struct rtentry *rt)
1499 {
1500 	struct ifaddr *ifa;
1501 	struct sockaddr *dst;
1502 	struct ifnet *ifp;
1503 
1504 	if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1505 	    (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1506 		return;
1507 	ifa = ifaof_ifpforaddr(dst, ifp);
1508 	if (ifa != NULL) {
1509 		IFAFREE(rt->rt_ifa);
1510 		IFAREF(ifa);
1511 		rt->rt_ifa = ifa;
1512 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1513 			ifa->ifa_rtrequest(cmd, rt);
1514 	}
1515 }
1516 
1517 struct netmsg_ifroute {
1518 	struct netmsg_base	base;
1519 	struct ifnet		*ifp;
1520 	int			flag;
1521 	int			fam;
1522 };
1523 
1524 /*
1525  * Mark an interface down and notify protocols of the transition.
1526  */
1527 static void
1528 if_unroute_dispatch(netmsg_t nmsg)
1529 {
1530 	struct netmsg_ifroute *msg = (struct netmsg_ifroute *)nmsg;
1531 	struct ifnet *ifp = msg->ifp;
1532 	int flag = msg->flag, fam = msg->fam;
1533 	struct ifaddr_container *ifac;
1534 
1535 	ifp->if_flags &= ~flag;
1536 	getmicrotime(&ifp->if_lastchange);
1537 	/*
1538 	 * The ifaddr processing in the following loop will block,
1539 	 * however, this function is called in netisr0, in which
1540 	 * ifaddr list changes happen, so we don't care about the
1541 	 * blockness of the ifaddr processing here.
1542 	 */
1543 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1544 		struct ifaddr *ifa = ifac->ifa;
1545 
1546 		/* Ignore marker */
1547 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
1548 			continue;
1549 
1550 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1551 			kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1552 	}
1553 	ifq_purge_all(&ifp->if_snd);
1554 	rt_ifmsg(ifp);
1555 
1556 	lwkt_replymsg(&nmsg->lmsg, 0);
1557 }
1558 
1559 void
1560 if_unroute(struct ifnet *ifp, int flag, int fam)
1561 {
1562 	struct netmsg_ifroute msg;
1563 
1564 	ASSERT_CANDOMSG_NETISR0(curthread);
1565 
1566 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, 0,
1567 	    if_unroute_dispatch);
1568 	msg.ifp = ifp;
1569 	msg.flag = flag;
1570 	msg.fam = fam;
1571 	lwkt_domsg(netisr_cpuport(0), &msg.base.lmsg, 0);
1572 }
1573 
1574 /*
1575  * Mark an interface up and notify protocols of the transition.
1576  */
1577 static void
1578 if_route_dispatch(netmsg_t nmsg)
1579 {
1580 	struct netmsg_ifroute *msg = (struct netmsg_ifroute *)nmsg;
1581 	struct ifnet *ifp = msg->ifp;
1582 	int flag = msg->flag, fam = msg->fam;
1583 	struct ifaddr_container *ifac;
1584 
1585 	ifq_purge_all(&ifp->if_snd);
1586 	ifp->if_flags |= flag;
1587 	getmicrotime(&ifp->if_lastchange);
1588 	/*
1589 	 * The ifaddr processing in the following loop will block,
1590 	 * however, this function is called in netisr0, in which
1591 	 * ifaddr list changes happen, so we don't care about the
1592 	 * blockness of the ifaddr processing here.
1593 	 */
1594 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1595 		struct ifaddr *ifa = ifac->ifa;
1596 
1597 		/* Ignore marker */
1598 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
1599 			continue;
1600 
1601 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1602 			kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1603 	}
1604 	rt_ifmsg(ifp);
1605 #ifdef INET6
1606 	in6_if_up(ifp);
1607 #endif
1608 
1609 	lwkt_replymsg(&nmsg->lmsg, 0);
1610 }
1611 
1612 void
1613 if_route(struct ifnet *ifp, int flag, int fam)
1614 {
1615 	struct netmsg_ifroute msg;
1616 
1617 	ASSERT_CANDOMSG_NETISR0(curthread);
1618 
1619 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, 0,
1620 	    if_route_dispatch);
1621 	msg.ifp = ifp;
1622 	msg.flag = flag;
1623 	msg.fam = fam;
1624 	lwkt_domsg(netisr_cpuport(0), &msg.base.lmsg, 0);
1625 }
1626 
1627 /*
1628  * Mark an interface down and notify protocols of the transition.  An
1629  * interface going down is also considered to be a synchronizing event.
1630  * We must ensure that all packet processing related to the interface
1631  * has completed before we return so e.g. the caller can free the ifnet
1632  * structure that the mbufs may be referencing.
1633  *
1634  * NOTE: must be called at splnet or eqivalent.
1635  */
1636 void
1637 if_down(struct ifnet *ifp)
1638 {
1639 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
1640 	netmsg_service_sync();
1641 }
1642 
1643 /*
1644  * Mark an interface up and notify protocols of
1645  * the transition.
1646  * NOTE: must be called at splnet or eqivalent.
1647  */
1648 void
1649 if_up(struct ifnet *ifp)
1650 {
1651 	if_route(ifp, IFF_UP, AF_UNSPEC);
1652 }
1653 
1654 /*
1655  * Process a link state change.
1656  * NOTE: must be called at splsoftnet or equivalent.
1657  */
1658 void
1659 if_link_state_change(struct ifnet *ifp)
1660 {
1661 	int link_state = ifp->if_link_state;
1662 
1663 	rt_ifmsg(ifp);
1664 	devctl_notify("IFNET", ifp->if_xname,
1665 	    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1666 }
1667 
1668 /*
1669  * Handle interface watchdog timer routines.  Called
1670  * from softclock, we decrement timers (if set) and
1671  * call the appropriate interface routine on expiration.
1672  */
1673 static void
1674 if_slowtimo_dispatch(netmsg_t nmsg)
1675 {
1676 	struct globaldata *gd = mycpu;
1677 	const struct ifnet_array *arr;
1678 	int i;
1679 
1680 	ASSERT_IN_NETISR(0);
1681 
1682 	crit_enter_gd(gd);
1683 	lwkt_replymsg(&nmsg->lmsg, 0);  /* reply ASAP */
1684 	crit_exit_gd(gd);
1685 
1686 	arr = ifnet_array_get();
1687 	for (i = 0; i < arr->ifnet_count; ++i) {
1688 		struct ifnet *ifp = arr->ifnet_arr[i];
1689 
1690 		crit_enter_gd(gd);
1691 
1692 		if (if_stats_compat) {
1693 			IFNET_STAT_GET(ifp, ipackets, ifp->if_ipackets);
1694 			IFNET_STAT_GET(ifp, ierrors, ifp->if_ierrors);
1695 			IFNET_STAT_GET(ifp, opackets, ifp->if_opackets);
1696 			IFNET_STAT_GET(ifp, oerrors, ifp->if_oerrors);
1697 			IFNET_STAT_GET(ifp, collisions, ifp->if_collisions);
1698 			IFNET_STAT_GET(ifp, ibytes, ifp->if_ibytes);
1699 			IFNET_STAT_GET(ifp, obytes, ifp->if_obytes);
1700 			IFNET_STAT_GET(ifp, imcasts, ifp->if_imcasts);
1701 			IFNET_STAT_GET(ifp, omcasts, ifp->if_omcasts);
1702 			IFNET_STAT_GET(ifp, iqdrops, ifp->if_iqdrops);
1703 			IFNET_STAT_GET(ifp, noproto, ifp->if_noproto);
1704 			IFNET_STAT_GET(ifp, oqdrops, ifp->if_oqdrops);
1705 		}
1706 
1707 		if (ifp->if_timer == 0 || --ifp->if_timer) {
1708 			crit_exit_gd(gd);
1709 			continue;
1710 		}
1711 		if (ifp->if_watchdog) {
1712 			if (ifnet_tryserialize_all(ifp)) {
1713 				(*ifp->if_watchdog)(ifp);
1714 				ifnet_deserialize_all(ifp);
1715 			} else {
1716 				/* try again next timeout */
1717 				++ifp->if_timer;
1718 			}
1719 		}
1720 
1721 		crit_exit_gd(gd);
1722 	}
1723 
1724 	callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1725 }
1726 
1727 static void
1728 if_slowtimo(void *arg __unused)
1729 {
1730 	struct lwkt_msg *lmsg = &if_slowtimo_netmsg.lmsg;
1731 
1732 	KASSERT(mycpuid == 0, ("not on cpu0"));
1733 	crit_enter();
1734 	if (lmsg->ms_flags & MSGF_DONE)
1735 		lwkt_sendmsg_oncpu(netisr_cpuport(0), lmsg);
1736 	crit_exit();
1737 }
1738 
1739 /*
1740  * Map interface name to
1741  * interface structure pointer.
1742  */
1743 struct ifnet *
1744 ifunit(const char *name)
1745 {
1746 	struct ifnet *ifp;
1747 
1748 	/*
1749 	 * Search all the interfaces for this name/number
1750 	 */
1751 	KASSERT(mtx_owned(&ifnet_mtx), ("ifnet is not locked"));
1752 
1753 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
1754 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1755 			break;
1756 	}
1757 	return (ifp);
1758 }
1759 
1760 struct ifnet *
1761 ifunit_netisr(const char *name)
1762 {
1763 	const struct ifnet_array *arr;
1764 	int i;
1765 
1766 	/*
1767 	 * Search all the interfaces for this name/number
1768 	 */
1769 
1770 	arr = ifnet_array_get();
1771 	for (i = 0; i < arr->ifnet_count; ++i) {
1772 		struct ifnet *ifp = arr->ifnet_arr[i];
1773 
1774 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1775 			return ifp;
1776 	}
1777 	return NULL;
1778 }
1779 
1780 /*
1781  * Interface ioctls.
1782  */
1783 int
1784 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1785 {
1786 	struct ifnet *ifp;
1787 	struct ifreq *ifr;
1788 	struct ifstat *ifs;
1789 	int error, do_ifup = 0;
1790 	short oif_flags;
1791 	int new_flags;
1792 	size_t namelen, onamelen;
1793 	char new_name[IFNAMSIZ];
1794 	struct ifaddr *ifa;
1795 	struct sockaddr_dl *sdl;
1796 
1797 	switch (cmd) {
1798 	case SIOCGIFCONF:
1799 	case OSIOCGIFCONF:
1800 		return (ifconf(cmd, data, cred));
1801 	default:
1802 		break;
1803 	}
1804 
1805 	ifr = (struct ifreq *)data;
1806 
1807 	switch (cmd) {
1808 	case SIOCIFCREATE:
1809 	case SIOCIFCREATE2:
1810 		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1811 			return (error);
1812 		return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1813 		    	cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1814 	case SIOCIFDESTROY:
1815 		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1816 			return (error);
1817 		return (if_clone_destroy(ifr->ifr_name));
1818 	case SIOCIFGCLONERS:
1819 		return (if_clone_list((struct if_clonereq *)data));
1820 	default:
1821 		break;
1822 	}
1823 
1824 	/*
1825 	 * Nominal ioctl through interface, lookup the ifp and obtain a
1826 	 * lock to serialize the ifconfig ioctl operation.
1827 	 */
1828 	ifnet_lock();
1829 
1830 	ifp = ifunit(ifr->ifr_name);
1831 	if (ifp == NULL) {
1832 		ifnet_unlock();
1833 		return (ENXIO);
1834 	}
1835 	error = 0;
1836 
1837 	switch (cmd) {
1838 	case SIOCGIFINDEX:
1839 		ifr->ifr_index = ifp->if_index;
1840 		break;
1841 
1842 	case SIOCGIFFLAGS:
1843 		ifr->ifr_flags = ifp->if_flags;
1844 		ifr->ifr_flagshigh = ifp->if_flags >> 16;
1845 		break;
1846 
1847 	case SIOCGIFCAP:
1848 		ifr->ifr_reqcap = ifp->if_capabilities;
1849 		ifr->ifr_curcap = ifp->if_capenable;
1850 		break;
1851 
1852 	case SIOCGIFMETRIC:
1853 		ifr->ifr_metric = ifp->if_metric;
1854 		break;
1855 
1856 	case SIOCGIFMTU:
1857 		ifr->ifr_mtu = ifp->if_mtu;
1858 		break;
1859 
1860 	case SIOCGIFTSOLEN:
1861 		ifr->ifr_tsolen = ifp->if_tsolen;
1862 		break;
1863 
1864 	case SIOCGIFDATA:
1865 		error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1866 				sizeof(ifp->if_data));
1867 		break;
1868 
1869 	case SIOCGIFPHYS:
1870 		ifr->ifr_phys = ifp->if_physical;
1871 		break;
1872 
1873 	case SIOCGIFPOLLCPU:
1874 		ifr->ifr_pollcpu = -1;
1875 		break;
1876 
1877 	case SIOCSIFPOLLCPU:
1878 		break;
1879 
1880 	case SIOCSIFFLAGS:
1881 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1882 		if (error)
1883 			break;
1884 		new_flags = (ifr->ifr_flags & 0xffff) |
1885 		    (ifr->ifr_flagshigh << 16);
1886 		if (ifp->if_flags & IFF_SMART) {
1887 			/* Smart drivers twiddle their own routes */
1888 		} else if (ifp->if_flags & IFF_UP &&
1889 		    (new_flags & IFF_UP) == 0) {
1890 			if_down(ifp);
1891 		} else if (new_flags & IFF_UP &&
1892 		    (ifp->if_flags & IFF_UP) == 0) {
1893 			do_ifup = 1;
1894 		}
1895 
1896 #ifdef IFPOLL_ENABLE
1897 		if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1898 			if (new_flags & IFF_NPOLLING)
1899 				ifpoll_register(ifp);
1900 			else
1901 				ifpoll_deregister(ifp);
1902 		}
1903 #endif
1904 
1905 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1906 			(new_flags &~ IFF_CANTCHANGE);
1907 		if (new_flags & IFF_PPROMISC) {
1908 			/* Permanently promiscuous mode requested */
1909 			ifp->if_flags |= IFF_PROMISC;
1910 		} else if (ifp->if_pcount == 0) {
1911 			ifp->if_flags &= ~IFF_PROMISC;
1912 		}
1913 		if (ifp->if_ioctl) {
1914 			ifnet_serialize_all(ifp);
1915 			ifp->if_ioctl(ifp, cmd, data, cred);
1916 			ifnet_deserialize_all(ifp);
1917 		}
1918 		if (do_ifup)
1919 			if_up(ifp);
1920 		getmicrotime(&ifp->if_lastchange);
1921 		break;
1922 
1923 	case SIOCSIFCAP:
1924 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1925 		if (error)
1926 			break;
1927 		if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1928 			error = EINVAL;
1929 			break;
1930 		}
1931 		ifnet_serialize_all(ifp);
1932 		ifp->if_ioctl(ifp, cmd, data, cred);
1933 		ifnet_deserialize_all(ifp);
1934 		break;
1935 
1936 	case SIOCSIFNAME:
1937 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1938 		if (error)
1939 			break;
1940 		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1941 		if (error)
1942 			break;
1943 		if (new_name[0] == '\0') {
1944 			error = EINVAL;
1945 			break;
1946 		}
1947 		if (ifunit(new_name) != NULL) {
1948 			error = EEXIST;
1949 			break;
1950 		}
1951 
1952 		EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1953 
1954 		/* Announce the departure of the interface. */
1955 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1956 
1957 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1958 		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1959 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1960 		namelen = strlen(new_name);
1961 		onamelen = sdl->sdl_nlen;
1962 		/*
1963 		 * Move the address if needed.  This is safe because we
1964 		 * allocate space for a name of length IFNAMSIZ when we
1965 		 * create this in if_attach().
1966 		 */
1967 		if (namelen != onamelen) {
1968 			bcopy(sdl->sdl_data + onamelen,
1969 			    sdl->sdl_data + namelen, sdl->sdl_alen);
1970 		}
1971 		bcopy(new_name, sdl->sdl_data, namelen);
1972 		sdl->sdl_nlen = namelen;
1973 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1974 		bzero(sdl->sdl_data, onamelen);
1975 		while (namelen != 0)
1976 			sdl->sdl_data[--namelen] = 0xff;
1977 
1978 		EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1979 
1980 		/* Announce the return of the interface. */
1981 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1982 		break;
1983 
1984 	case SIOCSIFMETRIC:
1985 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1986 		if (error)
1987 			break;
1988 		ifp->if_metric = ifr->ifr_metric;
1989 		getmicrotime(&ifp->if_lastchange);
1990 		break;
1991 
1992 	case SIOCSIFPHYS:
1993 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1994 		if (error)
1995 			break;
1996 		if (ifp->if_ioctl == NULL) {
1997 		        error = EOPNOTSUPP;
1998 			break;
1999 		}
2000 		ifnet_serialize_all(ifp);
2001 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2002 		ifnet_deserialize_all(ifp);
2003 		if (error == 0)
2004 			getmicrotime(&ifp->if_lastchange);
2005 		break;
2006 
2007 	case SIOCSIFMTU:
2008 	{
2009 		u_long oldmtu = ifp->if_mtu;
2010 
2011 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2012 		if (error)
2013 			break;
2014 		if (ifp->if_ioctl == NULL) {
2015 			error = EOPNOTSUPP;
2016 			break;
2017 		}
2018 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
2019 			error = EINVAL;
2020 			break;
2021 		}
2022 		ifnet_serialize_all(ifp);
2023 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2024 		ifnet_deserialize_all(ifp);
2025 		if (error == 0) {
2026 			getmicrotime(&ifp->if_lastchange);
2027 			rt_ifmsg(ifp);
2028 		}
2029 		/*
2030 		 * If the link MTU changed, do network layer specific procedure.
2031 		 */
2032 		if (ifp->if_mtu != oldmtu) {
2033 #ifdef INET6
2034 			nd6_setmtu(ifp);
2035 #endif
2036 		}
2037 		break;
2038 	}
2039 
2040 	case SIOCSIFTSOLEN:
2041 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2042 		if (error)
2043 			break;
2044 
2045 		/* XXX need driver supplied upper limit */
2046 		if (ifr->ifr_tsolen <= 0) {
2047 			error = EINVAL;
2048 			break;
2049 		}
2050 		ifp->if_tsolen = ifr->ifr_tsolen;
2051 		break;
2052 
2053 	case SIOCADDMULTI:
2054 	case SIOCDELMULTI:
2055 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2056 		if (error)
2057 			break;
2058 
2059 		/* Don't allow group membership on non-multicast interfaces. */
2060 		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
2061 			error = EOPNOTSUPP;
2062 			break;
2063 		}
2064 
2065 		/* Don't let users screw up protocols' entries. */
2066 		if (ifr->ifr_addr.sa_family != AF_LINK) {
2067 			error = EINVAL;
2068 			break;
2069 		}
2070 
2071 		if (cmd == SIOCADDMULTI) {
2072 			struct ifmultiaddr *ifma;
2073 			error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
2074 		} else {
2075 			error = if_delmulti(ifp, &ifr->ifr_addr);
2076 		}
2077 		if (error == 0)
2078 			getmicrotime(&ifp->if_lastchange);
2079 		break;
2080 
2081 	case SIOCSIFPHYADDR:
2082 	case SIOCDIFPHYADDR:
2083 #ifdef INET6
2084 	case SIOCSIFPHYADDR_IN6:
2085 #endif
2086 	case SIOCSLIFPHYADDR:
2087         case SIOCSIFMEDIA:
2088 	case SIOCSIFGENERIC:
2089 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2090 		if (error)
2091 			break;
2092 		if (ifp->if_ioctl == 0) {
2093 			error = EOPNOTSUPP;
2094 			break;
2095 		}
2096 		ifnet_serialize_all(ifp);
2097 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2098 		ifnet_deserialize_all(ifp);
2099 		if (error == 0)
2100 			getmicrotime(&ifp->if_lastchange);
2101 		break;
2102 
2103 	case SIOCGIFSTATUS:
2104 		ifs = (struct ifstat *)data;
2105 		ifs->ascii[0] = '\0';
2106 		/* fall through */
2107 	case SIOCGIFPSRCADDR:
2108 	case SIOCGIFPDSTADDR:
2109 	case SIOCGLIFPHYADDR:
2110 	case SIOCGIFMEDIA:
2111 	case SIOCGIFGENERIC:
2112 		if (ifp->if_ioctl == NULL) {
2113 			error = EOPNOTSUPP;
2114 			break;
2115 		}
2116 		ifnet_serialize_all(ifp);
2117 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2118 		ifnet_deserialize_all(ifp);
2119 		break;
2120 
2121 	case SIOCSIFLLADDR:
2122 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2123 		if (error)
2124 			break;
2125 		error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
2126 				     ifr->ifr_addr.sa_len);
2127 		EVENTHANDLER_INVOKE(iflladdr_event, ifp);
2128 		break;
2129 
2130 	default:
2131 		oif_flags = ifp->if_flags;
2132 		if (so->so_proto == 0) {
2133 			error = EOPNOTSUPP;
2134 			break;
2135 		}
2136 		error = so_pru_control_direct(so, cmd, data, ifp);
2137 
2138 		if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
2139 #ifdef INET6
2140 			DELAY(100);/* XXX: temporary workaround for fxp issue*/
2141 			if (ifp->if_flags & IFF_UP) {
2142 				crit_enter();
2143 				in6_if_up(ifp);
2144 				crit_exit();
2145 			}
2146 #endif
2147 		}
2148 		break;
2149 	}
2150 
2151 	ifnet_unlock();
2152 	return (error);
2153 }
2154 
2155 /*
2156  * Set/clear promiscuous mode on interface ifp based on the truth value
2157  * of pswitch.  The calls are reference counted so that only the first
2158  * "on" request actually has an effect, as does the final "off" request.
2159  * Results are undefined if the "off" and "on" requests are not matched.
2160  */
2161 int
2162 ifpromisc(struct ifnet *ifp, int pswitch)
2163 {
2164 	struct ifreq ifr;
2165 	int error;
2166 	int oldflags;
2167 
2168 	oldflags = ifp->if_flags;
2169 	if (ifp->if_flags & IFF_PPROMISC) {
2170 		/* Do nothing if device is in permanently promiscuous mode */
2171 		ifp->if_pcount += pswitch ? 1 : -1;
2172 		return (0);
2173 	}
2174 	if (pswitch) {
2175 		/*
2176 		 * If the device is not configured up, we cannot put it in
2177 		 * promiscuous mode.
2178 		 */
2179 		if ((ifp->if_flags & IFF_UP) == 0)
2180 			return (ENETDOWN);
2181 		if (ifp->if_pcount++ != 0)
2182 			return (0);
2183 		ifp->if_flags |= IFF_PROMISC;
2184 		log(LOG_INFO, "%s: promiscuous mode enabled\n",
2185 		    ifp->if_xname);
2186 	} else {
2187 		if (--ifp->if_pcount > 0)
2188 			return (0);
2189 		ifp->if_flags &= ~IFF_PROMISC;
2190 		log(LOG_INFO, "%s: promiscuous mode disabled\n",
2191 		    ifp->if_xname);
2192 	}
2193 	ifr.ifr_flags = ifp->if_flags;
2194 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
2195 	ifnet_serialize_all(ifp);
2196 	error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
2197 	ifnet_deserialize_all(ifp);
2198 	if (error == 0)
2199 		rt_ifmsg(ifp);
2200 	else
2201 		ifp->if_flags = oldflags;
2202 	return error;
2203 }
2204 
2205 /*
2206  * Return interface configuration
2207  * of system.  List may be used
2208  * in later ioctl's (above) to get
2209  * other information.
2210  */
2211 static int
2212 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
2213 {
2214 	struct ifconf *ifc = (struct ifconf *)data;
2215 	struct ifnet *ifp;
2216 	struct sockaddr *sa;
2217 	struct ifreq ifr, *ifrp;
2218 	int space = ifc->ifc_len, error = 0;
2219 
2220 	ifrp = ifc->ifc_req;
2221 
2222 	ifnet_lock();
2223 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
2224 		struct ifaddr_container *ifac, *ifac_mark;
2225 		struct ifaddr_marker mark;
2226 		struct ifaddrhead *head;
2227 		int addrs;
2228 
2229 		if (space <= sizeof ifr)
2230 			break;
2231 
2232 		/*
2233 		 * Zero the stack declared structure first to prevent
2234 		 * memory disclosure.
2235 		 */
2236 		bzero(&ifr, sizeof(ifr));
2237 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
2238 		    >= sizeof(ifr.ifr_name)) {
2239 			error = ENAMETOOLONG;
2240 			break;
2241 		}
2242 
2243 		/*
2244 		 * Add a marker, since copyout() could block and during that
2245 		 * period the list could be changed.  Inserting the marker to
2246 		 * the header of the list will not cause trouble for the code
2247 		 * assuming that the first element of the list is AF_LINK; the
2248 		 * marker will be moved to the next position w/o blocking.
2249 		 */
2250 		ifa_marker_init(&mark, ifp);
2251 		ifac_mark = &mark.ifac;
2252 		head = &ifp->if_addrheads[mycpuid];
2253 
2254 		addrs = 0;
2255 		TAILQ_INSERT_HEAD(head, ifac_mark, ifa_link);
2256 		while ((ifac = TAILQ_NEXT(ifac_mark, ifa_link)) != NULL) {
2257 			struct ifaddr *ifa = ifac->ifa;
2258 
2259 			TAILQ_REMOVE(head, ifac_mark, ifa_link);
2260 			TAILQ_INSERT_AFTER(head, ifac, ifac_mark, ifa_link);
2261 
2262 			/* Ignore marker */
2263 			if (ifa->ifa_addr->sa_family == AF_UNSPEC)
2264 				continue;
2265 
2266 			if (space <= sizeof ifr)
2267 				break;
2268 			sa = ifa->ifa_addr;
2269 			if (cred->cr_prison &&
2270 			    prison_if(cred, sa))
2271 				continue;
2272 			addrs++;
2273 			/*
2274 			 * Keep a reference on this ifaddr, so that it will
2275 			 * not be destroyed when its address is copied to
2276 			 * the userland, which could block.
2277 			 */
2278 			IFAREF(ifa);
2279 			if (sa->sa_len <= sizeof(*sa)) {
2280 				ifr.ifr_addr = *sa;
2281 				error = copyout(&ifr, ifrp, sizeof ifr);
2282 				ifrp++;
2283 			} else {
2284 				if (space < (sizeof ifr) + sa->sa_len -
2285 					    sizeof(*sa)) {
2286 					IFAFREE(ifa);
2287 					break;
2288 				}
2289 				space -= sa->sa_len - sizeof(*sa);
2290 				error = copyout(&ifr, ifrp,
2291 						sizeof ifr.ifr_name);
2292 				if (error == 0)
2293 					error = copyout(sa, &ifrp->ifr_addr,
2294 							sa->sa_len);
2295 				ifrp = (struct ifreq *)
2296 					(sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2297 			}
2298 			IFAFREE(ifa);
2299 			if (error)
2300 				break;
2301 			space -= sizeof ifr;
2302 		}
2303 		TAILQ_REMOVE(head, ifac_mark, ifa_link);
2304 		if (error)
2305 			break;
2306 		if (!addrs) {
2307 			bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2308 			error = copyout(&ifr, ifrp, sizeof ifr);
2309 			if (error)
2310 				break;
2311 			space -= sizeof ifr;
2312 			ifrp++;
2313 		}
2314 	}
2315 	ifnet_unlock();
2316 
2317 	ifc->ifc_len -= space;
2318 	return (error);
2319 }
2320 
2321 /*
2322  * Just like if_promisc(), but for all-multicast-reception mode.
2323  */
2324 int
2325 if_allmulti(struct ifnet *ifp, int onswitch)
2326 {
2327 	int error = 0;
2328 	struct ifreq ifr;
2329 
2330 	crit_enter();
2331 
2332 	if (onswitch) {
2333 		if (ifp->if_amcount++ == 0) {
2334 			ifp->if_flags |= IFF_ALLMULTI;
2335 			ifr.ifr_flags = ifp->if_flags;
2336 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2337 			ifnet_serialize_all(ifp);
2338 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2339 					      NULL);
2340 			ifnet_deserialize_all(ifp);
2341 		}
2342 	} else {
2343 		if (ifp->if_amcount > 1) {
2344 			ifp->if_amcount--;
2345 		} else {
2346 			ifp->if_amcount = 0;
2347 			ifp->if_flags &= ~IFF_ALLMULTI;
2348 			ifr.ifr_flags = ifp->if_flags;
2349 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2350 			ifnet_serialize_all(ifp);
2351 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2352 					      NULL);
2353 			ifnet_deserialize_all(ifp);
2354 		}
2355 	}
2356 
2357 	crit_exit();
2358 
2359 	if (error == 0)
2360 		rt_ifmsg(ifp);
2361 	return error;
2362 }
2363 
2364 /*
2365  * Add a multicast listenership to the interface in question.
2366  * The link layer provides a routine which converts
2367  */
2368 int
2369 if_addmulti_serialized(struct ifnet *ifp, struct sockaddr *sa,
2370     struct ifmultiaddr **retifma)
2371 {
2372 	struct sockaddr *llsa, *dupsa;
2373 	int error;
2374 	struct ifmultiaddr *ifma;
2375 
2376 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2377 
2378 	/*
2379 	 * If the matching multicast address already exists
2380 	 * then don't add a new one, just add a reference
2381 	 */
2382 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2383 		if (sa_equal(sa, ifma->ifma_addr)) {
2384 			ifma->ifma_refcount++;
2385 			if (retifma)
2386 				*retifma = ifma;
2387 			return 0;
2388 		}
2389 	}
2390 
2391 	/*
2392 	 * Give the link layer a chance to accept/reject it, and also
2393 	 * find out which AF_LINK address this maps to, if it isn't one
2394 	 * already.
2395 	 */
2396 	if (ifp->if_resolvemulti) {
2397 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
2398 		if (error)
2399 			return error;
2400 	} else {
2401 		llsa = NULL;
2402 	}
2403 
2404 	ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_INTWAIT);
2405 	dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_INTWAIT);
2406 	bcopy(sa, dupsa, sa->sa_len);
2407 
2408 	ifma->ifma_addr = dupsa;
2409 	ifma->ifma_lladdr = llsa;
2410 	ifma->ifma_ifp = ifp;
2411 	ifma->ifma_refcount = 1;
2412 	ifma->ifma_protospec = NULL;
2413 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2414 
2415 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2416 	if (retifma)
2417 		*retifma = ifma;
2418 
2419 	if (llsa != NULL) {
2420 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2421 			if (sa_equal(ifma->ifma_addr, llsa))
2422 				break;
2423 		}
2424 		if (ifma) {
2425 			ifma->ifma_refcount++;
2426 		} else {
2427 			ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_INTWAIT);
2428 			dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_INTWAIT);
2429 			bcopy(llsa, dupsa, llsa->sa_len);
2430 			ifma->ifma_addr = dupsa;
2431 			ifma->ifma_ifp = ifp;
2432 			ifma->ifma_refcount = 1;
2433 			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2434 		}
2435 	}
2436 	/*
2437 	 * We are certain we have added something, so call down to the
2438 	 * interface to let them know about it.
2439 	 */
2440 	if (ifp->if_ioctl)
2441 		ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2442 
2443 	return 0;
2444 }
2445 
2446 int
2447 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
2448     struct ifmultiaddr **retifma)
2449 {
2450 	int error;
2451 
2452 	ifnet_serialize_all(ifp);
2453 	error = if_addmulti_serialized(ifp, sa, retifma);
2454 	ifnet_deserialize_all(ifp);
2455 
2456 	return error;
2457 }
2458 
2459 /*
2460  * Remove a reference to a multicast address on this interface.  Yell
2461  * if the request does not match an existing membership.
2462  */
2463 static int
2464 if_delmulti_serialized(struct ifnet *ifp, struct sockaddr *sa)
2465 {
2466 	struct ifmultiaddr *ifma;
2467 
2468 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2469 
2470 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2471 		if (sa_equal(sa, ifma->ifma_addr))
2472 			break;
2473 	if (ifma == NULL)
2474 		return ENOENT;
2475 
2476 	if (ifma->ifma_refcount > 1) {
2477 		ifma->ifma_refcount--;
2478 		return 0;
2479 	}
2480 
2481 	rt_newmaddrmsg(RTM_DELMADDR, ifma);
2482 	sa = ifma->ifma_lladdr;
2483 	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2484 	/*
2485 	 * Make sure the interface driver is notified
2486 	 * in the case of a link layer mcast group being left.
2487 	 */
2488 	if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL)
2489 		ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2490 	kfree(ifma->ifma_addr, M_IFMADDR);
2491 	kfree(ifma, M_IFMADDR);
2492 	if (sa == NULL)
2493 		return 0;
2494 
2495 	/*
2496 	 * Now look for the link-layer address which corresponds to
2497 	 * this network address.  It had been squirreled away in
2498 	 * ifma->ifma_lladdr for this purpose (so we don't have
2499 	 * to call ifp->if_resolvemulti() again), and we saved that
2500 	 * value in sa above.  If some nasty deleted the
2501 	 * link-layer address out from underneath us, we can deal because
2502 	 * the address we stored was is not the same as the one which was
2503 	 * in the record for the link-layer address.  (So we don't complain
2504 	 * in that case.)
2505 	 */
2506 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2507 		if (sa_equal(sa, ifma->ifma_addr))
2508 			break;
2509 	if (ifma == NULL)
2510 		return 0;
2511 
2512 	if (ifma->ifma_refcount > 1) {
2513 		ifma->ifma_refcount--;
2514 		return 0;
2515 	}
2516 
2517 	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2518 	ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2519 	kfree(ifma->ifma_addr, M_IFMADDR);
2520 	kfree(sa, M_IFMADDR);
2521 	kfree(ifma, M_IFMADDR);
2522 
2523 	return 0;
2524 }
2525 
2526 int
2527 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
2528 {
2529 	int error;
2530 
2531 	ifnet_serialize_all(ifp);
2532 	error = if_delmulti_serialized(ifp, sa);
2533 	ifnet_deserialize_all(ifp);
2534 
2535 	return error;
2536 }
2537 
2538 /*
2539  * Delete all multicast group membership for an interface.
2540  * Should be used to quickly flush all multicast filters.
2541  */
2542 void
2543 if_delallmulti_serialized(struct ifnet *ifp)
2544 {
2545 	struct ifmultiaddr *ifma, mark;
2546 	struct sockaddr sa;
2547 
2548 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2549 
2550 	bzero(&sa, sizeof(sa));
2551 	sa.sa_family = AF_UNSPEC;
2552 	sa.sa_len = sizeof(sa);
2553 
2554 	bzero(&mark, sizeof(mark));
2555 	mark.ifma_addr = &sa;
2556 
2557 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, &mark, ifma_link);
2558 	while ((ifma = TAILQ_NEXT(&mark, ifma_link)) != NULL) {
2559 		TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2560 		TAILQ_INSERT_AFTER(&ifp->if_multiaddrs, ifma, &mark,
2561 		    ifma_link);
2562 
2563 		if (ifma->ifma_addr->sa_family == AF_UNSPEC)
2564 			continue;
2565 
2566 		if_delmulti_serialized(ifp, ifma->ifma_addr);
2567 	}
2568 	TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2569 }
2570 
2571 
2572 /*
2573  * Set the link layer address on an interface.
2574  *
2575  * At this time we only support certain types of interfaces,
2576  * and we don't allow the length of the address to change.
2577  */
2578 int
2579 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2580 {
2581 	struct sockaddr_dl *sdl;
2582 	struct ifreq ifr;
2583 
2584 	sdl = IF_LLSOCKADDR(ifp);
2585 	if (sdl == NULL)
2586 		return (EINVAL);
2587 	if (len != sdl->sdl_alen)	/* don't allow length to change */
2588 		return (EINVAL);
2589 	switch (ifp->if_type) {
2590 	case IFT_ETHER:			/* these types use struct arpcom */
2591 	case IFT_XETHER:
2592 	case IFT_L2VLAN:
2593 	case IFT_IEEE8023ADLAG:
2594 		bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2595 		bcopy(lladdr, LLADDR(sdl), len);
2596 		break;
2597 	default:
2598 		return (ENODEV);
2599 	}
2600 	/*
2601 	 * If the interface is already up, we need
2602 	 * to re-init it in order to reprogram its
2603 	 * address filter.
2604 	 */
2605 	ifnet_serialize_all(ifp);
2606 	if ((ifp->if_flags & IFF_UP) != 0) {
2607 #ifdef INET
2608 		struct ifaddr_container *ifac;
2609 #endif
2610 
2611 		ifp->if_flags &= ~IFF_UP;
2612 		ifr.ifr_flags = ifp->if_flags;
2613 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
2614 		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2615 			      NULL);
2616 		ifp->if_flags |= IFF_UP;
2617 		ifr.ifr_flags = ifp->if_flags;
2618 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
2619 		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2620 				 NULL);
2621 #ifdef INET
2622 		/*
2623 		 * Also send gratuitous ARPs to notify other nodes about
2624 		 * the address change.
2625 		 */
2626 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2627 			struct ifaddr *ifa = ifac->ifa;
2628 
2629 			if (ifa->ifa_addr != NULL &&
2630 			    ifa->ifa_addr->sa_family == AF_INET)
2631 				arp_gratuitous(ifp, ifa);
2632 		}
2633 #endif
2634 	}
2635 	ifnet_deserialize_all(ifp);
2636 	return (0);
2637 }
2638 
2639 struct ifmultiaddr *
2640 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2641 {
2642 	struct ifmultiaddr *ifma;
2643 
2644 	/* TODO: need ifnet_serialize_main */
2645 	ifnet_serialize_all(ifp);
2646 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2647 		if (sa_equal(ifma->ifma_addr, sa))
2648 			break;
2649 	ifnet_deserialize_all(ifp);
2650 
2651 	return ifma;
2652 }
2653 
2654 /*
2655  * This function locates the first real ethernet MAC from a network
2656  * card and loads it into node, returning 0 on success or ENOENT if
2657  * no suitable interfaces were found.  It is used by the uuid code to
2658  * generate a unique 6-byte number.
2659  */
2660 int
2661 if_getanyethermac(uint16_t *node, int minlen)
2662 {
2663 	struct ifnet *ifp;
2664 	struct sockaddr_dl *sdl;
2665 
2666 	ifnet_lock();
2667 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
2668 		if (ifp->if_type != IFT_ETHER)
2669 			continue;
2670 		sdl = IF_LLSOCKADDR(ifp);
2671 		if (sdl->sdl_alen < minlen)
2672 			continue;
2673 		bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2674 		      minlen);
2675 		ifnet_unlock();
2676 		return(0);
2677 	}
2678 	ifnet_unlock();
2679 	return (ENOENT);
2680 }
2681 
2682 /*
2683  * The name argument must be a pointer to storage which will last as
2684  * long as the interface does.  For physical devices, the result of
2685  * device_get_name(dev) is a good choice and for pseudo-devices a
2686  * static string works well.
2687  */
2688 void
2689 if_initname(struct ifnet *ifp, const char *name, int unit)
2690 {
2691 	ifp->if_dname = name;
2692 	ifp->if_dunit = unit;
2693 	if (unit != IF_DUNIT_NONE)
2694 		ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2695 	else
2696 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
2697 }
2698 
2699 int
2700 if_printf(struct ifnet *ifp, const char *fmt, ...)
2701 {
2702 	__va_list ap;
2703 	int retval;
2704 
2705 	retval = kprintf("%s: ", ifp->if_xname);
2706 	__va_start(ap, fmt);
2707 	retval += kvprintf(fmt, ap);
2708 	__va_end(ap);
2709 	return (retval);
2710 }
2711 
2712 struct ifnet *
2713 if_alloc(uint8_t type)
2714 {
2715         struct ifnet *ifp;
2716 	size_t size;
2717 
2718 	/*
2719 	 * XXX temporary hack until arpcom is setup in if_l2com
2720 	 */
2721 	if (type == IFT_ETHER)
2722 		size = sizeof(struct arpcom);
2723 	else
2724 		size = sizeof(struct ifnet);
2725 
2726 	ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2727 
2728 	ifp->if_type = type;
2729 
2730 	if (if_com_alloc[type] != NULL) {
2731 		ifp->if_l2com = if_com_alloc[type](type, ifp);
2732 		if (ifp->if_l2com == NULL) {
2733 			kfree(ifp, M_IFNET);
2734 			return (NULL);
2735 		}
2736 	}
2737 	return (ifp);
2738 }
2739 
2740 void
2741 if_free(struct ifnet *ifp)
2742 {
2743 	kfree(ifp, M_IFNET);
2744 }
2745 
2746 void
2747 ifq_set_classic(struct ifaltq *ifq)
2748 {
2749 	ifq_set_methods(ifq, ifq->altq_ifp->if_mapsubq,
2750 	    ifsq_classic_enqueue, ifsq_classic_dequeue, ifsq_classic_request);
2751 }
2752 
2753 void
2754 ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
2755     ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request)
2756 {
2757 	int q;
2758 
2759 	KASSERT(mapsubq != NULL, ("mapsubq is not specified"));
2760 	KASSERT(enqueue != NULL, ("enqueue is not specified"));
2761 	KASSERT(dequeue != NULL, ("dequeue is not specified"));
2762 	KASSERT(request != NULL, ("request is not specified"));
2763 
2764 	ifq->altq_mapsubq = mapsubq;
2765 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
2766 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
2767 
2768 		ifsq->ifsq_enqueue = enqueue;
2769 		ifsq->ifsq_dequeue = dequeue;
2770 		ifsq->ifsq_request = request;
2771 	}
2772 }
2773 
2774 static void
2775 ifsq_norm_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2776 {
2777 
2778 	classq_add(&ifsq->ifsq_norm, m);
2779 	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2780 }
2781 
2782 static void
2783 ifsq_prio_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2784 {
2785 
2786 	classq_add(&ifsq->ifsq_prio, m);
2787 	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2788 	ALTQ_SQ_PRIO_CNTR_INC(ifsq, m->m_pkthdr.len);
2789 }
2790 
2791 static struct mbuf *
2792 ifsq_norm_dequeue(struct ifaltq_subque *ifsq)
2793 {
2794 	struct mbuf *m;
2795 
2796 	m = classq_get(&ifsq->ifsq_norm);
2797 	if (m != NULL)
2798 		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2799 	return (m);
2800 }
2801 
2802 static struct mbuf *
2803 ifsq_prio_dequeue(struct ifaltq_subque *ifsq)
2804 {
2805 	struct mbuf *m;
2806 
2807 	m = classq_get(&ifsq->ifsq_prio);
2808 	if (m != NULL) {
2809 		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2810 		ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
2811 	}
2812 	return (m);
2813 }
2814 
2815 int
2816 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
2817     struct altq_pktattr *pa __unused)
2818 {
2819 
2820 	M_ASSERTPKTHDR(m);
2821 again:
2822 	if (ifsq->ifsq_len >= ifsq->ifsq_maxlen ||
2823 	    ifsq->ifsq_bcnt >= ifsq->ifsq_maxbcnt) {
2824 		struct mbuf *m_drop;
2825 
2826 		if (m->m_flags & M_PRIO) {
2827 			m_drop = NULL;
2828 			if (ifsq->ifsq_prio_len < (ifsq->ifsq_maxlen >> 1) &&
2829 			    ifsq->ifsq_prio_bcnt < (ifsq->ifsq_maxbcnt >> 1)) {
2830 				/* Try dropping some from normal queue. */
2831 				m_drop = ifsq_norm_dequeue(ifsq);
2832 			}
2833 			if (m_drop == NULL)
2834 				m_drop = ifsq_prio_dequeue(ifsq);
2835 		} else {
2836 			m_drop = ifsq_norm_dequeue(ifsq);
2837 		}
2838 		if (m_drop != NULL) {
2839 			IFNET_STAT_INC(ifsq->ifsq_ifp, oqdrops, 1);
2840 			m_freem(m_drop);
2841 			goto again;
2842 		}
2843 		/*
2844 		 * No old packets could be dropped!
2845 		 * NOTE: Caller increases oqdrops.
2846 		 */
2847 		m_freem(m);
2848 		return (ENOBUFS);
2849 	} else {
2850 		if (m->m_flags & M_PRIO)
2851 			ifsq_prio_enqueue(ifsq, m);
2852 		else
2853 			ifsq_norm_enqueue(ifsq, m);
2854 		return (0);
2855 	}
2856 }
2857 
2858 struct mbuf *
2859 ifsq_classic_dequeue(struct ifaltq_subque *ifsq, int op)
2860 {
2861 	struct mbuf *m;
2862 
2863 	switch (op) {
2864 	case ALTDQ_POLL:
2865 		m = classq_head(&ifsq->ifsq_prio);
2866 		if (m == NULL)
2867 			m = classq_head(&ifsq->ifsq_norm);
2868 		break;
2869 
2870 	case ALTDQ_REMOVE:
2871 		m = ifsq_prio_dequeue(ifsq);
2872 		if (m == NULL)
2873 			m = ifsq_norm_dequeue(ifsq);
2874 		break;
2875 
2876 	default:
2877 		panic("unsupported ALTQ dequeue op: %d", op);
2878 	}
2879 	return m;
2880 }
2881 
2882 int
2883 ifsq_classic_request(struct ifaltq_subque *ifsq, int req, void *arg)
2884 {
2885 	switch (req) {
2886 	case ALTRQ_PURGE:
2887 		for (;;) {
2888 			struct mbuf *m;
2889 
2890 			m = ifsq_classic_dequeue(ifsq, ALTDQ_REMOVE);
2891 			if (m == NULL)
2892 				break;
2893 			m_freem(m);
2894 		}
2895 		break;
2896 
2897 	default:
2898 		panic("unsupported ALTQ request: %d", req);
2899 	}
2900 	return 0;
2901 }
2902 
2903 static void
2904 ifsq_ifstart_try(struct ifaltq_subque *ifsq, int force_sched)
2905 {
2906 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
2907 	int running = 0, need_sched;
2908 
2909 	/*
2910 	 * Try to do direct ifnet.if_start on the subqueue first, if there is
2911 	 * contention on the subqueue hardware serializer, ifnet.if_start on
2912 	 * the subqueue will be scheduled on the subqueue owner CPU.
2913 	 */
2914 	if (!ifsq_tryserialize_hw(ifsq)) {
2915 		/*
2916 		 * Subqueue hardware serializer contention happened,
2917 		 * ifnet.if_start on the subqueue is scheduled on
2918 		 * the subqueue owner CPU, and we keep going.
2919 		 */
2920 		ifsq_ifstart_schedule(ifsq, 1);
2921 		return;
2922 	}
2923 
2924 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
2925 		ifp->if_start(ifp, ifsq);
2926 		if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
2927 			running = 1;
2928 	}
2929 	need_sched = ifsq_ifstart_need_schedule(ifsq, running);
2930 
2931 	ifsq_deserialize_hw(ifsq);
2932 
2933 	if (need_sched) {
2934 		/*
2935 		 * More data need to be transmitted, ifnet.if_start on the
2936 		 * subqueue is scheduled on the subqueue owner CPU, and we
2937 		 * keep going.
2938 		 * NOTE: ifnet.if_start subqueue interlock is not released.
2939 		 */
2940 		ifsq_ifstart_schedule(ifsq, force_sched);
2941 	}
2942 }
2943 
2944 /*
2945  * Subqeue packets staging mechanism:
2946  *
2947  * The packets enqueued into the subqueue are staged to a certain amount
2948  * before the ifnet.if_start on the subqueue is called.  In this way, the
2949  * driver could avoid writing to hardware registers upon every packet,
2950  * instead, hardware registers could be written when certain amount of
2951  * packets are put onto hardware TX ring.  The measurement on several modern
2952  * NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) shows that the hardware
2953  * registers writing aggregation could save ~20% CPU time when 18bytes UDP
2954  * datagrams are transmitted at 1.48Mpps.  The performance improvement by
2955  * hardware registers writing aggeregation is also mentioned by Luigi Rizzo's
2956  * netmap paper (http://info.iet.unipi.it/~luigi/netmap/).
2957  *
2958  * Subqueue packets staging is performed for two entry points into drivers'
2959  * transmission function:
2960  * - Direct ifnet.if_start calling on the subqueue, i.e. ifsq_ifstart_try()
2961  * - ifnet.if_start scheduling on the subqueue, i.e. ifsq_ifstart_schedule()
2962  *
2963  * Subqueue packets staging will be stopped upon any of the following
2964  * conditions:
2965  * - If the count of packets enqueued on the current CPU is great than or
2966  *   equal to ifsq_stage_cntmax. (XXX this should be per-interface)
2967  * - If the total length of packets enqueued on the current CPU is great
2968  *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
2969  *   cut from the hardware's MTU mainly bacause a full TCP segment's size
2970  *   is usually less than hardware's MTU.
2971  * - ifsq_ifstart_schedule() is not pending on the current CPU and
2972  *   ifnet.if_start subqueue interlock (ifaltq_subq.ifsq_started) is not
2973  *   released.
2974  * - The if_start_rollup(), which is registered as low priority netisr
2975  *   rollup function, is called; probably because no more work is pending
2976  *   for netisr.
2977  *
2978  * NOTE:
2979  * Currently subqueue packet staging is only performed in netisr threads.
2980  */
2981 int
2982 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2983 {
2984 	struct ifaltq *ifq = &ifp->if_snd;
2985 	struct ifaltq_subque *ifsq;
2986 	int error, start = 0, len, mcast = 0, avoid_start = 0;
2987 	struct ifsubq_stage_head *head = NULL;
2988 	struct ifsubq_stage *stage = NULL;
2989 	struct globaldata *gd = mycpu;
2990 	struct thread *td = gd->gd_curthread;
2991 
2992 	crit_enter_quick(td);
2993 
2994 	ifsq = ifq_map_subq(ifq, gd->gd_cpuid);
2995 	ASSERT_ALTQ_SQ_NOT_SERIALIZED_HW(ifsq);
2996 
2997 	len = m->m_pkthdr.len;
2998 	if (m->m_flags & M_MCAST)
2999 		mcast = 1;
3000 
3001 	if (td->td_type == TD_TYPE_NETISR) {
3002 		head = &ifsubq_stage_heads[mycpuid];
3003 		stage = ifsq_get_stage(ifsq, mycpuid);
3004 
3005 		stage->stg_cnt++;
3006 		stage->stg_len += len;
3007 		if (stage->stg_cnt < ifsq_stage_cntmax &&
3008 		    stage->stg_len < (ifp->if_mtu - max_protohdr))
3009 			avoid_start = 1;
3010 	}
3011 
3012 	ALTQ_SQ_LOCK(ifsq);
3013 	error = ifsq_enqueue_locked(ifsq, m, pa);
3014 	if (error) {
3015 		IFNET_STAT_INC(ifp, oqdrops, 1);
3016 		if (!ifsq_data_ready(ifsq)) {
3017 			ALTQ_SQ_UNLOCK(ifsq);
3018 			crit_exit_quick(td);
3019 			return error;
3020 		}
3021 		avoid_start = 0;
3022 	}
3023 	if (!ifsq_is_started(ifsq)) {
3024 		if (avoid_start) {
3025 			ALTQ_SQ_UNLOCK(ifsq);
3026 
3027 			KKASSERT(!error);
3028 			if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
3029 				ifsq_stage_insert(head, stage);
3030 
3031 			IFNET_STAT_INC(ifp, obytes, len);
3032 			if (mcast)
3033 				IFNET_STAT_INC(ifp, omcasts, 1);
3034 			crit_exit_quick(td);
3035 			return error;
3036 		}
3037 
3038 		/*
3039 		 * Hold the subqueue interlock of ifnet.if_start
3040 		 */
3041 		ifsq_set_started(ifsq);
3042 		start = 1;
3043 	}
3044 	ALTQ_SQ_UNLOCK(ifsq);
3045 
3046 	if (!error) {
3047 		IFNET_STAT_INC(ifp, obytes, len);
3048 		if (mcast)
3049 			IFNET_STAT_INC(ifp, omcasts, 1);
3050 	}
3051 
3052 	if (stage != NULL) {
3053 		if (!start && (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)) {
3054 			KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
3055 			if (!avoid_start) {
3056 				ifsq_stage_remove(head, stage);
3057 				ifsq_ifstart_schedule(ifsq, 1);
3058 			}
3059 			crit_exit_quick(td);
3060 			return error;
3061 		}
3062 
3063 		if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED) {
3064 			ifsq_stage_remove(head, stage);
3065 		} else {
3066 			stage->stg_cnt = 0;
3067 			stage->stg_len = 0;
3068 		}
3069 	}
3070 
3071 	if (!start) {
3072 		crit_exit_quick(td);
3073 		return error;
3074 	}
3075 
3076 	ifsq_ifstart_try(ifsq, 0);
3077 
3078 	crit_exit_quick(td);
3079 	return error;
3080 }
3081 
3082 void *
3083 ifa_create(int size)
3084 {
3085 	struct ifaddr *ifa;
3086 	int i;
3087 
3088 	KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
3089 
3090 	ifa = kmalloc(size, M_IFADDR, M_INTWAIT | M_ZERO);
3091 	ifa->ifa_containers =
3092 	    kmalloc_cachealign(ncpus * sizeof(struct ifaddr_container),
3093 	        M_IFADDR, M_INTWAIT | M_ZERO);
3094 
3095 	ifa->ifa_ncnt = ncpus;
3096 	for (i = 0; i < ncpus; ++i) {
3097 		struct ifaddr_container *ifac = &ifa->ifa_containers[i];
3098 
3099 		ifac->ifa_magic = IFA_CONTAINER_MAGIC;
3100 		ifac->ifa = ifa;
3101 		ifac->ifa_refcnt = 1;
3102 	}
3103 #ifdef IFADDR_DEBUG
3104 	kprintf("alloc ifa %p %d\n", ifa, size);
3105 #endif
3106 	return ifa;
3107 }
3108 
3109 void
3110 ifac_free(struct ifaddr_container *ifac, int cpu_id)
3111 {
3112 	struct ifaddr *ifa = ifac->ifa;
3113 
3114 	KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
3115 	KKASSERT(ifac->ifa_refcnt == 0);
3116 	KASSERT(ifac->ifa_listmask == 0,
3117 		("ifa is still on %#x lists", ifac->ifa_listmask));
3118 
3119 	ifac->ifa_magic = IFA_CONTAINER_DEAD;
3120 
3121 #ifdef IFADDR_DEBUG_VERBOSE
3122 	kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
3123 #endif
3124 
3125 	KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
3126 		("invalid # of ifac, %d", ifa->ifa_ncnt));
3127 	if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
3128 #ifdef IFADDR_DEBUG
3129 		kprintf("free ifa %p\n", ifa);
3130 #endif
3131 		kfree(ifa->ifa_containers, M_IFADDR);
3132 		kfree(ifa, M_IFADDR);
3133 	}
3134 }
3135 
3136 static void
3137 ifa_iflink_dispatch(netmsg_t nmsg)
3138 {
3139 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3140 	struct ifaddr *ifa = msg->ifa;
3141 	struct ifnet *ifp = msg->ifp;
3142 	int cpu = mycpuid;
3143 	struct ifaddr_container *ifac;
3144 
3145 	crit_enter();
3146 
3147 	ifac = &ifa->ifa_containers[cpu];
3148 	ASSERT_IFAC_VALID(ifac);
3149 	KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
3150 		("ifaddr is on if_addrheads"));
3151 
3152 	ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
3153 	if (msg->tail)
3154 		TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
3155 	else
3156 		TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
3157 
3158 	crit_exit();
3159 
3160 	netisr_forwardmsg(&nmsg->base, cpu + 1);
3161 }
3162 
3163 void
3164 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
3165 {
3166 	struct netmsg_ifaddr msg;
3167 
3168 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3169 		    0, ifa_iflink_dispatch);
3170 	msg.ifa = ifa;
3171 	msg.ifp = ifp;
3172 	msg.tail = tail;
3173 
3174 	netisr_domsg(&msg.base, 0);
3175 }
3176 
3177 static void
3178 ifa_ifunlink_dispatch(netmsg_t nmsg)
3179 {
3180 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3181 	struct ifaddr *ifa = msg->ifa;
3182 	struct ifnet *ifp = msg->ifp;
3183 	int cpu = mycpuid;
3184 	struct ifaddr_container *ifac;
3185 
3186 	crit_enter();
3187 
3188 	ifac = &ifa->ifa_containers[cpu];
3189 	ASSERT_IFAC_VALID(ifac);
3190 	KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
3191 		("ifaddr is not on if_addrhead"));
3192 
3193 	TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
3194 	ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
3195 
3196 	crit_exit();
3197 
3198 	netisr_forwardmsg(&nmsg->base, cpu + 1);
3199 }
3200 
3201 void
3202 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
3203 {
3204 	struct netmsg_ifaddr msg;
3205 
3206 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3207 		    0, ifa_ifunlink_dispatch);
3208 	msg.ifa = ifa;
3209 	msg.ifp = ifp;
3210 
3211 	netisr_domsg(&msg.base, 0);
3212 }
3213 
3214 static void
3215 ifa_destroy_dispatch(netmsg_t nmsg)
3216 {
3217 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3218 
3219 	IFAFREE(msg->ifa);
3220 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
3221 }
3222 
3223 void
3224 ifa_destroy(struct ifaddr *ifa)
3225 {
3226 	struct netmsg_ifaddr msg;
3227 
3228 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3229 		    0, ifa_destroy_dispatch);
3230 	msg.ifa = ifa;
3231 
3232 	netisr_domsg(&msg.base, 0);
3233 }
3234 
3235 static void
3236 if_start_rollup(void)
3237 {
3238 	struct ifsubq_stage_head *head = &ifsubq_stage_heads[mycpuid];
3239 	struct ifsubq_stage *stage;
3240 
3241 	crit_enter();
3242 
3243 	while ((stage = TAILQ_FIRST(&head->stg_head)) != NULL) {
3244 		struct ifaltq_subque *ifsq = stage->stg_subq;
3245 		int is_sched = 0;
3246 
3247 		if (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)
3248 			is_sched = 1;
3249 		ifsq_stage_remove(head, stage);
3250 
3251 		if (is_sched) {
3252 			ifsq_ifstart_schedule(ifsq, 1);
3253 		} else {
3254 			int start = 0;
3255 
3256 			ALTQ_SQ_LOCK(ifsq);
3257 			if (!ifsq_is_started(ifsq)) {
3258 				/*
3259 				 * Hold the subqueue interlock of
3260 				 * ifnet.if_start
3261 				 */
3262 				ifsq_set_started(ifsq);
3263 				start = 1;
3264 			}
3265 			ALTQ_SQ_UNLOCK(ifsq);
3266 
3267 			if (start)
3268 				ifsq_ifstart_try(ifsq, 1);
3269 		}
3270 		KKASSERT((stage->stg_flags &
3271 		    (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
3272 	}
3273 
3274 	crit_exit();
3275 }
3276 
3277 static void
3278 ifnetinit(void *dummy __unused)
3279 {
3280 	int i;
3281 
3282 	for (i = 0; i < ncpus; ++i)
3283 		TAILQ_INIT(&ifsubq_stage_heads[i].stg_head);
3284 	netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
3285 }
3286 
3287 void
3288 if_register_com_alloc(u_char type,
3289     if_com_alloc_t *a, if_com_free_t *f)
3290 {
3291 
3292         KASSERT(if_com_alloc[type] == NULL,
3293             ("if_register_com_alloc: %d already registered", type));
3294         KASSERT(if_com_free[type] == NULL,
3295             ("if_register_com_alloc: %d free already registered", type));
3296 
3297         if_com_alloc[type] = a;
3298         if_com_free[type] = f;
3299 }
3300 
3301 void
3302 if_deregister_com_alloc(u_char type)
3303 {
3304 
3305         KASSERT(if_com_alloc[type] != NULL,
3306             ("if_deregister_com_alloc: %d not registered", type));
3307         KASSERT(if_com_free[type] != NULL,
3308             ("if_deregister_com_alloc: %d free not registered", type));
3309         if_com_alloc[type] = NULL;
3310         if_com_free[type] = NULL;
3311 }
3312 
3313 void
3314 ifq_set_maxlen(struct ifaltq *ifq, int len)
3315 {
3316 	ifq->altq_maxlen = len + (ncpus * ifsq_stage_cntmax);
3317 }
3318 
3319 int
3320 ifq_mapsubq_default(struct ifaltq *ifq __unused, int cpuid __unused)
3321 {
3322 	return ALTQ_SUBQ_INDEX_DEFAULT;
3323 }
3324 
3325 int
3326 ifq_mapsubq_mask(struct ifaltq *ifq, int cpuid)
3327 {
3328 
3329 	return (cpuid & ifq->altq_subq_mappriv);
3330 }
3331 
3332 int
3333 ifq_mapsubq_modulo(struct ifaltq *ifq, int cpuid)
3334 {
3335 
3336 	return (cpuid % ifq->altq_subq_mappriv);
3337 }
3338 
3339 static void
3340 ifsq_watchdog(void *arg)
3341 {
3342 	struct ifsubq_watchdog *wd = arg;
3343 	struct ifnet *ifp;
3344 
3345 	if (__predict_true(wd->wd_timer == 0 || --wd->wd_timer))
3346 		goto done;
3347 
3348 	ifp = ifsq_get_ifp(wd->wd_subq);
3349 	if (ifnet_tryserialize_all(ifp)) {
3350 		wd->wd_watchdog(wd->wd_subq);
3351 		ifnet_deserialize_all(ifp);
3352 	} else {
3353 		/* try again next timeout */
3354 		wd->wd_timer = 1;
3355 	}
3356 done:
3357 	ifsq_watchdog_reset(wd);
3358 }
3359 
3360 static void
3361 ifsq_watchdog_reset(struct ifsubq_watchdog *wd)
3362 {
3363 	callout_reset_bycpu(&wd->wd_callout, hz, ifsq_watchdog, wd,
3364 	    ifsq_get_cpuid(wd->wd_subq));
3365 }
3366 
3367 void
3368 ifsq_watchdog_init(struct ifsubq_watchdog *wd, struct ifaltq_subque *ifsq,
3369     ifsq_watchdog_t watchdog)
3370 {
3371 	callout_init_mp(&wd->wd_callout);
3372 	wd->wd_timer = 0;
3373 	wd->wd_subq = ifsq;
3374 	wd->wd_watchdog = watchdog;
3375 }
3376 
3377 void
3378 ifsq_watchdog_start(struct ifsubq_watchdog *wd)
3379 {
3380 	wd->wd_timer = 0;
3381 	ifsq_watchdog_reset(wd);
3382 }
3383 
3384 void
3385 ifsq_watchdog_stop(struct ifsubq_watchdog *wd)
3386 {
3387 	wd->wd_timer = 0;
3388 	callout_stop(&wd->wd_callout);
3389 }
3390 
3391 void
3392 ifnet_lock(void)
3393 {
3394 	KASSERT(curthread->td_type != TD_TYPE_NETISR,
3395 	    ("try holding ifnet lock in netisr"));
3396 	mtx_lock(&ifnet_mtx);
3397 }
3398 
3399 void
3400 ifnet_unlock(void)
3401 {
3402 	KASSERT(curthread->td_type != TD_TYPE_NETISR,
3403 	    ("try holding ifnet lock in netisr"));
3404 	mtx_unlock(&ifnet_mtx);
3405 }
3406 
3407 static struct ifnet_array *
3408 ifnet_array_alloc(int count)
3409 {
3410 	struct ifnet_array *arr;
3411 
3412 	arr = kmalloc(__offsetof(struct ifnet_array, ifnet_arr[count]),
3413 	    M_IFNET, M_WAITOK);
3414 	arr->ifnet_count = count;
3415 
3416 	return arr;
3417 }
3418 
3419 static void
3420 ifnet_array_free(struct ifnet_array *arr)
3421 {
3422 	if (arr == &ifnet_array0)
3423 		return;
3424 	kfree(arr, M_IFNET);
3425 }
3426 
3427 static struct ifnet_array *
3428 ifnet_array_add(struct ifnet *ifp, const struct ifnet_array *old_arr)
3429 {
3430 	struct ifnet_array *arr;
3431 	int count, i;
3432 
3433 	KASSERT(old_arr->ifnet_count >= 0,
3434 	    ("invalid ifnet array count %d", old_arr->ifnet_count));
3435 	count = old_arr->ifnet_count + 1;
3436 	arr = ifnet_array_alloc(count);
3437 
3438 	/*
3439 	 * Save the old ifnet array and append this ifp to the end of
3440 	 * the new ifnet array.
3441 	 */
3442 	for (i = 0; i < old_arr->ifnet_count; ++i) {
3443 		KASSERT(old_arr->ifnet_arr[i] != ifp,
3444 		    ("%s is already in ifnet array", ifp->if_xname));
3445 		arr->ifnet_arr[i] = old_arr->ifnet_arr[i];
3446 	}
3447 	KASSERT(i == count - 1,
3448 	    ("add %s, ifnet array index mismatch, should be %d, but got %d",
3449 	     ifp->if_xname, count - 1, i));
3450 	arr->ifnet_arr[i] = ifp;
3451 
3452 	return arr;
3453 }
3454 
3455 static struct ifnet_array *
3456 ifnet_array_del(struct ifnet *ifp, const struct ifnet_array *old_arr)
3457 {
3458 	struct ifnet_array *arr;
3459 	int count, i, idx, found = 0;
3460 
3461 	KASSERT(old_arr->ifnet_count > 0,
3462 	    ("invalid ifnet array count %d", old_arr->ifnet_count));
3463 	count = old_arr->ifnet_count - 1;
3464 	arr = ifnet_array_alloc(count);
3465 
3466 	/*
3467 	 * Save the old ifnet array, but skip this ifp.
3468 	 */
3469 	idx = 0;
3470 	for (i = 0; i < old_arr->ifnet_count; ++i) {
3471 		if (old_arr->ifnet_arr[i] == ifp) {
3472 			KASSERT(!found,
3473 			    ("dup %s is in ifnet array", ifp->if_xname));
3474 			found = 1;
3475 			continue;
3476 		}
3477 		KASSERT(idx < count,
3478 		    ("invalid ifnet array index %d, count %d", idx, count));
3479 		arr->ifnet_arr[idx] = old_arr->ifnet_arr[i];
3480 		++idx;
3481 	}
3482 	KASSERT(found, ("%s is not in ifnet array", ifp->if_xname));
3483 	KASSERT(idx == count,
3484 	    ("del %s, ifnet array count mismatch, should be %d, but got %d ",
3485 	     ifp->if_xname, count, idx));
3486 
3487 	return arr;
3488 }
3489 
3490 const struct ifnet_array *
3491 ifnet_array_get(void)
3492 {
3493 	const struct ifnet_array *ret;
3494 
3495 	KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
3496 	ret = ifnet_array;
3497 	/* Make sure 'ret' is really used. */
3498 	cpu_ccfence();
3499 	return (ret);
3500 }
3501 
3502 int
3503 ifnet_array_isempty(void)
3504 {
3505 	KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
3506 	if (ifnet_array->ifnet_count == 0)
3507 		return 1;
3508 	else
3509 		return 0;
3510 }
3511 
3512 void
3513 ifa_marker_init(struct ifaddr_marker *mark, struct ifnet *ifp)
3514 {
3515 	struct ifaddr *ifa;
3516 
3517 	memset(mark, 0, sizeof(*mark));
3518 	ifa = &mark->ifa;
3519 
3520 	mark->ifac.ifa = ifa;
3521 
3522 	ifa->ifa_addr = &mark->addr;
3523 	ifa->ifa_dstaddr = &mark->dstaddr;
3524 	ifa->ifa_netmask = &mark->netmask;
3525 	ifa->ifa_ifp = ifp;
3526 }
3527 
3528 static int
3529 if_ringcnt_fixup(int ring_cnt, int ring_cntmax)
3530 {
3531 
3532 	KASSERT(ring_cntmax > 0, ("invalid ring count max %d", ring_cntmax));
3533 	if (ring_cnt == 1 || ring_cntmax == 1 || netisr_ncpus == 1)
3534 		return (1);
3535 
3536 	if (ring_cnt <= 0 || ring_cnt > ring_cntmax)
3537 		ring_cnt = ring_cntmax;
3538 	if (ring_cnt > netisr_ncpus)
3539 		ring_cnt = netisr_ncpus;
3540 	return (ring_cnt);
3541 }
3542 
3543 static void
3544 if_ringmap_set_grid(device_t dev, struct if_ringmap *rm, int grid)
3545 {
3546 	int i, offset;
3547 
3548 	KASSERT(grid > 0, ("invalid if_ringmap grid %d", grid));
3549 	rm->rm_grid = grid;
3550 
3551 	offset = (rm->rm_grid * device_get_unit(dev)) % netisr_ncpus;
3552 	for (i = 0; i < rm->rm_cnt; ++i)
3553 		rm->rm_cpumap[i] = (offset + i) % netisr_ncpus;
3554 }
3555 
3556 struct if_ringmap *
3557 if_ringmap_alloc(device_t dev, int ring_cnt, int ring_cntmax)
3558 {
3559 	struct if_ringmap *rm;
3560 	int i, grid = 0;
3561 
3562 	ring_cnt = if_ringcnt_fixup(ring_cnt, ring_cntmax);
3563 	rm = kmalloc(__offsetof(struct if_ringmap, rm_cpumap[ring_cnt]),
3564 	    M_DEVBUF, M_WAITOK | M_ZERO);
3565 
3566 	rm->rm_cnt = ring_cnt;
3567 	for (i = 0; i < netisr_ncpus; ++i) {
3568 		if (netisr_ncpus % (i + 1) != 0)
3569 			continue;
3570 
3571 		if (rm->rm_cnt > netisr_ncpus / (i + 2)) {
3572 			grid = netisr_ncpus / (i + 1);
3573 			if (rm->rm_cnt > grid)
3574 				rm->rm_cnt = grid;
3575 			break;
3576 		}
3577 	}
3578 	if_ringmap_set_grid(dev, rm, grid);
3579 
3580 	return (rm);
3581 }
3582 
3583 void
3584 if_ringmap_free(struct if_ringmap *rm)
3585 {
3586 
3587 	kfree(rm, M_DEVBUF);
3588 }
3589 
3590 void
3591 if_ringmap_align(device_t dev, struct if_ringmap *rm0, struct if_ringmap *rm1)
3592 {
3593 
3594 	if (rm0->rm_grid > rm1->rm_grid)
3595 		if_ringmap_set_grid(dev, rm1, rm0->rm_grid);
3596 	else if (rm0->rm_grid < rm1->rm_grid)
3597 		if_ringmap_set_grid(dev, rm0, rm1->rm_grid);
3598 }
3599 
3600 void
3601 if_ringmap_match(device_t dev, struct if_ringmap *rm0, struct if_ringmap *rm1)
3602 {
3603 
3604 	if (rm0->rm_grid == netisr_ncpus || rm1->rm_grid == netisr_ncpus)
3605 		return;
3606 	if_ringmap_align(dev, rm0, rm1);
3607 }
3608 
3609 int
3610 if_ringmap_count(const struct if_ringmap *rm)
3611 {
3612 
3613 	return (rm->rm_cnt);
3614 }
3615 
3616 int
3617 if_ringmap_cpumap(const struct if_ringmap *rm, int ring)
3618 {
3619 
3620 	KASSERT(ring >= 0 && ring < rm->rm_cnt, ("invalid ring %d", ring));
3621 	return (rm->rm_cpumap[ring]);
3622 }
3623 
3624 void
3625 if_ringmap_rdrtable(const struct if_ringmap *rm, int table[], int table_nent)
3626 {
3627 	int i, grid_idx, grid_cnt, patch_off, patch_cnt, ncopy;
3628 
3629 	KASSERT(table_nent > 0 && (table_nent & NETISR_CPUMASK) == 0,
3630 	    ("invalid redirect table entries %d", table_nent));
3631 
3632 	grid_idx = 0;
3633 	for (i = 0; i < NETISR_CPUMAX; ++i) {
3634 		table[i] = grid_idx++ % rm->rm_cnt;
3635 
3636 		if (grid_idx == rm->rm_grid)
3637 			grid_idx = 0;
3638 	}
3639 
3640 	/*
3641 	 * Make the ring distributed more evenly for the remainder of each
3642 	 * grid.
3643 	 */
3644 	patch_cnt = rm->rm_grid % rm->rm_cnt;
3645 	if (patch_cnt == 0)
3646 		goto done;
3647 	patch_off = rm->rm_grid - (rm->rm_grid % rm->rm_cnt);
3648 
3649 	grid_cnt = roundup(NETISR_CPUMAX, rm->rm_grid) / rm->rm_grid;
3650 	grid_idx = 0;
3651 	for (i = 0; i < grid_cnt; ++i) {
3652 		int j;
3653 
3654 		for (j = 0; j < patch_cnt; ++j) {
3655 			int fix_idx;
3656 
3657 			fix_idx = (i * rm->rm_grid) + patch_off + j;
3658 			if (fix_idx >= NETISR_CPUMAX)
3659 				goto done;
3660 			table[fix_idx] = grid_idx++ % rm->rm_cnt;
3661 		}
3662 	}
3663 done:
3664 	ncopy = table_nent / NETISR_CPUMAX;
3665 	for (i = 1; i < ncopy; ++i) {
3666 		memcpy(&table[i * NETISR_CPUMAX], table,
3667 		    NETISR_CPUMAX * sizeof(table[0]));
3668 	}
3669 	if (if_ringmap_dumprdr) {
3670 		for (i = 0; i < table_nent; ++i) {
3671 			if (i != 0 && i % 16 == 0)
3672 				kprintf("\n");
3673 			kprintf("%03d ", table[i]);
3674 		}
3675 		kprintf("\n");
3676 	}
3677 }
3678 
3679 int
3680 if_ringmap_cpumap_sysctl(SYSCTL_HANDLER_ARGS)
3681 {
3682 	struct if_ringmap *rm = arg1;
3683 	int i, error = 0;
3684 
3685 	for (i = 0; i < rm->rm_cnt; ++i) {
3686 		int cpu = rm->rm_cpumap[i];
3687 
3688 		error = SYSCTL_OUT(req, &cpu, sizeof(cpu));
3689 		if (error)
3690 			break;
3691 	}
3692 	return (error);
3693 }
3694 
3695 int
3696 if_ring_count2(int ring_cnt, int ring_cntmax)
3697 {
3698 
3699 	ring_cnt = if_ringcnt_fixup(ring_cnt, ring_cntmax);
3700 	return (1 << (fls(ring_cnt) - 1));
3701 }
3702