xref: /dflybsd-src/sys/net/if.c (revision 2799a574be714cd990b2d176c0e290cb04b4e6d5)
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)if.c	8.3 (Berkeley) 1/4/94
30  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
31  */
32 
33 #include "opt_inet6.h"
34 #include "opt_inet.h"
35 #include "opt_ifpoll.h"
36 
37 #include <sys/param.h>
38 #include <sys/malloc.h>
39 #include <sys/mbuf.h>
40 #include <sys/systm.h>
41 #include <sys/proc.h>
42 #include <sys/priv.h>
43 #include <sys/protosw.h>
44 #include <sys/socket.h>
45 #include <sys/socketvar.h>
46 #include <sys/socketops.h>
47 #include <sys/kernel.h>
48 #include <sys/ktr.h>
49 #include <sys/mutex.h>
50 #include <sys/sockio.h>
51 #include <sys/syslog.h>
52 #include <sys/sysctl.h>
53 #include <sys/domain.h>
54 #include <sys/thread.h>
55 #include <sys/serialize.h>
56 #include <sys/bus.h>
57 
58 #include <sys/thread2.h>
59 #include <sys/msgport2.h>
60 #include <sys/mutex2.h>
61 
62 #include <net/if.h>
63 #include <net/if_arp.h>
64 #include <net/if_dl.h>
65 #include <net/if_types.h>
66 #include <net/if_var.h>
67 #include <net/ifq_var.h>
68 #include <net/radix.h>
69 #include <net/route.h>
70 #include <net/if_clone.h>
71 #include <net/netisr2.h>
72 #include <net/netmsg2.h>
73 
74 #include <machine/atomic.h>
75 #include <machine/stdarg.h>
76 #include <machine/smp.h>
77 
78 #if defined(INET) || defined(INET6)
79 /*XXX*/
80 #include <netinet/in.h>
81 #include <netinet/in_var.h>
82 #include <netinet/if_ether.h>
83 #ifdef INET6
84 #include <netinet6/in6_var.h>
85 #include <netinet6/in6_ifattach.h>
86 #endif
87 #endif
88 
/*
 * Netmsg that carries an ifaddr/ifnet pair.
 *
 * NOTE(review): no sender or handler of this message is visible in this
 * part of the file; the per-field notes below are inferred from the field
 * names and should be confirmed against the users of this struct.
 */
struct netmsg_ifaddr {
	struct netmsg_base base;	/* embedded netmsg header, first member */
	struct ifaddr	*ifa;		/* address the message refers to */
	struct ifnet	*ifp;		/* interface the message refers to */
	int		tail;		/* presumably "insert at tail" flag -- confirm */
};
95 
/*
 * Head of a list of subqueue stages.  One head exists per CPU (see
 * ifsubq_stage_heads[], indexed by mycpuid); the struct is cache aligned.
 */
struct ifsubq_stage_head {
	TAILQ_HEAD(, ifsubq_stage)	stg_head;	/* stages linked via stg_link */
} __cachealign;
99 
100 /*
101  * System initialization
102  */
103 static void	if_attachdomain(void *);
104 static void	if_attachdomain1(struct ifnet *);
105 static int	ifconf(u_long, caddr_t, struct ucred *);
106 static void	ifinit(void *);
107 static void	ifnetinit(void *);
108 static void	if_slowtimo(void *);
109 static void	link_rtrequest(int, struct rtentry *);
110 static int	if_rtdel(struct radix_node *, void *);
111 static void	if_slowtimo_dispatch(netmsg_t);
112 
113 /* Helper functions */
114 static void	ifsq_watchdog_reset(struct ifsubq_watchdog *);
115 static int	if_delmulti_serialized(struct ifnet *, struct sockaddr *);
116 static struct ifnet_array *ifnet_array_alloc(int);
117 static void	ifnet_array_free(struct ifnet_array *);
118 static struct ifnet_array *ifnet_array_add(struct ifnet *,
119 		    const struct ifnet_array *);
120 static struct ifnet_array *ifnet_array_del(struct ifnet *,
121 		    const struct ifnet_array *);
122 
123 #ifdef INET6
/*
 * XXX: declared here to avoid including many inet6 related files.
 * Should this be more generalized?
 */
128 extern void	nd6_setmtu(struct ifnet *);
129 #endif
130 
131 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
132 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
133 
134 static int ifsq_stage_cntmax = 4;
135 TUNABLE_INT("net.link.stage_cntmax", &ifsq_stage_cntmax);
136 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
137     &ifsq_stage_cntmax, 0, "ifq staging packet count max");
138 
139 static int if_stats_compat = 0;
140 SYSCTL_INT(_net_link, OID_AUTO, stats_compat, CTLFLAG_RW,
141     &if_stats_compat, 0, "Compat the old ifnet stats");
142 
143 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL);
144 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_ANY, ifnetinit, NULL);
145 
146 static  if_com_alloc_t *if_com_alloc[256];
147 static  if_com_free_t *if_com_free[256];
148 
149 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
150 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
151 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
152 
/* Queue length applied to if_snd when a driver leaves altq_maxlen at 0. */
int			ifqmaxlen = IFQ_MAXLEN;
/*
 * Tail queue of interfaces.
 * NOTE(review): the functions in this file name the list `ifnetlist';
 * presumably that is an alias of this object -- confirm in if_var.h.
 */
struct ifnethead	ifnet = TAILQ_HEAD_INITIALIZER(ifnet);

/*
 * Current ifnet array.  if_attach()/if_detach() install a new array under
 * ifnet_lock() and free the previous one after netmsg_service_sync().
 */
static struct ifnet_array	ifnet_array0;
static struct ifnet_array	*ifnet_array = &ifnet_array0;

/* Callout and netmsg for if_slowtimo; both are initialized in ifinit(). */
static struct callout		if_slowtimo_timer;
static struct netmsg_base	if_slowtimo_netmsg;

/*
 * if_index is the highest interface index currently in use; if_attach()
 * hands out if_index + 1 and if_detach() lowers it past trailing NULL slots.
 * ifindex2ifnet maps an interface index to its ifnet and is grown by
 * doubling in if_attach().
 */
int			if_index = 0;
struct ifnet		**ifindex2ifnet = NULL;
/* Presumably the mutex behind ifnet_lock()/ifnet_unlock() -- confirm. */
static struct mtx	ifnet_mtx = MTX_INITIALIZER("ifnet");

/* Per-CPU stage list heads, indexed by mycpuid. */
static struct ifsubq_stage_head	ifsubq_stage_heads[MAXCPU];
167 
168 #ifdef notyet
169 #define IFQ_KTR_STRING		"ifq=%p"
170 #define IFQ_KTR_ARGS	struct ifaltq *ifq
171 #ifndef KTR_IFQ
172 #define KTR_IFQ			KTR_ALL
173 #endif
174 KTR_INFO_MASTER(ifq);
175 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
176 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
177 #define logifq(name, arg)	KTR_LOG(ifq_ ## name, arg)
178 
179 #define IF_START_KTR_STRING	"ifp=%p"
180 #define IF_START_KTR_ARGS	struct ifnet *ifp
181 #ifndef KTR_IF_START
182 #define KTR_IF_START		KTR_ALL
183 #endif
184 KTR_INFO_MASTER(if_start);
185 KTR_INFO(KTR_IF_START, if_start, run, 0,
186 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
187 KTR_INFO(KTR_IF_START, if_start, sched, 1,
188 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
189 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
190 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
191 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
192 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
193 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
194 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
195 #define logifstart(name, arg)	KTR_LOG(if_start_ ## name, arg)
196 #endif
197 
198 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
199 
200 /*
201  * Network interface utility routines.
202  *
203  * Routines with ifa_ifwith* names take sockaddr *'s as
204  * parameters.
205  */
206 /* ARGSUSED*/
207 static void
208 ifinit(void *dummy)
209 {
210 	struct ifnet *ifp;
211 
212 	callout_init_mp(&if_slowtimo_timer);
213 	netmsg_init(&if_slowtimo_netmsg, NULL, &netisr_adone_rport,
214 	    MSGF_PRIORITY, if_slowtimo_dispatch);
215 
216 	/* XXX is this necessary? */
217 	ifnet_lock();
218 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
219 		if (ifp->if_snd.altq_maxlen == 0) {
220 			if_printf(ifp, "XXX: driver didn't set altq_maxlen\n");
221 			ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
222 		}
223 	}
224 	ifnet_unlock();
225 
226 	/* Start if_slowtimo */
227 	lwkt_sendmsg(netisr_cpuport(0), &if_slowtimo_netmsg.lmsg);
228 }
229 
230 static void
231 ifsq_ifstart_ipifunc(void *arg)
232 {
233 	struct ifaltq_subque *ifsq = arg;
234 	struct lwkt_msg *lmsg = ifsq_get_ifstart_lmsg(ifsq, mycpuid);
235 
236 	crit_enter();
237 	if (lmsg->ms_flags & MSGF_DONE)
238 		lwkt_sendmsg_oncpu(netisr_cpuport(mycpuid), lmsg);
239 	crit_exit();
240 }
241 
242 static __inline void
243 ifsq_stage_remove(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
244 {
245 	KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
246 	TAILQ_REMOVE(&head->stg_head, stage, stg_link);
247 	stage->stg_flags &= ~(IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED);
248 	stage->stg_cnt = 0;
249 	stage->stg_len = 0;
250 }
251 
252 static __inline void
253 ifsq_stage_insert(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
254 {
255 	KKASSERT((stage->stg_flags &
256 	    (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
257 	stage->stg_flags |= IFSQ_STAGE_FLAG_QUED;
258 	TAILQ_INSERT_TAIL(&head->stg_head, stage, stg_link);
259 }
260 
261 /*
262  * Schedule ifnet.if_start on the subqueue owner CPU
263  */
264 static void
265 ifsq_ifstart_schedule(struct ifaltq_subque *ifsq, int force)
266 {
267 	int cpu;
268 
269 	if (!force && curthread->td_type == TD_TYPE_NETISR &&
270 	    ifsq_stage_cntmax > 0) {
271 		struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
272 
273 		stage->stg_cnt = 0;
274 		stage->stg_len = 0;
275 		if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
276 			ifsq_stage_insert(&ifsubq_stage_heads[mycpuid], stage);
277 		stage->stg_flags |= IFSQ_STAGE_FLAG_SCHED;
278 		return;
279 	}
280 
281 	cpu = ifsq_get_cpuid(ifsq);
282 	if (cpu != mycpuid)
283 		lwkt_send_ipiq(globaldata_find(cpu), ifsq_ifstart_ipifunc, ifsq);
284 	else
285 		ifsq_ifstart_ipifunc(ifsq);
286 }
287 
/*
 * Decide whether ifnet.if_start must be scheduled again for the subqueue.
 *
 * ifsq:    subqueue that just ran if_start.
 * running: non-zero if the interface is still running and the subqueue
 *          is not oactive, as computed by the caller.
 *
 * Returns 1 if if_start should be scheduled (the subqueue interlock is
 * kept), or 0 if not.
 *
 * NOTE:
 * This function will release ifnet.if_start subqueue interlock,
 * if ifnet.if_start for the subqueue does not need to be scheduled
 */
static __inline int
ifsq_ifstart_need_schedule(struct ifaltq_subque *ifsq, int running)
{
	int recheck;

	/* Fast path: still running with packets queued and no TBR. */
	recheck = (!running || ifsq_is_empty(ifsq));
#ifdef ALTQ
	if (!recheck && ifsq->ifsq_altq->altq_tbr != NULL)
		recheck = 1;
#endif
	if (!recheck)
		return 1;

	ALTQ_SQ_LOCK(ifsq);
	if (running && ifsq_data_ready(ifsq)) {
		ALTQ_SQ_UNLOCK(ifsq);
		return 1;
	}

	/*
	 * ifnet.if_start subqueue interlock is released, if:
	 * 1) Hardware can not take any packets, due to
	 *    o  interface is marked down
	 *    o  hardware queue is full (ifsq_is_oactive)
	 *    Under the second situation, hardware interrupt
	 *    or polling(4) will call/schedule ifnet.if_start
	 *    on the subqueue when hardware queue is ready
	 * 2) There is no packet in the subqueue.
	 *    Further ifq_dispatch or ifq_handoff will call/
	 *    schedule ifnet.if_start on the subqueue.
	 * 3) TBR is used and it does not allow further
	 *    dequeueing.
	 *    TBR callout will call ifnet.if_start on the
	 *    subqueue.
	 */
	ifsq_clr_started(ifsq);
	ALTQ_SQ_UNLOCK(ifsq);
	return 0;
}
327 
328 static void
329 ifsq_ifstart_dispatch(netmsg_t msg)
330 {
331 	struct lwkt_msg *lmsg = &msg->base.lmsg;
332 	struct ifaltq_subque *ifsq = lmsg->u.ms_resultp;
333 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
334 	struct globaldata *gd = mycpu;
335 	int running = 0, need_sched;
336 
337 	crit_enter_gd(gd);
338 
339 	lwkt_replymsg(lmsg, 0);	/* reply ASAP */
340 
341 	if (gd->gd_cpuid != ifsq_get_cpuid(ifsq)) {
342 		/*
343 		 * We need to chase the subqueue owner CPU change.
344 		 */
345 		ifsq_ifstart_schedule(ifsq, 1);
346 		crit_exit_gd(gd);
347 		return;
348 	}
349 
350 	ifsq_serialize_hw(ifsq);
351 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
352 		ifp->if_start(ifp, ifsq);
353 		if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
354 			running = 1;
355 	}
356 	need_sched = ifsq_ifstart_need_schedule(ifsq, running);
357 	ifsq_deserialize_hw(ifsq);
358 
359 	if (need_sched) {
360 		/*
361 		 * More data need to be transmitted, ifnet.if_start is
362 		 * scheduled on the subqueue owner CPU, and we keep going.
363 		 * NOTE: ifnet.if_start subqueue interlock is not released.
364 		 */
365 		ifsq_ifstart_schedule(ifsq, 0);
366 	}
367 
368 	crit_exit_gd(gd);
369 }
370 
371 /* Device driver ifnet.if_start helper function */
372 void
373 ifsq_devstart(struct ifaltq_subque *ifsq)
374 {
375 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
376 	int running = 0;
377 
378 	ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq);
379 
380 	ALTQ_SQ_LOCK(ifsq);
381 	if (ifsq_is_started(ifsq) || !ifsq_data_ready(ifsq)) {
382 		ALTQ_SQ_UNLOCK(ifsq);
383 		return;
384 	}
385 	ifsq_set_started(ifsq);
386 	ALTQ_SQ_UNLOCK(ifsq);
387 
388 	ifp->if_start(ifp, ifsq);
389 
390 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
391 		running = 1;
392 
393 	if (ifsq_ifstart_need_schedule(ifsq, running)) {
394 		/*
395 		 * More data need to be transmitted, ifnet.if_start is
396 		 * scheduled on ifnet's CPU, and we keep going.
397 		 * NOTE: ifnet.if_start interlock is not released.
398 		 */
399 		ifsq_ifstart_schedule(ifsq, 0);
400 	}
401 }
402 
403 void
404 if_devstart(struct ifnet *ifp)
405 {
406 	ifsq_devstart(ifq_get_subq_default(&ifp->if_snd));
407 }
408 
/*
 * Device driver ifnet.if_start schedule helper function.
 *
 * Always schedules if_start for the subqueue immediately (force = 1),
 * bypassing the staging path in ifsq_ifstart_schedule().
 */
void
ifsq_devstart_sched(struct ifaltq_subque *ifsq)
{
	const int force = 1;

	ifsq_ifstart_schedule(ifsq, force);
}
415 
416 void
417 if_devstart_sched(struct ifnet *ifp)
418 {
419 	ifsq_devstart_sched(ifq_get_subq_default(&ifp->if_snd));
420 }
421 
422 static void
423 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
424 {
425 	lwkt_serialize_enter(ifp->if_serializer);
426 }
427 
428 static void
429 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
430 {
431 	lwkt_serialize_exit(ifp->if_serializer);
432 }
433 
434 static int
435 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
436 {
437 	return lwkt_serialize_try(ifp->if_serializer);
438 }
439 
#ifdef INVARIANTS
/*
 * Default if_serialize_assert method installed by if_attach() (INVARIANTS
 * kernels only): assert that the interface's serializer is held when
 * `serialized' is true, and that it is not held otherwise.  The serialize
 * class is ignored.
 */
static void
if_default_serialize_assert(struct ifnet *ifp,
			    enum ifnet_serialize slz __unused,
			    boolean_t serialized)
{
	if (!serialized) {
		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
		return;
	}
	ASSERT_SERIALIZED(ifp->if_serializer);
}
#endif
452 
453 /*
454  * Attach an interface to the list of "active" interfaces.
455  *
456  * The serializer is optional.
457  */
458 void
459 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
460 {
461 	unsigned socksize;
462 	int namelen, masklen;
463 	struct sockaddr_dl *sdl, *sdl_addr;
464 	struct ifaddr *ifa;
465 	struct ifaltq *ifq;
466 	struct ifnet **old_ifindex2ifnet = NULL;
467 	struct ifnet_array *old_ifnet_array;
468 	int i, q;
469 
470 	static int if_indexlim = 8;
471 
472 	if (ifp->if_serialize != NULL) {
473 		KASSERT(ifp->if_deserialize != NULL &&
474 			ifp->if_tryserialize != NULL &&
475 			ifp->if_serialize_assert != NULL,
476 			("serialize functions are partially setup"));
477 
478 		/*
479 		 * If the device supplies serialize functions,
480 		 * then clear if_serializer to catch any invalid
481 		 * usage of this field.
482 		 */
483 		KASSERT(serializer == NULL,
484 			("both serialize functions and default serializer "
485 			 "are supplied"));
486 		ifp->if_serializer = NULL;
487 	} else {
488 		KASSERT(ifp->if_deserialize == NULL &&
489 			ifp->if_tryserialize == NULL &&
490 			ifp->if_serialize_assert == NULL,
491 			("serialize functions are partially setup"));
492 		ifp->if_serialize = if_default_serialize;
493 		ifp->if_deserialize = if_default_deserialize;
494 		ifp->if_tryserialize = if_default_tryserialize;
495 #ifdef INVARIANTS
496 		ifp->if_serialize_assert = if_default_serialize_assert;
497 #endif
498 
499 		/*
500 		 * The serializer can be passed in from the device,
501 		 * allowing the same serializer to be used for both
502 		 * the interrupt interlock and the device queue.
503 		 * If not specified, the netif structure will use an
504 		 * embedded serializer.
505 		 */
506 		if (serializer == NULL) {
507 			serializer = &ifp->if_default_serializer;
508 			lwkt_serialize_init(serializer);
509 		}
510 		ifp->if_serializer = serializer;
511 	}
512 
513 	/*
514 	 * XXX -
515 	 * The old code would work if the interface passed a pre-existing
516 	 * chain of ifaddrs to this code.  We don't trust our callers to
517 	 * properly initialize the tailq, however, so we no longer allow
518 	 * this unlikely case.
519 	 */
520 	ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
521 				    M_IFADDR, M_WAITOK | M_ZERO);
522 	for (i = 0; i < ncpus; ++i)
523 		TAILQ_INIT(&ifp->if_addrheads[i]);
524 
525 	TAILQ_INIT(&ifp->if_multiaddrs);
526 	TAILQ_INIT(&ifp->if_groups);
527 	getmicrotime(&ifp->if_lastchange);
528 
529 	/*
530 	 * create a Link Level name for this device
531 	 */
532 	namelen = strlen(ifp->if_xname);
533 	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
534 	socksize = masklen + ifp->if_addrlen;
535 	if (socksize < sizeof(*sdl))
536 		socksize = sizeof(*sdl);
537 	socksize = RT_ROUNDUP(socksize);
538 	ifa = ifa_create(sizeof(struct ifaddr) + 2 * socksize);
539 	sdl = sdl_addr = (struct sockaddr_dl *)(ifa + 1);
540 	sdl->sdl_len = socksize;
541 	sdl->sdl_family = AF_LINK;
542 	bcopy(ifp->if_xname, sdl->sdl_data, namelen);
543 	sdl->sdl_nlen = namelen;
544 	sdl->sdl_type = ifp->if_type;
545 	ifp->if_lladdr = ifa;
546 	ifa->ifa_ifp = ifp;
547 	ifa->ifa_rtrequest = link_rtrequest;
548 	ifa->ifa_addr = (struct sockaddr *)sdl;
549 	sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
550 	ifa->ifa_netmask = (struct sockaddr *)sdl;
551 	sdl->sdl_len = masklen;
552 	while (namelen != 0)
553 		sdl->sdl_data[--namelen] = 0xff;
554 	ifa_iflink(ifa, ifp, 0 /* Insert head */);
555 
556 	ifp->if_data_pcpu = kmalloc_cachealign(
557 	    ncpus * sizeof(struct ifdata_pcpu), M_DEVBUF, M_WAITOK | M_ZERO);
558 
559 	if (ifp->if_mapsubq == NULL)
560 		ifp->if_mapsubq = ifq_mapsubq_default;
561 
562 	ifq = &ifp->if_snd;
563 	ifq->altq_type = 0;
564 	ifq->altq_disc = NULL;
565 	ifq->altq_flags &= ALTQF_CANTCHANGE;
566 	ifq->altq_tbr = NULL;
567 	ifq->altq_ifp = ifp;
568 
569 	if (ifq->altq_subq_cnt <= 0)
570 		ifq->altq_subq_cnt = 1;
571 	ifq->altq_subq = kmalloc_cachealign(
572 	    ifq->altq_subq_cnt * sizeof(struct ifaltq_subque),
573 	    M_DEVBUF, M_WAITOK | M_ZERO);
574 
575 	if (ifq->altq_maxlen == 0) {
576 		if_printf(ifp, "driver didn't set altq_maxlen\n");
577 		ifq_set_maxlen(ifq, ifqmaxlen);
578 	}
579 
580 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
581 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
582 
583 		ALTQ_SQ_LOCK_INIT(ifsq);
584 		ifsq->ifsq_index = q;
585 
586 		ifsq->ifsq_altq = ifq;
587 		ifsq->ifsq_ifp = ifp;
588 
589 		ifsq->ifsq_maxlen = ifq->altq_maxlen;
590 		ifsq->ifsq_maxbcnt = ifsq->ifsq_maxlen * MCLBYTES;
591 		ifsq->ifsq_prepended = NULL;
592 		ifsq->ifsq_started = 0;
593 		ifsq->ifsq_hw_oactive = 0;
594 		ifsq_set_cpuid(ifsq, 0);
595 		if (ifp->if_serializer != NULL)
596 			ifsq_set_hw_serialize(ifsq, ifp->if_serializer);
597 
598 		ifsq->ifsq_stage =
599 		    kmalloc_cachealign(ncpus * sizeof(struct ifsubq_stage),
600 		    M_DEVBUF, M_WAITOK | M_ZERO);
601 		for (i = 0; i < ncpus; ++i)
602 			ifsq->ifsq_stage[i].stg_subq = ifsq;
603 
604 		ifsq->ifsq_ifstart_nmsg =
605 		    kmalloc(ncpus * sizeof(struct netmsg_base),
606 		    M_LWKTMSG, M_WAITOK);
607 		for (i = 0; i < ncpus; ++i) {
608 			netmsg_init(&ifsq->ifsq_ifstart_nmsg[i], NULL,
609 			    &netisr_adone_rport, 0, ifsq_ifstart_dispatch);
610 			ifsq->ifsq_ifstart_nmsg[i].lmsg.u.ms_resultp = ifsq;
611 		}
612 	}
613 	ifq_set_classic(ifq);
614 
615 	/*
616 	 * Increase mbuf cluster/jcluster limits for the mbufs that
617 	 * could sit on the device queues for quite some time.
618 	 */
619 	if (ifp->if_nmbclusters > 0)
620 		mcl_inclimit(ifp->if_nmbclusters);
621 	if (ifp->if_nmbjclusters > 0)
622 		mjcl_inclimit(ifp->if_nmbjclusters);
623 
624 	/*
625 	 * Install this ifp into ifindex2inet, ifnet queue and ifnet
626 	 * array after it is setup.
627 	 *
628 	 * Protect ifindex2ifnet, ifnet queue and ifnet array changes
629 	 * by ifnet lock, so that non-netisr threads could get a
630 	 * consistent view.
631 	 */
632 	ifnet_lock();
633 
634 	/* Don't update if_index until ifindex2ifnet is setup */
635 	ifp->if_index = if_index + 1;
636 	sdl_addr->sdl_index = ifp->if_index;
637 
638 	/*
639 	 * Install this ifp into ifindex2ifnet
640 	 */
641 	if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) {
642 		unsigned int n;
643 		struct ifnet **q;
644 
645 		/*
646 		 * Grow ifindex2ifnet
647 		 */
648 		if_indexlim <<= 1;
649 		n = if_indexlim * sizeof(*q);
650 		q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
651 		if (ifindex2ifnet != NULL) {
652 			bcopy(ifindex2ifnet, q, n/2);
653 			/* Free old ifindex2ifnet after sync all netisrs */
654 			old_ifindex2ifnet = ifindex2ifnet;
655 		}
656 		ifindex2ifnet = q;
657 	}
658 	ifindex2ifnet[ifp->if_index] = ifp;
659 	/*
660 	 * Update if_index after this ifp is installed into ifindex2ifnet,
661 	 * so that netisrs could get a consistent view of ifindex2ifnet.
662 	 */
663 	cpu_sfence();
664 	if_index = ifp->if_index;
665 
666 	/*
667 	 * Install this ifp into ifnet array.
668 	 */
669 	/* Free old ifnet array after sync all netisrs */
670 	old_ifnet_array = ifnet_array;
671 	ifnet_array = ifnet_array_add(ifp, old_ifnet_array);
672 
673 	/*
674 	 * Install this ifp into ifnet queue.
675 	 */
676 	TAILQ_INSERT_TAIL(&ifnetlist, ifp, if_link);
677 
678 	ifnet_unlock();
679 
680 	/*
681 	 * Sync all netisrs so that the old ifindex2ifnet and ifnet array
682 	 * are no longer accessed and we can free them safely later on.
683 	 */
684 	netmsg_service_sync();
685 	if (old_ifindex2ifnet != NULL)
686 		kfree(old_ifindex2ifnet, M_IFADDR);
687 	ifnet_array_free(old_ifnet_array);
688 
689 	if (!SLIST_EMPTY(&domains))
690 		if_attachdomain1(ifp);
691 
692 	/* Announce the interface. */
693 	EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
694 	devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
695 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
696 }
697 
698 static void
699 if_attachdomain(void *dummy)
700 {
701 	struct ifnet *ifp;
702 
703 	ifnet_lock();
704 	TAILQ_FOREACH(ifp, &ifnetlist, if_list)
705 		if_attachdomain1(ifp);
706 	ifnet_unlock();
707 }
708 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
709 	if_attachdomain, NULL);
710 
711 static void
712 if_attachdomain1(struct ifnet *ifp)
713 {
714 	struct domain *dp;
715 
716 	crit_enter();
717 
718 	/* address family dependent data region */
719 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
720 	SLIST_FOREACH(dp, &domains, dom_next)
721 		if (dp->dom_ifattach)
722 			ifp->if_afdata[dp->dom_family] =
723 				(*dp->dom_ifattach)(ifp);
724 	crit_exit();
725 }
726 
727 /*
728  * Purge all addresses whose type is _not_ AF_LINK
729  */
730 static void
731 if_purgeaddrs_nolink_dispatch(netmsg_t nmsg)
732 {
733 	struct lwkt_msg *lmsg = &nmsg->lmsg;
734 	struct ifnet *ifp = lmsg->u.ms_resultp;
735 	struct ifaddr_container *ifac, *next;
736 
737 	ASSERT_IN_NETISR(0);
738 
739 	/*
740 	 * The ifaddr processing in the following loop will block,
741 	 * however, this function is called in netisr0, in which
742 	 * ifaddr list changes happen, so we don't care about the
743 	 * blockness of the ifaddr processing here.
744 	 */
745 	TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
746 			      ifa_link, next) {
747 		struct ifaddr *ifa = ifac->ifa;
748 
749 		/* Ignore marker */
750 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
751 			continue;
752 
753 		/* Leave link ifaddr as it is */
754 		if (ifa->ifa_addr->sa_family == AF_LINK)
755 			continue;
756 #ifdef INET
757 		/* XXX: Ugly!! ad hoc just for INET */
758 		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
759 			struct ifaliasreq ifr;
760 #ifdef IFADDR_DEBUG_VERBOSE
761 			int i;
762 
763 			kprintf("purge in4 addr %p: ", ifa);
764 			for (i = 0; i < ncpus; ++i)
765 				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
766 			kprintf("\n");
767 #endif
768 
769 			bzero(&ifr, sizeof ifr);
770 			ifr.ifra_addr = *ifa->ifa_addr;
771 			if (ifa->ifa_dstaddr)
772 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
773 			if (in_control(SIOCDIFADDR, (caddr_t)&ifr, ifp,
774 				       NULL) == 0)
775 				continue;
776 		}
777 #endif /* INET */
778 #ifdef INET6
779 		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
780 #ifdef IFADDR_DEBUG_VERBOSE
781 			int i;
782 
783 			kprintf("purge in6 addr %p: ", ifa);
784 			for (i = 0; i < ncpus; ++i)
785 				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
786 			kprintf("\n");
787 #endif
788 
789 			in6_purgeaddr(ifa);
790 			/* ifp_addrhead is already updated */
791 			continue;
792 		}
793 #endif /* INET6 */
794 		ifa_ifunlink(ifa, ifp);
795 		ifa_destroy(ifa);
796 	}
797 
798 	lwkt_replymsg(lmsg, 0);
799 }
800 
801 void
802 if_purgeaddrs_nolink(struct ifnet *ifp)
803 {
804 	struct netmsg_base nmsg;
805 	struct lwkt_msg *lmsg = &nmsg.lmsg;
806 
807 	ASSERT_CANDOMSG_NETISR0(curthread);
808 
809 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, 0,
810 	    if_purgeaddrs_nolink_dispatch);
811 	lmsg->u.ms_resultp = ifp;
812 	lwkt_domsg(netisr_cpuport(0), lmsg, 0);
813 }
814 
815 static void
816 ifq_stage_detach_handler(netmsg_t nmsg)
817 {
818 	struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
819 	int q;
820 
821 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
822 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
823 		struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
824 
825 		if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED)
826 			ifsq_stage_remove(&ifsubq_stage_heads[mycpuid], stage);
827 	}
828 	lwkt_replymsg(&nmsg->lmsg, 0);
829 }
830 
831 static void
832 ifq_stage_detach(struct ifaltq *ifq)
833 {
834 	struct netmsg_base base;
835 	int cpu;
836 
837 	netmsg_init(&base, NULL, &curthread->td_msgport, 0,
838 	    ifq_stage_detach_handler);
839 	base.lmsg.u.ms_resultp = ifq;
840 
841 	for (cpu = 0; cpu < ncpus; ++cpu)
842 		lwkt_domsg(netisr_cpuport(cpu), &base.lmsg, 0);
843 }
844 
/*
 * Message used by if_detach() to have if_rtdel_dispatch() walk the routing
 * tables on each CPU on behalf of one interface.
 */
struct netmsg_if_rtdel {
	struct netmsg_base	base;	/* embedded netmsg header, first member */
	struct ifnet		*ifp;	/* passed to if_rtdel() as the walk argument */
};
849 
850 static void
851 if_rtdel_dispatch(netmsg_t msg)
852 {
853 	struct netmsg_if_rtdel *rmsg = (void *)msg;
854 	int i, nextcpu, cpu;
855 
856 	cpu = mycpuid;
857 	for (i = 1; i <= AF_MAX; i++) {
858 		struct radix_node_head	*rnh;
859 
860 		if ((rnh = rt_tables[cpu][i]) == NULL)
861 			continue;
862 		rnh->rnh_walktree(rnh, if_rtdel, rmsg->ifp);
863 	}
864 
865 	nextcpu = cpu + 1;
866 	if (nextcpu < ncpus)
867 		lwkt_forwardmsg(netisr_cpuport(nextcpu), &rmsg->base.lmsg);
868 	else
869 		lwkt_replymsg(&rmsg->base.lmsg, 0);
870 }
871 
872 /*
873  * Detach an interface, removing it from the
874  * list of "active" interfaces.
875  */
876 void
877 if_detach(struct ifnet *ifp)
878 {
879 	struct ifnet_array *old_ifnet_array;
880 	struct netmsg_if_rtdel msg;
881 	struct domain *dp;
882 	int q;
883 
884 	/* Announce that the interface is gone. */
885 	EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
886 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
887 	devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
888 
889 	/*
890 	 * Remove this ifp from ifindex2inet, ifnet queue and ifnet
891 	 * array before it is whacked.
892 	 *
893 	 * Protect ifindex2ifnet, ifnet queue and ifnet array changes
894 	 * by ifnet lock, so that non-netisr threads could get a
895 	 * consistent view.
896 	 */
897 	ifnet_lock();
898 
899 	/*
900 	 * Remove this ifp from ifindex2ifnet and maybe decrement if_index.
901 	 */
902 	ifindex2ifnet[ifp->if_index] = NULL;
903 	while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
904 		if_index--;
905 
906 	/*
907 	 * Remove this ifp from ifnet queue.
908 	 */
909 	TAILQ_REMOVE(&ifnetlist, ifp, if_link);
910 
911 	/*
912 	 * Remove this ifp from ifnet array.
913 	 */
914 	/* Free old ifnet array after sync all netisrs */
915 	old_ifnet_array = ifnet_array;
916 	ifnet_array = ifnet_array_del(ifp, old_ifnet_array);
917 
918 	ifnet_unlock();
919 
920 	/*
921 	 * Sync all netisrs so that the old ifnet array is no longer
922 	 * accessed and we can free it safely later on.
923 	 */
924 	netmsg_service_sync();
925 	ifnet_array_free(old_ifnet_array);
926 
927 	/*
928 	 * Remove routes and flush queues.
929 	 */
930 	crit_enter();
931 #ifdef IFPOLL_ENABLE
932 	if (ifp->if_flags & IFF_NPOLLING)
933 		ifpoll_deregister(ifp);
934 #endif
935 	if_down(ifp);
936 
937 	/* Decrease the mbuf clusters/jclusters limits increased by us */
938 	if (ifp->if_nmbclusters > 0)
939 		mcl_inclimit(-ifp->if_nmbclusters);
940 	if (ifp->if_nmbjclusters > 0)
941 		mjcl_inclimit(-ifp->if_nmbjclusters);
942 
943 #ifdef ALTQ
944 	if (ifq_is_enabled(&ifp->if_snd))
945 		altq_disable(&ifp->if_snd);
946 	if (ifq_is_attached(&ifp->if_snd))
947 		altq_detach(&ifp->if_snd);
948 #endif
949 
950 	/*
951 	 * Clean up all addresses.
952 	 */
953 	ifp->if_lladdr = NULL;
954 
955 	if_purgeaddrs_nolink(ifp);
956 	if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
957 		struct ifaddr *ifa;
958 
959 		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
960 		KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
961 			("non-link ifaddr is left on if_addrheads"));
962 
963 		ifa_ifunlink(ifa, ifp);
964 		ifa_destroy(ifa);
965 		KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
966 			("there are still ifaddrs left on if_addrheads"));
967 	}
968 
969 #ifdef INET
970 	/*
971 	 * Remove all IPv4 kernel structures related to ifp.
972 	 */
973 	in_ifdetach(ifp);
974 #endif
975 
976 #ifdef INET6
977 	/*
978 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
979 	 * before removing routing entries below, since IPv6 interface direct
980 	 * routes are expected to be removed by the IPv6-specific kernel API.
981 	 * Otherwise, the kernel will detect some inconsistency and bark it.
982 	 */
983 	in6_ifdetach(ifp);
984 #endif
985 
986 	/*
987 	 * Delete all remaining routes using this interface
988 	 */
989 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
990 	    if_rtdel_dispatch);
991 	msg.ifp = ifp;
992 	rt_domsg_global(&msg.base);
993 
994 	SLIST_FOREACH(dp, &domains, dom_next)
995 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
996 			(*dp->dom_ifdetach)(ifp,
997 				ifp->if_afdata[dp->dom_family]);
998 
999 	kfree(ifp->if_addrheads, M_IFADDR);
1000 
1001 	lwkt_synchronize_ipiqs("if_detach");
1002 	ifq_stage_detach(&ifp->if_snd);
1003 
1004 	for (q = 0; q < ifp->if_snd.altq_subq_cnt; ++q) {
1005 		struct ifaltq_subque *ifsq = &ifp->if_snd.altq_subq[q];
1006 
1007 		kfree(ifsq->ifsq_ifstart_nmsg, M_LWKTMSG);
1008 		kfree(ifsq->ifsq_stage, M_DEVBUF);
1009 	}
1010 	kfree(ifp->if_snd.altq_subq, M_DEVBUF);
1011 
1012 	kfree(ifp->if_data_pcpu, M_DEVBUF);
1013 
1014 	crit_exit();
1015 }
1016 
1017 /*
1018  * Create interface group without members
1019  */
1020 struct ifg_group *
1021 if_creategroup(const char *groupname)
1022 {
1023         struct ifg_group        *ifg = NULL;
1024 
1025         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
1026             M_TEMP, M_NOWAIT)) == NULL)
1027                 return (NULL);
1028 
1029         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
1030         ifg->ifg_refcnt = 0;
1031         ifg->ifg_carp_demoted = 0;
1032         TAILQ_INIT(&ifg->ifg_members);
1033 #if NPF > 0
1034         pfi_attach_ifgroup(ifg);
1035 #endif
1036         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
1037 
1038         return (ifg);
1039 }
1040 
1041 /*
1042  * Add a group to an interface
1043  */
1044 int
1045 if_addgroup(struct ifnet *ifp, const char *groupname)
1046 {
1047 	struct ifg_list		*ifgl;
1048 	struct ifg_group	*ifg = NULL;
1049 	struct ifg_member	*ifgm;
1050 
1051 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
1052 	    groupname[strlen(groupname) - 1] <= '9')
1053 		return (EINVAL);
1054 
1055 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1056 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
1057 			return (EEXIST);
1058 
1059 	if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
1060 		return (ENOMEM);
1061 
1062 	if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
1063 		kfree(ifgl, M_TEMP);
1064 		return (ENOMEM);
1065 	}
1066 
1067 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1068 		if (!strcmp(ifg->ifg_group, groupname))
1069 			break;
1070 
1071 	if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
1072 		kfree(ifgl, M_TEMP);
1073 		kfree(ifgm, M_TEMP);
1074 		return (ENOMEM);
1075 	}
1076 
1077 	ifg->ifg_refcnt++;
1078 	ifgl->ifgl_group = ifg;
1079 	ifgm->ifgm_ifp = ifp;
1080 
1081 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
1082 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
1083 
1084 #if NPF > 0
1085 	pfi_group_change(groupname);
1086 #endif
1087 
1088 	return (0);
1089 }
1090 
1091 /*
1092  * Remove a group from an interface
1093  */
1094 int
1095 if_delgroup(struct ifnet *ifp, const char *groupname)
1096 {
1097 	struct ifg_list		*ifgl;
1098 	struct ifg_member	*ifgm;
1099 
1100 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1101 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
1102 			break;
1103 	if (ifgl == NULL)
1104 		return (ENOENT);
1105 
1106 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
1107 
1108 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
1109 		if (ifgm->ifgm_ifp == ifp)
1110 			break;
1111 
1112 	if (ifgm != NULL) {
1113 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
1114 		kfree(ifgm, M_TEMP);
1115 	}
1116 
1117 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
1118 		TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
1119 #if NPF > 0
1120 		pfi_detach_ifgroup(ifgl->ifgl_group);
1121 #endif
1122 		kfree(ifgl->ifgl_group, M_TEMP);
1123 	}
1124 
1125 	kfree(ifgl, M_TEMP);
1126 
1127 #if NPF > 0
1128 	pfi_group_change(groupname);
1129 #endif
1130 
1131 	return (0);
1132 }
1133 
1134 /*
1135  * Stores all groups from an interface in memory pointed
1136  * to by data
1137  */
1138 int
1139 if_getgroup(caddr_t data, struct ifnet *ifp)
1140 {
1141 	int			 len, error;
1142 	struct ifg_list		*ifgl;
1143 	struct ifg_req		 ifgrq, *ifgp;
1144 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1145 
1146 	if (ifgr->ifgr_len == 0) {
1147 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1148 			ifgr->ifgr_len += sizeof(struct ifg_req);
1149 		return (0);
1150 	}
1151 
1152 	len = ifgr->ifgr_len;
1153 	ifgp = ifgr->ifgr_groups;
1154 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1155 		if (len < sizeof(ifgrq))
1156 			return (EINVAL);
1157 		bzero(&ifgrq, sizeof ifgrq);
1158 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1159 		    sizeof(ifgrq.ifgrq_group));
1160 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1161 		    sizeof(struct ifg_req))))
1162 			return (error);
1163 		len -= sizeof(ifgrq);
1164 		ifgp++;
1165 	}
1166 
1167 	return (0);
1168 }
1169 
1170 /*
1171  * Stores all members of a group in memory pointed to by data
1172  */
1173 int
1174 if_getgroupmembers(caddr_t data)
1175 {
1176 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1177 	struct ifg_group	*ifg;
1178 	struct ifg_member	*ifgm;
1179 	struct ifg_req		 ifgrq, *ifgp;
1180 	int			 len, error;
1181 
1182 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1183 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1184 			break;
1185 	if (ifg == NULL)
1186 		return (ENOENT);
1187 
1188 	if (ifgr->ifgr_len == 0) {
1189 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1190 			ifgr->ifgr_len += sizeof(ifgrq);
1191 		return (0);
1192 	}
1193 
1194 	len = ifgr->ifgr_len;
1195 	ifgp = ifgr->ifgr_groups;
1196 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1197 		if (len < sizeof(ifgrq))
1198 			return (EINVAL);
1199 		bzero(&ifgrq, sizeof ifgrq);
1200 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1201 		    sizeof(ifgrq.ifgrq_member));
1202 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1203 		    sizeof(struct ifg_req))))
1204 			return (error);
1205 		len -= sizeof(ifgrq);
1206 		ifgp++;
1207 	}
1208 
1209 	return (0);
1210 }
1211 
1212 /*
1213  * Delete Routes for a Network Interface
1214  *
1215  * Called for each routing entry via the rnh->rnh_walktree() call above
1216  * to delete all route entries referencing a detaching network interface.
1217  *
1218  * Arguments:
1219  *	rn	pointer to node in the routing table
1220  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
1221  *
1222  * Returns:
1223  *	0	successful
1224  *	errno	failed - reason indicated
1225  *
1226  */
1227 static int
1228 if_rtdel(struct radix_node *rn, void *arg)
1229 {
1230 	struct rtentry	*rt = (struct rtentry *)rn;
1231 	struct ifnet	*ifp = arg;
1232 	int		err;
1233 
1234 	if (rt->rt_ifp == ifp) {
1235 
1236 		/*
1237 		 * Protect (sorta) against walktree recursion problems
1238 		 * with cloned routes
1239 		 */
1240 		if (!(rt->rt_flags & RTF_UP))
1241 			return (0);
1242 
1243 		err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1244 				rt_mask(rt), rt->rt_flags,
1245 				NULL);
1246 		if (err) {
1247 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
1248 		}
1249 	}
1250 
1251 	return (0);
1252 }
1253 
1254 static __inline boolean_t
1255 ifa_prefer(const struct ifaddr *cur_ifa, const struct ifaddr *old_ifa)
1256 {
1257 	if (old_ifa == NULL)
1258 		return TRUE;
1259 
1260 	if ((old_ifa->ifa_ifp->if_flags & IFF_UP) == 0 &&
1261 	    (cur_ifa->ifa_ifp->if_flags & IFF_UP))
1262 		return TRUE;
1263 	if ((old_ifa->ifa_flags & IFA_ROUTE) == 0 &&
1264 	    (cur_ifa->ifa_flags & IFA_ROUTE))
1265 		return TRUE;
1266 	return FALSE;
1267 }
1268 
1269 /*
1270  * Locate an interface based on a complete address.
1271  */
1272 struct ifaddr *
1273 ifa_ifwithaddr(struct sockaddr *addr)
1274 {
1275 	const struct ifnet_array *arr;
1276 	int i;
1277 
1278 	arr = ifnet_array_get();
1279 	for (i = 0; i < arr->ifnet_count; ++i) {
1280 		struct ifnet *ifp = arr->ifnet_arr[i];
1281 		struct ifaddr_container *ifac;
1282 
1283 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1284 			struct ifaddr *ifa = ifac->ifa;
1285 
1286 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1287 				continue;
1288 			if (sa_equal(addr, ifa->ifa_addr))
1289 				return (ifa);
1290 			if ((ifp->if_flags & IFF_BROADCAST) &&
1291 			    ifa->ifa_broadaddr &&
1292 			    /* IPv6 doesn't have broadcast */
1293 			    ifa->ifa_broadaddr->sa_len != 0 &&
1294 			    sa_equal(ifa->ifa_broadaddr, addr))
1295 				return (ifa);
1296 		}
1297 	}
1298 	return (NULL);
1299 }
1300 
1301 /*
1302  * Locate the point to point interface with a given destination address.
1303  */
1304 struct ifaddr *
1305 ifa_ifwithdstaddr(struct sockaddr *addr)
1306 {
1307 	const struct ifnet_array *arr;
1308 	int i;
1309 
1310 	arr = ifnet_array_get();
1311 	for (i = 0; i < arr->ifnet_count; ++i) {
1312 		struct ifnet *ifp = arr->ifnet_arr[i];
1313 		struct ifaddr_container *ifac;
1314 
1315 		if (!(ifp->if_flags & IFF_POINTOPOINT))
1316 			continue;
1317 
1318 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1319 			struct ifaddr *ifa = ifac->ifa;
1320 
1321 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1322 				continue;
1323 			if (ifa->ifa_dstaddr &&
1324 			    sa_equal(addr, ifa->ifa_dstaddr))
1325 				return (ifa);
1326 		}
1327 	}
1328 	return (NULL);
1329 }
1330 
1331 /*
1332  * Find an interface on a specific network.  If many, choice
1333  * is most specific found.
1334  */
1335 struct ifaddr *
1336 ifa_ifwithnet(struct sockaddr *addr)
1337 {
1338 	struct ifaddr *ifa_maybe = NULL;
1339 	u_int af = addr->sa_family;
1340 	char *addr_data = addr->sa_data, *cplim;
1341 	const struct ifnet_array *arr;
1342 	int i;
1343 
1344 	/*
1345 	 * AF_LINK addresses can be looked up directly by their index number,
1346 	 * so do that if we can.
1347 	 */
1348 	if (af == AF_LINK) {
1349 		struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1350 
1351 		if (sdl->sdl_index && sdl->sdl_index <= if_index)
1352 			return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1353 	}
1354 
1355 	/*
1356 	 * Scan though each interface, looking for ones that have
1357 	 * addresses in this address family.
1358 	 */
1359 	arr = ifnet_array_get();
1360 	for (i = 0; i < arr->ifnet_count; ++i) {
1361 		struct ifnet *ifp = arr->ifnet_arr[i];
1362 		struct ifaddr_container *ifac;
1363 
1364 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1365 			struct ifaddr *ifa = ifac->ifa;
1366 			char *cp, *cp2, *cp3;
1367 
1368 			if (ifa->ifa_addr->sa_family != af)
1369 next:				continue;
1370 			if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1371 				/*
1372 				 * This is a bit broken as it doesn't
1373 				 * take into account that the remote end may
1374 				 * be a single node in the network we are
1375 				 * looking for.
1376 				 * The trouble is that we don't know the
1377 				 * netmask for the remote end.
1378 				 */
1379 				if (ifa->ifa_dstaddr != NULL &&
1380 				    sa_equal(addr, ifa->ifa_dstaddr))
1381 					return (ifa);
1382 			} else {
1383 				/*
1384 				 * if we have a special address handler,
1385 				 * then use it instead of the generic one.
1386 				 */
1387 				if (ifa->ifa_claim_addr) {
1388 					if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1389 						return (ifa);
1390 					} else {
1391 						continue;
1392 					}
1393 				}
1394 
1395 				/*
1396 				 * Scan all the bits in the ifa's address.
1397 				 * If a bit dissagrees with what we are
1398 				 * looking for, mask it with the netmask
1399 				 * to see if it really matters.
1400 				 * (A byte at a time)
1401 				 */
1402 				if (ifa->ifa_netmask == 0)
1403 					continue;
1404 				cp = addr_data;
1405 				cp2 = ifa->ifa_addr->sa_data;
1406 				cp3 = ifa->ifa_netmask->sa_data;
1407 				cplim = ifa->ifa_netmask->sa_len +
1408 					(char *)ifa->ifa_netmask;
1409 				while (cp3 < cplim)
1410 					if ((*cp++ ^ *cp2++) & *cp3++)
1411 						goto next; /* next address! */
1412 				/*
1413 				 * If the netmask of what we just found
1414 				 * is more specific than what we had before
1415 				 * (if we had one) then remember the new one
1416 				 * before continuing to search for an even
1417 				 * better one.  If the netmasks are equal,
1418 				 * we prefer the this ifa based on the result
1419 				 * of ifa_prefer().
1420 				 */
1421 				if (ifa_maybe == NULL ||
1422 				    rn_refines((char *)ifa->ifa_netmask,
1423 				        (char *)ifa_maybe->ifa_netmask) ||
1424 				    (sa_equal(ifa_maybe->ifa_netmask,
1425 				        ifa->ifa_netmask) &&
1426 				     ifa_prefer(ifa, ifa_maybe)))
1427 					ifa_maybe = ifa;
1428 			}
1429 		}
1430 	}
1431 	return (ifa_maybe);
1432 }
1433 
1434 /*
1435  * Find an interface address specific to an interface best matching
1436  * a given address.
1437  */
1438 struct ifaddr *
1439 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1440 {
1441 	struct ifaddr_container *ifac;
1442 	char *cp, *cp2, *cp3;
1443 	char *cplim;
1444 	struct ifaddr *ifa_maybe = NULL;
1445 	u_int af = addr->sa_family;
1446 
1447 	if (af >= AF_MAX)
1448 		return (0);
1449 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1450 		struct ifaddr *ifa = ifac->ifa;
1451 
1452 		if (ifa->ifa_addr->sa_family != af)
1453 			continue;
1454 		if (ifa_maybe == NULL)
1455 			ifa_maybe = ifa;
1456 		if (ifa->ifa_netmask == NULL) {
1457 			if (sa_equal(addr, ifa->ifa_addr) ||
1458 			    (ifa->ifa_dstaddr != NULL &&
1459 			     sa_equal(addr, ifa->ifa_dstaddr)))
1460 				return (ifa);
1461 			continue;
1462 		}
1463 		if (ifp->if_flags & IFF_POINTOPOINT) {
1464 			if (sa_equal(addr, ifa->ifa_dstaddr))
1465 				return (ifa);
1466 		} else {
1467 			cp = addr->sa_data;
1468 			cp2 = ifa->ifa_addr->sa_data;
1469 			cp3 = ifa->ifa_netmask->sa_data;
1470 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1471 			for (; cp3 < cplim; cp3++)
1472 				if ((*cp++ ^ *cp2++) & *cp3)
1473 					break;
1474 			if (cp3 == cplim)
1475 				return (ifa);
1476 		}
1477 	}
1478 	return (ifa_maybe);
1479 }
1480 
1481 /*
1482  * Default action when installing a route with a Link Level gateway.
1483  * Lookup an appropriate real ifa to point to.
1484  * This should be moved to /sys/net/link.c eventually.
1485  */
1486 static void
1487 link_rtrequest(int cmd, struct rtentry *rt)
1488 {
1489 	struct ifaddr *ifa;
1490 	struct sockaddr *dst;
1491 	struct ifnet *ifp;
1492 
1493 	if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1494 	    (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1495 		return;
1496 	ifa = ifaof_ifpforaddr(dst, ifp);
1497 	if (ifa != NULL) {
1498 		IFAFREE(rt->rt_ifa);
1499 		IFAREF(ifa);
1500 		rt->rt_ifa = ifa;
1501 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1502 			ifa->ifa_rtrequest(cmd, rt);
1503 	}
1504 }
1505 
/*
 * Message used by if_route()/if_unroute() to hand an interface flag
 * change to the corresponding dispatch function.
 */
struct netmsg_ifroute {
	struct netmsg_base	base;	/* message header, set up by netmsg_init() */
	struct ifnet		*ifp;	/* interface being changed */
	int			flag;	/* if_flags bits to set (route) or clear (unroute) */
	int			fam;	/* only notify addresses of this family; PF_UNSPEC = all */
};
1512 
1513 /*
1514  * Mark an interface down and notify protocols of the transition.
1515  */
1516 static void
1517 if_unroute_dispatch(netmsg_t nmsg)
1518 {
1519 	struct netmsg_ifroute *msg = (struct netmsg_ifroute *)nmsg;
1520 	struct ifnet *ifp = msg->ifp;
1521 	int flag = msg->flag, fam = msg->fam;
1522 	struct ifaddr_container *ifac;
1523 
1524 	ifp->if_flags &= ~flag;
1525 	getmicrotime(&ifp->if_lastchange);
1526 	/*
1527 	 * The ifaddr processing in the following loop will block,
1528 	 * however, this function is called in netisr0, in which
1529 	 * ifaddr list changes happen, so we don't care about the
1530 	 * blockness of the ifaddr processing here.
1531 	 */
1532 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1533 		struct ifaddr *ifa = ifac->ifa;
1534 
1535 		/* Ignore marker */
1536 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
1537 			continue;
1538 
1539 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1540 			kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1541 	}
1542 	ifq_purge_all(&ifp->if_snd);
1543 	rt_ifmsg(ifp);
1544 
1545 	lwkt_replymsg(&nmsg->lmsg, 0);
1546 }
1547 
1548 void
1549 if_unroute(struct ifnet *ifp, int flag, int fam)
1550 {
1551 	struct netmsg_ifroute msg;
1552 
1553 	ASSERT_CANDOMSG_NETISR0(curthread);
1554 
1555 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, 0,
1556 	    if_unroute_dispatch);
1557 	msg.ifp = ifp;
1558 	msg.flag = flag;
1559 	msg.fam = fam;
1560 	lwkt_domsg(netisr_cpuport(0), &msg.base.lmsg, 0);
1561 }
1562 
1563 /*
1564  * Mark an interface up and notify protocols of the transition.
1565  */
1566 static void
1567 if_route_dispatch(netmsg_t nmsg)
1568 {
1569 	struct netmsg_ifroute *msg = (struct netmsg_ifroute *)nmsg;
1570 	struct ifnet *ifp = msg->ifp;
1571 	int flag = msg->flag, fam = msg->fam;
1572 	struct ifaddr_container *ifac;
1573 
1574 	ifq_purge_all(&ifp->if_snd);
1575 	ifp->if_flags |= flag;
1576 	getmicrotime(&ifp->if_lastchange);
1577 	/*
1578 	 * The ifaddr processing in the following loop will block,
1579 	 * however, this function is called in netisr0, in which
1580 	 * ifaddr list changes happen, so we don't care about the
1581 	 * blockness of the ifaddr processing here.
1582 	 */
1583 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1584 		struct ifaddr *ifa = ifac->ifa;
1585 
1586 		/* Ignore marker */
1587 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
1588 			continue;
1589 
1590 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1591 			kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1592 	}
1593 	rt_ifmsg(ifp);
1594 #ifdef INET6
1595 	in6_if_up(ifp);
1596 #endif
1597 
1598 	lwkt_replymsg(&nmsg->lmsg, 0);
1599 }
1600 
1601 void
1602 if_route(struct ifnet *ifp, int flag, int fam)
1603 {
1604 	struct netmsg_ifroute msg;
1605 
1606 	ASSERT_CANDOMSG_NETISR0(curthread);
1607 
1608 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, 0,
1609 	    if_route_dispatch);
1610 	msg.ifp = ifp;
1611 	msg.flag = flag;
1612 	msg.fam = fam;
1613 	lwkt_domsg(netisr_cpuport(0), &msg.base.lmsg, 0);
1614 }
1615 
/*
 * Mark an interface down and notify protocols of the transition.  An
 * interface going down is also considered to be a synchronizing event.
 * We must ensure that all packet processing related to the interface
 * has completed before we return so e.g. the caller can free the ifnet
 * structure that the mbufs may be referencing.
 *
 * NOTE: must be called at splnet or equivalent.
 */
void
if_down(struct ifnet *ifp)
{
	/* Clear IFF_UP and notify the protocols of every address family. */
	if_unroute(ifp, IFF_UP, AF_UNSPEC);
	/* The synchronizing step described above. */
	netmsg_service_sync();
}
1631 
/*
 * Mark an interface up and notify protocols of
 * the transition.
 * NOTE: must be called at splnet or equivalent.
 */
void
if_up(struct ifnet *ifp)
{
	/* Set IFF_UP and notify the protocols of every address family. */
	if_route(ifp, IFF_UP, AF_UNSPEC);
}
1642 
1643 /*
1644  * Process a link state change.
1645  * NOTE: must be called at splsoftnet or equivalent.
1646  */
1647 void
1648 if_link_state_change(struct ifnet *ifp)
1649 {
1650 	int link_state = ifp->if_link_state;
1651 
1652 	rt_ifmsg(ifp);
1653 	devctl_notify("IFNET", ifp->if_xname,
1654 	    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1655 }
1656 
1657 /*
1658  * Handle interface watchdog timer routines.  Called
1659  * from softclock, we decrement timers (if set) and
1660  * call the appropriate interface routine on expiration.
1661  */
1662 static void
1663 if_slowtimo_dispatch(netmsg_t nmsg)
1664 {
1665 	struct globaldata *gd = mycpu;
1666 	const struct ifnet_array *arr;
1667 	int i;
1668 
1669 	ASSERT_IN_NETISR(0);
1670 
1671 	crit_enter_gd(gd);
1672 	lwkt_replymsg(&nmsg->lmsg, 0);  /* reply ASAP */
1673 	crit_exit_gd(gd);
1674 
1675 	arr = ifnet_array_get();
1676 	for (i = 0; i < arr->ifnet_count; ++i) {
1677 		struct ifnet *ifp = arr->ifnet_arr[i];
1678 
1679 		crit_enter_gd(gd);
1680 
1681 		if (if_stats_compat) {
1682 			IFNET_STAT_GET(ifp, ipackets, ifp->if_ipackets);
1683 			IFNET_STAT_GET(ifp, ierrors, ifp->if_ierrors);
1684 			IFNET_STAT_GET(ifp, opackets, ifp->if_opackets);
1685 			IFNET_STAT_GET(ifp, oerrors, ifp->if_oerrors);
1686 			IFNET_STAT_GET(ifp, collisions, ifp->if_collisions);
1687 			IFNET_STAT_GET(ifp, ibytes, ifp->if_ibytes);
1688 			IFNET_STAT_GET(ifp, obytes, ifp->if_obytes);
1689 			IFNET_STAT_GET(ifp, imcasts, ifp->if_imcasts);
1690 			IFNET_STAT_GET(ifp, omcasts, ifp->if_omcasts);
1691 			IFNET_STAT_GET(ifp, iqdrops, ifp->if_iqdrops);
1692 			IFNET_STAT_GET(ifp, noproto, ifp->if_noproto);
1693 			IFNET_STAT_GET(ifp, oqdrops, ifp->if_oqdrops);
1694 		}
1695 
1696 		if (ifp->if_timer == 0 || --ifp->if_timer) {
1697 			crit_exit_gd(gd);
1698 			continue;
1699 		}
1700 		if (ifp->if_watchdog) {
1701 			if (ifnet_tryserialize_all(ifp)) {
1702 				(*ifp->if_watchdog)(ifp);
1703 				ifnet_deserialize_all(ifp);
1704 			} else {
1705 				/* try again next timeout */
1706 				++ifp->if_timer;
1707 			}
1708 		}
1709 
1710 		crit_exit_gd(gd);
1711 	}
1712 
1713 	callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1714 }
1715 
1716 static void
1717 if_slowtimo(void *arg __unused)
1718 {
1719 	struct lwkt_msg *lmsg = &if_slowtimo_netmsg.lmsg;
1720 
1721 	KASSERT(mycpuid == 0, ("not on cpu0"));
1722 	crit_enter();
1723 	if (lmsg->ms_flags & MSGF_DONE)
1724 		lwkt_sendmsg_oncpu(netisr_cpuport(0), lmsg);
1725 	crit_exit();
1726 }
1727 
1728 /*
1729  * Map interface name to
1730  * interface structure pointer.
1731  */
1732 struct ifnet *
1733 ifunit(const char *name)
1734 {
1735 	struct ifnet *ifp;
1736 
1737 	/*
1738 	 * Search all the interfaces for this name/number
1739 	 */
1740 	KASSERT(mtx_owned(&ifnet_mtx), ("ifnet is not locked"));
1741 
1742 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
1743 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1744 			break;
1745 	}
1746 	return (ifp);
1747 }
1748 
1749 struct ifnet *
1750 ifunit_netisr(const char *name)
1751 {
1752 	const struct ifnet_array *arr;
1753 	int i;
1754 
1755 	/*
1756 	 * Search all the interfaces for this name/number
1757 	 */
1758 
1759 	arr = ifnet_array_get();
1760 	for (i = 0; i < arr->ifnet_count; ++i) {
1761 		struct ifnet *ifp = arr->ifnet_arr[i];
1762 
1763 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1764 			return ifp;
1765 	}
1766 	return NULL;
1767 }
1768 
1769 /*
1770  * Interface ioctls.
1771  */
1772 int
1773 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1774 {
1775 	struct ifnet *ifp;
1776 	struct ifreq *ifr;
1777 	struct ifstat *ifs;
1778 	int error, do_ifup = 0;
1779 	short oif_flags;
1780 	int new_flags;
1781 	size_t namelen, onamelen;
1782 	char new_name[IFNAMSIZ];
1783 	struct ifaddr *ifa;
1784 	struct sockaddr_dl *sdl;
1785 
1786 	switch (cmd) {
1787 	case SIOCGIFCONF:
1788 	case OSIOCGIFCONF:
1789 		return (ifconf(cmd, data, cred));
1790 	default:
1791 		break;
1792 	}
1793 
1794 	ifr = (struct ifreq *)data;
1795 
1796 	switch (cmd) {
1797 	case SIOCIFCREATE:
1798 	case SIOCIFCREATE2:
1799 		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1800 			return (error);
1801 		return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1802 		    	cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1803 	case SIOCIFDESTROY:
1804 		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1805 			return (error);
1806 		return (if_clone_destroy(ifr->ifr_name));
1807 	case SIOCIFGCLONERS:
1808 		return (if_clone_list((struct if_clonereq *)data));
1809 	default:
1810 		break;
1811 	}
1812 
1813 	/*
1814 	 * Nominal ioctl through interface, lookup the ifp and obtain a
1815 	 * lock to serialize the ifconfig ioctl operation.
1816 	 */
1817 	ifnet_lock();
1818 
1819 	ifp = ifunit(ifr->ifr_name);
1820 	if (ifp == NULL) {
1821 		ifnet_unlock();
1822 		return (ENXIO);
1823 	}
1824 	error = 0;
1825 
1826 	switch (cmd) {
1827 	case SIOCGIFINDEX:
1828 		ifr->ifr_index = ifp->if_index;
1829 		break;
1830 
1831 	case SIOCGIFFLAGS:
1832 		ifr->ifr_flags = ifp->if_flags;
1833 		ifr->ifr_flagshigh = ifp->if_flags >> 16;
1834 		break;
1835 
1836 	case SIOCGIFCAP:
1837 		ifr->ifr_reqcap = ifp->if_capabilities;
1838 		ifr->ifr_curcap = ifp->if_capenable;
1839 		break;
1840 
1841 	case SIOCGIFMETRIC:
1842 		ifr->ifr_metric = ifp->if_metric;
1843 		break;
1844 
1845 	case SIOCGIFMTU:
1846 		ifr->ifr_mtu = ifp->if_mtu;
1847 		break;
1848 
1849 	case SIOCGIFTSOLEN:
1850 		ifr->ifr_tsolen = ifp->if_tsolen;
1851 		break;
1852 
1853 	case SIOCGIFDATA:
1854 		error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1855 				sizeof(ifp->if_data));
1856 		break;
1857 
1858 	case SIOCGIFPHYS:
1859 		ifr->ifr_phys = ifp->if_physical;
1860 		break;
1861 
1862 	case SIOCGIFPOLLCPU:
1863 		ifr->ifr_pollcpu = -1;
1864 		break;
1865 
1866 	case SIOCSIFPOLLCPU:
1867 		break;
1868 
1869 	case SIOCSIFFLAGS:
1870 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1871 		if (error)
1872 			break;
1873 		new_flags = (ifr->ifr_flags & 0xffff) |
1874 		    (ifr->ifr_flagshigh << 16);
1875 		if (ifp->if_flags & IFF_SMART) {
1876 			/* Smart drivers twiddle their own routes */
1877 		} else if (ifp->if_flags & IFF_UP &&
1878 		    (new_flags & IFF_UP) == 0) {
1879 			if_down(ifp);
1880 		} else if (new_flags & IFF_UP &&
1881 		    (ifp->if_flags & IFF_UP) == 0) {
1882 			do_ifup = 1;
1883 		}
1884 
1885 #ifdef IFPOLL_ENABLE
1886 		if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1887 			if (new_flags & IFF_NPOLLING)
1888 				ifpoll_register(ifp);
1889 			else
1890 				ifpoll_deregister(ifp);
1891 		}
1892 #endif
1893 
1894 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1895 			(new_flags &~ IFF_CANTCHANGE);
1896 		if (new_flags & IFF_PPROMISC) {
1897 			/* Permanently promiscuous mode requested */
1898 			ifp->if_flags |= IFF_PROMISC;
1899 		} else if (ifp->if_pcount == 0) {
1900 			ifp->if_flags &= ~IFF_PROMISC;
1901 		}
1902 		if (ifp->if_ioctl) {
1903 			ifnet_serialize_all(ifp);
1904 			ifp->if_ioctl(ifp, cmd, data, cred);
1905 			ifnet_deserialize_all(ifp);
1906 		}
1907 		if (do_ifup)
1908 			if_up(ifp);
1909 		getmicrotime(&ifp->if_lastchange);
1910 		break;
1911 
1912 	case SIOCSIFCAP:
1913 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1914 		if (error)
1915 			break;
1916 		if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1917 			error = EINVAL;
1918 			break;
1919 		}
1920 		ifnet_serialize_all(ifp);
1921 		ifp->if_ioctl(ifp, cmd, data, cred);
1922 		ifnet_deserialize_all(ifp);
1923 		break;
1924 
1925 	case SIOCSIFNAME:
1926 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1927 		if (error)
1928 			break;
1929 		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1930 		if (error)
1931 			break;
1932 		if (new_name[0] == '\0') {
1933 			error = EINVAL;
1934 			break;
1935 		}
1936 		if (ifunit(new_name) != NULL) {
1937 			error = EEXIST;
1938 			break;
1939 		}
1940 
1941 		EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1942 
1943 		/* Announce the departure of the interface. */
1944 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1945 
1946 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1947 		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1948 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1949 		namelen = strlen(new_name);
1950 		onamelen = sdl->sdl_nlen;
1951 		/*
1952 		 * Move the address if needed.  This is safe because we
1953 		 * allocate space for a name of length IFNAMSIZ when we
1954 		 * create this in if_attach().
1955 		 */
1956 		if (namelen != onamelen) {
1957 			bcopy(sdl->sdl_data + onamelen,
1958 			    sdl->sdl_data + namelen, sdl->sdl_alen);
1959 		}
1960 		bcopy(new_name, sdl->sdl_data, namelen);
1961 		sdl->sdl_nlen = namelen;
1962 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1963 		bzero(sdl->sdl_data, onamelen);
1964 		while (namelen != 0)
1965 			sdl->sdl_data[--namelen] = 0xff;
1966 
1967 		EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1968 
1969 		/* Announce the return of the interface. */
1970 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1971 		break;
1972 
1973 	case SIOCSIFMETRIC:
1974 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1975 		if (error)
1976 			break;
1977 		ifp->if_metric = ifr->ifr_metric;
1978 		getmicrotime(&ifp->if_lastchange);
1979 		break;
1980 
1981 	case SIOCSIFPHYS:
1982 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1983 		if (error)
1984 			break;
1985 		if (ifp->if_ioctl == NULL) {
1986 		        error = EOPNOTSUPP;
1987 			break;
1988 		}
1989 		ifnet_serialize_all(ifp);
1990 		error = ifp->if_ioctl(ifp, cmd, data, cred);
1991 		ifnet_deserialize_all(ifp);
1992 		if (error == 0)
1993 			getmicrotime(&ifp->if_lastchange);
1994 		break;
1995 
1996 	case SIOCSIFMTU:
1997 	{
1998 		u_long oldmtu = ifp->if_mtu;
1999 
2000 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2001 		if (error)
2002 			break;
2003 		if (ifp->if_ioctl == NULL) {
2004 			error = EOPNOTSUPP;
2005 			break;
2006 		}
2007 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
2008 			error = EINVAL;
2009 			break;
2010 		}
2011 		ifnet_serialize_all(ifp);
2012 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2013 		ifnet_deserialize_all(ifp);
2014 		if (error == 0) {
2015 			getmicrotime(&ifp->if_lastchange);
2016 			rt_ifmsg(ifp);
2017 		}
2018 		/*
2019 		 * If the link MTU changed, do network layer specific procedure.
2020 		 */
2021 		if (ifp->if_mtu != oldmtu) {
2022 #ifdef INET6
2023 			nd6_setmtu(ifp);
2024 #endif
2025 		}
2026 		break;
2027 	}
2028 
2029 	case SIOCSIFTSOLEN:
2030 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2031 		if (error)
2032 			break;
2033 
2034 		/* XXX need driver supplied upper limit */
2035 		if (ifr->ifr_tsolen <= 0) {
2036 			error = EINVAL;
2037 			break;
2038 		}
2039 		ifp->if_tsolen = ifr->ifr_tsolen;
2040 		break;
2041 
2042 	case SIOCADDMULTI:
2043 	case SIOCDELMULTI:
2044 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2045 		if (error)
2046 			break;
2047 
2048 		/* Don't allow group membership on non-multicast interfaces. */
2049 		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
2050 			error = EOPNOTSUPP;
2051 			break;
2052 		}
2053 
2054 		/* Don't let users screw up protocols' entries. */
2055 		if (ifr->ifr_addr.sa_family != AF_LINK) {
2056 			error = EINVAL;
2057 			break;
2058 		}
2059 
2060 		if (cmd == SIOCADDMULTI) {
2061 			struct ifmultiaddr *ifma;
2062 			error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
2063 		} else {
2064 			error = if_delmulti(ifp, &ifr->ifr_addr);
2065 		}
2066 		if (error == 0)
2067 			getmicrotime(&ifp->if_lastchange);
2068 		break;
2069 
2070 	case SIOCSIFPHYADDR:
2071 	case SIOCDIFPHYADDR:
2072 #ifdef INET6
2073 	case SIOCSIFPHYADDR_IN6:
2074 #endif
2075 	case SIOCSLIFPHYADDR:
2076         case SIOCSIFMEDIA:
2077 	case SIOCSIFGENERIC:
2078 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2079 		if (error)
2080 			break;
2081 		if (ifp->if_ioctl == 0) {
2082 			error = EOPNOTSUPP;
2083 			break;
2084 		}
2085 		ifnet_serialize_all(ifp);
2086 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2087 		ifnet_deserialize_all(ifp);
2088 		if (error == 0)
2089 			getmicrotime(&ifp->if_lastchange);
2090 		break;
2091 
2092 	case SIOCGIFSTATUS:
2093 		ifs = (struct ifstat *)data;
2094 		ifs->ascii[0] = '\0';
2095 		/* fall through */
2096 	case SIOCGIFPSRCADDR:
2097 	case SIOCGIFPDSTADDR:
2098 	case SIOCGLIFPHYADDR:
2099 	case SIOCGIFMEDIA:
2100 	case SIOCGIFGENERIC:
2101 		if (ifp->if_ioctl == NULL) {
2102 			error = EOPNOTSUPP;
2103 			break;
2104 		}
2105 		ifnet_serialize_all(ifp);
2106 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2107 		ifnet_deserialize_all(ifp);
2108 		break;
2109 
2110 	case SIOCSIFLLADDR:
2111 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2112 		if (error)
2113 			break;
2114 		error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
2115 				     ifr->ifr_addr.sa_len);
2116 		EVENTHANDLER_INVOKE(iflladdr_event, ifp);
2117 		break;
2118 
2119 	default:
2120 		oif_flags = ifp->if_flags;
2121 		if (so->so_proto == 0) {
2122 			error = EOPNOTSUPP;
2123 			break;
2124 		}
2125 		error = so_pru_control_direct(so, cmd, data, ifp);
2126 
2127 		if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
2128 #ifdef INET6
2129 			DELAY(100);/* XXX: temporary workaround for fxp issue*/
2130 			if (ifp->if_flags & IFF_UP) {
2131 				crit_enter();
2132 				in6_if_up(ifp);
2133 				crit_exit();
2134 			}
2135 #endif
2136 		}
2137 		break;
2138 	}
2139 
2140 	ifnet_unlock();
2141 	return (error);
2142 }
2143 
2144 /*
2145  * Set/clear promiscuous mode on interface ifp based on the truth value
2146  * of pswitch.  The calls are reference counted so that only the first
2147  * "on" request actually has an effect, as does the final "off" request.
2148  * Results are undefined if the "off" and "on" requests are not matched.
2149  */
2150 int
2151 ifpromisc(struct ifnet *ifp, int pswitch)
2152 {
2153 	struct ifreq ifr;
2154 	int error;
2155 	int oldflags;
2156 
2157 	oldflags = ifp->if_flags;
2158 	if (ifp->if_flags & IFF_PPROMISC) {
2159 		/* Do nothing if device is in permanently promiscuous mode */
2160 		ifp->if_pcount += pswitch ? 1 : -1;
2161 		return (0);
2162 	}
2163 	if (pswitch) {
2164 		/*
2165 		 * If the device is not configured up, we cannot put it in
2166 		 * promiscuous mode.
2167 		 */
2168 		if ((ifp->if_flags & IFF_UP) == 0)
2169 			return (ENETDOWN);
2170 		if (ifp->if_pcount++ != 0)
2171 			return (0);
2172 		ifp->if_flags |= IFF_PROMISC;
2173 		log(LOG_INFO, "%s: promiscuous mode enabled\n",
2174 		    ifp->if_xname);
2175 	} else {
2176 		if (--ifp->if_pcount > 0)
2177 			return (0);
2178 		ifp->if_flags &= ~IFF_PROMISC;
2179 		log(LOG_INFO, "%s: promiscuous mode disabled\n",
2180 		    ifp->if_xname);
2181 	}
2182 	ifr.ifr_flags = ifp->if_flags;
2183 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
2184 	ifnet_serialize_all(ifp);
2185 	error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
2186 	ifnet_deserialize_all(ifp);
2187 	if (error == 0)
2188 		rt_ifmsg(ifp);
2189 	else
2190 		ifp->if_flags = oldflags;
2191 	return error;
2192 }
2193 
2194 /*
2195  * Return interface configuration
2196  * of system.  List may be used
2197  * in later ioctl's (above) to get
2198  * other information.
2199  */
2200 static int
2201 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
2202 {
2203 	struct ifconf *ifc = (struct ifconf *)data;
2204 	struct ifnet *ifp;
2205 	struct sockaddr *sa;
2206 	struct ifreq ifr, *ifrp;
2207 	int space = ifc->ifc_len, error = 0;
2208 
2209 	ifrp = ifc->ifc_req;
2210 
2211 	ifnet_lock();
2212 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
2213 		struct ifaddr_container *ifac, *ifac_mark;
2214 		struct ifaddr_marker mark;
2215 		struct ifaddrhead *head;
2216 		int addrs;
2217 
2218 		if (space <= sizeof ifr)
2219 			break;
2220 
2221 		/*
2222 		 * Zero the stack declared structure first to prevent
2223 		 * memory disclosure.
2224 		 */
2225 		bzero(&ifr, sizeof(ifr));
2226 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
2227 		    >= sizeof(ifr.ifr_name)) {
2228 			error = ENAMETOOLONG;
2229 			break;
2230 		}
2231 
2232 		/*
2233 		 * Add a marker, since copyout() could block and during that
2234 		 * period the list could be changed.  Inserting the marker to
2235 		 * the header of the list will not cause trouble for the code
2236 		 * assuming that the first element of the list is AF_LINK; the
2237 		 * marker will be moved to the next position w/o blocking.
2238 		 */
2239 		ifa_marker_init(&mark, ifp);
2240 		ifac_mark = &mark.ifac;
2241 		head = &ifp->if_addrheads[mycpuid];
2242 
2243 		addrs = 0;
2244 		TAILQ_INSERT_HEAD(head, ifac_mark, ifa_link);
2245 		while ((ifac = TAILQ_NEXT(ifac_mark, ifa_link)) != NULL) {
2246 			struct ifaddr *ifa = ifac->ifa;
2247 
2248 			TAILQ_REMOVE(head, ifac_mark, ifa_link);
2249 			TAILQ_INSERT_AFTER(head, ifac, ifac_mark, ifa_link);
2250 
2251 			/* Ignore marker */
2252 			if (ifa->ifa_addr->sa_family == AF_UNSPEC)
2253 				continue;
2254 
2255 			if (space <= sizeof ifr)
2256 				break;
2257 			sa = ifa->ifa_addr;
2258 			if (cred->cr_prison &&
2259 			    prison_if(cred, sa))
2260 				continue;
2261 			addrs++;
2262 			/*
2263 			 * Keep a reference on this ifaddr, so that it will
2264 			 * not be destroyed when its address is copied to
2265 			 * the userland, which could block.
2266 			 */
2267 			IFAREF(ifa);
2268 			if (sa->sa_len <= sizeof(*sa)) {
2269 				ifr.ifr_addr = *sa;
2270 				error = copyout(&ifr, ifrp, sizeof ifr);
2271 				ifrp++;
2272 			} else {
2273 				if (space < (sizeof ifr) + sa->sa_len -
2274 					    sizeof(*sa)) {
2275 					IFAFREE(ifa);
2276 					break;
2277 				}
2278 				space -= sa->sa_len - sizeof(*sa);
2279 				error = copyout(&ifr, ifrp,
2280 						sizeof ifr.ifr_name);
2281 				if (error == 0)
2282 					error = copyout(sa, &ifrp->ifr_addr,
2283 							sa->sa_len);
2284 				ifrp = (struct ifreq *)
2285 					(sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2286 			}
2287 			IFAFREE(ifa);
2288 			if (error)
2289 				break;
2290 			space -= sizeof ifr;
2291 		}
2292 		TAILQ_REMOVE(head, ifac_mark, ifa_link);
2293 		if (error)
2294 			break;
2295 		if (!addrs) {
2296 			bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2297 			error = copyout(&ifr, ifrp, sizeof ifr);
2298 			if (error)
2299 				break;
2300 			space -= sizeof ifr;
2301 			ifrp++;
2302 		}
2303 	}
2304 	ifnet_unlock();
2305 
2306 	ifc->ifc_len -= space;
2307 	return (error);
2308 }
2309 
2310 /*
2311  * Just like if_promisc(), but for all-multicast-reception mode.
2312  */
2313 int
2314 if_allmulti(struct ifnet *ifp, int onswitch)
2315 {
2316 	int error = 0;
2317 	struct ifreq ifr;
2318 
2319 	crit_enter();
2320 
2321 	if (onswitch) {
2322 		if (ifp->if_amcount++ == 0) {
2323 			ifp->if_flags |= IFF_ALLMULTI;
2324 			ifr.ifr_flags = ifp->if_flags;
2325 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2326 			ifnet_serialize_all(ifp);
2327 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2328 					      NULL);
2329 			ifnet_deserialize_all(ifp);
2330 		}
2331 	} else {
2332 		if (ifp->if_amcount > 1) {
2333 			ifp->if_amcount--;
2334 		} else {
2335 			ifp->if_amcount = 0;
2336 			ifp->if_flags &= ~IFF_ALLMULTI;
2337 			ifr.ifr_flags = ifp->if_flags;
2338 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2339 			ifnet_serialize_all(ifp);
2340 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2341 					      NULL);
2342 			ifnet_deserialize_all(ifp);
2343 		}
2344 	}
2345 
2346 	crit_exit();
2347 
2348 	if (error == 0)
2349 		rt_ifmsg(ifp);
2350 	return error;
2351 }
2352 
2353 /*
2354  * Add a multicast listenership to the interface in question.
2355  * The link layer provides a routine which converts
2356  */
2357 int
2358 if_addmulti_serialized(struct ifnet *ifp, struct sockaddr *sa,
2359     struct ifmultiaddr **retifma)
2360 {
2361 	struct sockaddr *llsa, *dupsa;
2362 	int error;
2363 	struct ifmultiaddr *ifma;
2364 
2365 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2366 
2367 	/*
2368 	 * If the matching multicast address already exists
2369 	 * then don't add a new one, just add a reference
2370 	 */
2371 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2372 		if (sa_equal(sa, ifma->ifma_addr)) {
2373 			ifma->ifma_refcount++;
2374 			if (retifma)
2375 				*retifma = ifma;
2376 			return 0;
2377 		}
2378 	}
2379 
2380 	/*
2381 	 * Give the link layer a chance to accept/reject it, and also
2382 	 * find out which AF_LINK address this maps to, if it isn't one
2383 	 * already.
2384 	 */
2385 	if (ifp->if_resolvemulti) {
2386 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
2387 		if (error)
2388 			return error;
2389 	} else {
2390 		llsa = NULL;
2391 	}
2392 
2393 	ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_INTWAIT);
2394 	dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_INTWAIT);
2395 	bcopy(sa, dupsa, sa->sa_len);
2396 
2397 	ifma->ifma_addr = dupsa;
2398 	ifma->ifma_lladdr = llsa;
2399 	ifma->ifma_ifp = ifp;
2400 	ifma->ifma_refcount = 1;
2401 	ifma->ifma_protospec = NULL;
2402 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2403 
2404 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2405 	if (retifma)
2406 		*retifma = ifma;
2407 
2408 	if (llsa != NULL) {
2409 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2410 			if (sa_equal(ifma->ifma_addr, llsa))
2411 				break;
2412 		}
2413 		if (ifma) {
2414 			ifma->ifma_refcount++;
2415 		} else {
2416 			ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_INTWAIT);
2417 			dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_INTWAIT);
2418 			bcopy(llsa, dupsa, llsa->sa_len);
2419 			ifma->ifma_addr = dupsa;
2420 			ifma->ifma_ifp = ifp;
2421 			ifma->ifma_refcount = 1;
2422 			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2423 		}
2424 	}
2425 	/*
2426 	 * We are certain we have added something, so call down to the
2427 	 * interface to let them know about it.
2428 	 */
2429 	if (ifp->if_ioctl)
2430 		ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2431 
2432 	return 0;
2433 }
2434 
/*
 * Serializing front end for if_addmulti_serialized(): takes all of the
 * interface's serializers around the call and passes the result through.
 */
int
if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
    struct ifmultiaddr **retifma)
{
	int rc;

	ifnet_serialize_all(ifp);
	rc = if_addmulti_serialized(ifp, sa, retifma);
	ifnet_deserialize_all(ifp);

	return rc;
}
2447 
2448 /*
2449  * Remove a reference to a multicast address on this interface.  Yell
2450  * if the request does not match an existing membership.
2451  */
2452 static int
2453 if_delmulti_serialized(struct ifnet *ifp, struct sockaddr *sa)
2454 {
2455 	struct ifmultiaddr *ifma;
2456 
2457 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2458 
2459 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2460 		if (sa_equal(sa, ifma->ifma_addr))
2461 			break;
2462 	if (ifma == NULL)
2463 		return ENOENT;
2464 
2465 	if (ifma->ifma_refcount > 1) {
2466 		ifma->ifma_refcount--;
2467 		return 0;
2468 	}
2469 
2470 	rt_newmaddrmsg(RTM_DELMADDR, ifma);
2471 	sa = ifma->ifma_lladdr;
2472 	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2473 	/*
2474 	 * Make sure the interface driver is notified
2475 	 * in the case of a link layer mcast group being left.
2476 	 */
2477 	if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL)
2478 		ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2479 	kfree(ifma->ifma_addr, M_IFMADDR);
2480 	kfree(ifma, M_IFMADDR);
2481 	if (sa == NULL)
2482 		return 0;
2483 
2484 	/*
2485 	 * Now look for the link-layer address which corresponds to
2486 	 * this network address.  It had been squirreled away in
2487 	 * ifma->ifma_lladdr for this purpose (so we don't have
2488 	 * to call ifp->if_resolvemulti() again), and we saved that
2489 	 * value in sa above.  If some nasty deleted the
2490 	 * link-layer address out from underneath us, we can deal because
2491 	 * the address we stored was is not the same as the one which was
2492 	 * in the record for the link-layer address.  (So we don't complain
2493 	 * in that case.)
2494 	 */
2495 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2496 		if (sa_equal(sa, ifma->ifma_addr))
2497 			break;
2498 	if (ifma == NULL)
2499 		return 0;
2500 
2501 	if (ifma->ifma_refcount > 1) {
2502 		ifma->ifma_refcount--;
2503 		return 0;
2504 	}
2505 
2506 	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2507 	ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2508 	kfree(ifma->ifma_addr, M_IFMADDR);
2509 	kfree(sa, M_IFMADDR);
2510 	kfree(ifma, M_IFMADDR);
2511 
2512 	return 0;
2513 }
2514 
/*
 * Serializing front end for if_delmulti_serialized(): takes all of the
 * interface's serializers around the call and passes the result through.
 */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
	int rc;

	ifnet_serialize_all(ifp);
	rc = if_delmulti_serialized(ifp, sa);
	ifnet_deserialize_all(ifp);

	return rc;
}
2526 
2527 /*
2528  * Delete all multicast group membership for an interface.
2529  * Should be used to quickly flush all multicast filters.
2530  */
2531 void
2532 if_delallmulti_serialized(struct ifnet *ifp)
2533 {
2534 	struct ifmultiaddr *ifma, mark;
2535 	struct sockaddr sa;
2536 
2537 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2538 
2539 	bzero(&sa, sizeof(sa));
2540 	sa.sa_family = AF_UNSPEC;
2541 	sa.sa_len = sizeof(sa);
2542 
2543 	bzero(&mark, sizeof(mark));
2544 	mark.ifma_addr = &sa;
2545 
2546 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, &mark, ifma_link);
2547 	while ((ifma = TAILQ_NEXT(&mark, ifma_link)) != NULL) {
2548 		TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2549 		TAILQ_INSERT_AFTER(&ifp->if_multiaddrs, ifma, &mark,
2550 		    ifma_link);
2551 
2552 		if (ifma->ifma_addr->sa_family == AF_UNSPEC)
2553 			continue;
2554 
2555 		if_delmulti_serialized(ifp, ifma->ifma_addr);
2556 	}
2557 	TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2558 }
2559 
2560 
2561 /*
2562  * Set the link layer address on an interface.
2563  *
2564  * At this time we only support certain types of interfaces,
2565  * and we don't allow the length of the address to change.
2566  */
2567 int
2568 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2569 {
2570 	struct sockaddr_dl *sdl;
2571 	struct ifreq ifr;
2572 
2573 	sdl = IF_LLSOCKADDR(ifp);
2574 	if (sdl == NULL)
2575 		return (EINVAL);
2576 	if (len != sdl->sdl_alen)	/* don't allow length to change */
2577 		return (EINVAL);
2578 	switch (ifp->if_type) {
2579 	case IFT_ETHER:			/* these types use struct arpcom */
2580 	case IFT_XETHER:
2581 	case IFT_L2VLAN:
2582 	case IFT_IEEE8023ADLAG:
2583 		bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2584 		bcopy(lladdr, LLADDR(sdl), len);
2585 		break;
2586 	default:
2587 		return (ENODEV);
2588 	}
2589 	/*
2590 	 * If the interface is already up, we need
2591 	 * to re-init it in order to reprogram its
2592 	 * address filter.
2593 	 */
2594 	ifnet_serialize_all(ifp);
2595 	if ((ifp->if_flags & IFF_UP) != 0) {
2596 #ifdef INET
2597 		struct ifaddr_container *ifac;
2598 #endif
2599 
2600 		ifp->if_flags &= ~IFF_UP;
2601 		ifr.ifr_flags = ifp->if_flags;
2602 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
2603 		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2604 			      NULL);
2605 		ifp->if_flags |= IFF_UP;
2606 		ifr.ifr_flags = ifp->if_flags;
2607 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
2608 		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2609 				 NULL);
2610 #ifdef INET
2611 		/*
2612 		 * Also send gratuitous ARPs to notify other nodes about
2613 		 * the address change.
2614 		 */
2615 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2616 			struct ifaddr *ifa = ifac->ifa;
2617 
2618 			if (ifa->ifa_addr != NULL &&
2619 			    ifa->ifa_addr->sa_family == AF_INET)
2620 				arp_gratuitous(ifp, ifa);
2621 		}
2622 #endif
2623 	}
2624 	ifnet_deserialize_all(ifp);
2625 	return (0);
2626 }
2627 
2628 struct ifmultiaddr *
2629 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2630 {
2631 	struct ifmultiaddr *ifma;
2632 
2633 	/* TODO: need ifnet_serialize_main */
2634 	ifnet_serialize_all(ifp);
2635 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2636 		if (sa_equal(ifma->ifma_addr, sa))
2637 			break;
2638 	ifnet_deserialize_all(ifp);
2639 
2640 	return ifma;
2641 }
2642 
2643 /*
2644  * This function locates the first real ethernet MAC from a network
2645  * card and loads it into node, returning 0 on success or ENOENT if
2646  * no suitable interfaces were found.  It is used by the uuid code to
2647  * generate a unique 6-byte number.
2648  */
2649 int
2650 if_getanyethermac(uint16_t *node, int minlen)
2651 {
2652 	struct ifnet *ifp;
2653 	struct sockaddr_dl *sdl;
2654 
2655 	ifnet_lock();
2656 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
2657 		if (ifp->if_type != IFT_ETHER)
2658 			continue;
2659 		sdl = IF_LLSOCKADDR(ifp);
2660 		if (sdl->sdl_alen < minlen)
2661 			continue;
2662 		bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2663 		      minlen);
2664 		ifnet_unlock();
2665 		return(0);
2666 	}
2667 	ifnet_unlock();
2668 	return (ENOENT);
2669 }
2670 
2671 /*
2672  * The name argument must be a pointer to storage which will last as
2673  * long as the interface does.  For physical devices, the result of
2674  * device_get_name(dev) is a good choice and for pseudo-devices a
2675  * static string works well.
2676  */
2677 void
2678 if_initname(struct ifnet *ifp, const char *name, int unit)
2679 {
2680 	ifp->if_dname = name;
2681 	ifp->if_dunit = unit;
2682 	if (unit != IF_DUNIT_NONE)
2683 		ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2684 	else
2685 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
2686 }
2687 
2688 int
2689 if_printf(struct ifnet *ifp, const char *fmt, ...)
2690 {
2691 	__va_list ap;
2692 	int retval;
2693 
2694 	retval = kprintf("%s: ", ifp->if_xname);
2695 	__va_start(ap, fmt);
2696 	retval += kvprintf(fmt, ap);
2697 	__va_end(ap);
2698 	return (retval);
2699 }
2700 
2701 struct ifnet *
2702 if_alloc(uint8_t type)
2703 {
2704         struct ifnet *ifp;
2705 	size_t size;
2706 
2707 	/*
2708 	 * XXX temporary hack until arpcom is setup in if_l2com
2709 	 */
2710 	if (type == IFT_ETHER)
2711 		size = sizeof(struct arpcom);
2712 	else
2713 		size = sizeof(struct ifnet);
2714 
2715 	ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2716 
2717 	ifp->if_type = type;
2718 
2719 	if (if_com_alloc[type] != NULL) {
2720 		ifp->if_l2com = if_com_alloc[type](type, ifp);
2721 		if (ifp->if_l2com == NULL) {
2722 			kfree(ifp, M_IFNET);
2723 			return (NULL);
2724 		}
2725 	}
2726 	return (ifp);
2727 }
2728 
2729 void
2730 if_free(struct ifnet *ifp)
2731 {
2732 	kfree(ifp, M_IFNET);
2733 }
2734 
2735 void
2736 ifq_set_classic(struct ifaltq *ifq)
2737 {
2738 	ifq_set_methods(ifq, ifq->altq_ifp->if_mapsubq,
2739 	    ifsq_classic_enqueue, ifsq_classic_dequeue, ifsq_classic_request);
2740 }
2741 
2742 void
2743 ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
2744     ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request)
2745 {
2746 	int q;
2747 
2748 	KASSERT(mapsubq != NULL, ("mapsubq is not specified"));
2749 	KASSERT(enqueue != NULL, ("enqueue is not specified"));
2750 	KASSERT(dequeue != NULL, ("dequeue is not specified"));
2751 	KASSERT(request != NULL, ("request is not specified"));
2752 
2753 	ifq->altq_mapsubq = mapsubq;
2754 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
2755 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
2756 
2757 		ifsq->ifsq_enqueue = enqueue;
2758 		ifsq->ifsq_dequeue = dequeue;
2759 		ifsq->ifsq_request = request;
2760 	}
2761 }
2762 
2763 static void
2764 ifsq_norm_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2765 {
2766 
2767 	classq_add(&ifsq->ifsq_norm, m);
2768 	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2769 }
2770 
2771 static void
2772 ifsq_prio_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2773 {
2774 
2775 	classq_add(&ifsq->ifsq_prio, m);
2776 	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2777 	ALTQ_SQ_PRIO_CNTR_INC(ifsq, m->m_pkthdr.len);
2778 }
2779 
2780 static struct mbuf *
2781 ifsq_norm_dequeue(struct ifaltq_subque *ifsq)
2782 {
2783 	struct mbuf *m;
2784 
2785 	m = classq_get(&ifsq->ifsq_norm);
2786 	if (m != NULL)
2787 		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2788 	return (m);
2789 }
2790 
2791 static struct mbuf *
2792 ifsq_prio_dequeue(struct ifaltq_subque *ifsq)
2793 {
2794 	struct mbuf *m;
2795 
2796 	m = classq_get(&ifsq->ifsq_prio);
2797 	if (m != NULL) {
2798 		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2799 		ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
2800 	}
2801 	return (m);
2802 }
2803 
2804 int
2805 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
2806     struct altq_pktattr *pa __unused)
2807 {
2808 
2809 	M_ASSERTPKTHDR(m);
2810 again:
2811 	if (ifsq->ifsq_len >= ifsq->ifsq_maxlen ||
2812 	    ifsq->ifsq_bcnt >= ifsq->ifsq_maxbcnt) {
2813 		struct mbuf *m_drop;
2814 
2815 		if (m->m_flags & M_PRIO) {
2816 			m_drop = NULL;
2817 			if (ifsq->ifsq_prio_len < (ifsq->ifsq_maxlen >> 1) &&
2818 			    ifsq->ifsq_prio_bcnt < (ifsq->ifsq_maxbcnt >> 1)) {
2819 				/* Try dropping some from normal queue. */
2820 				m_drop = ifsq_norm_dequeue(ifsq);
2821 			}
2822 			if (m_drop == NULL)
2823 				m_drop = ifsq_prio_dequeue(ifsq);
2824 		} else {
2825 			m_drop = ifsq_norm_dequeue(ifsq);
2826 		}
2827 		if (m_drop != NULL) {
2828 			IFNET_STAT_INC(ifsq->ifsq_ifp, oqdrops, 1);
2829 			m_freem(m_drop);
2830 			goto again;
2831 		}
2832 		/*
2833 		 * No old packets could be dropped!
2834 		 * NOTE: Caller increases oqdrops.
2835 		 */
2836 		m_freem(m);
2837 		return (ENOBUFS);
2838 	} else {
2839 		if (m->m_flags & M_PRIO)
2840 			ifsq_prio_enqueue(ifsq, m);
2841 		else
2842 			ifsq_norm_enqueue(ifsq, m);
2843 		return (0);
2844 	}
2845 }
2846 
2847 struct mbuf *
2848 ifsq_classic_dequeue(struct ifaltq_subque *ifsq, int op)
2849 {
2850 	struct mbuf *m;
2851 
2852 	switch (op) {
2853 	case ALTDQ_POLL:
2854 		m = classq_head(&ifsq->ifsq_prio);
2855 		if (m == NULL)
2856 			m = classq_head(&ifsq->ifsq_norm);
2857 		break;
2858 
2859 	case ALTDQ_REMOVE:
2860 		m = ifsq_prio_dequeue(ifsq);
2861 		if (m == NULL)
2862 			m = ifsq_norm_dequeue(ifsq);
2863 		break;
2864 
2865 	default:
2866 		panic("unsupported ALTQ dequeue op: %d", op);
2867 	}
2868 	return m;
2869 }
2870 
2871 int
2872 ifsq_classic_request(struct ifaltq_subque *ifsq, int req, void *arg)
2873 {
2874 	switch (req) {
2875 	case ALTRQ_PURGE:
2876 		for (;;) {
2877 			struct mbuf *m;
2878 
2879 			m = ifsq_classic_dequeue(ifsq, ALTDQ_REMOVE);
2880 			if (m == NULL)
2881 				break;
2882 			m_freem(m);
2883 		}
2884 		break;
2885 
2886 	default:
2887 		panic("unsupported ALTQ request: %d", req);
2888 	}
2889 	return 0;
2890 }
2891 
2892 static void
2893 ifsq_ifstart_try(struct ifaltq_subque *ifsq, int force_sched)
2894 {
2895 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
2896 	int running = 0, need_sched;
2897 
2898 	/*
2899 	 * Try to do direct ifnet.if_start on the subqueue first, if there is
2900 	 * contention on the subqueue hardware serializer, ifnet.if_start on
2901 	 * the subqueue will be scheduled on the subqueue owner CPU.
2902 	 */
2903 	if (!ifsq_tryserialize_hw(ifsq)) {
2904 		/*
2905 		 * Subqueue hardware serializer contention happened,
2906 		 * ifnet.if_start on the subqueue is scheduled on
2907 		 * the subqueue owner CPU, and we keep going.
2908 		 */
2909 		ifsq_ifstart_schedule(ifsq, 1);
2910 		return;
2911 	}
2912 
2913 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
2914 		ifp->if_start(ifp, ifsq);
2915 		if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
2916 			running = 1;
2917 	}
2918 	need_sched = ifsq_ifstart_need_schedule(ifsq, running);
2919 
2920 	ifsq_deserialize_hw(ifsq);
2921 
2922 	if (need_sched) {
2923 		/*
2924 		 * More data need to be transmitted, ifnet.if_start on the
2925 		 * subqueue is scheduled on the subqueue owner CPU, and we
2926 		 * keep going.
2927 		 * NOTE: ifnet.if_start subqueue interlock is not released.
2928 		 */
2929 		ifsq_ifstart_schedule(ifsq, force_sched);
2930 	}
2931 }
2932 
/*
 * Subqueue packets staging mechanism:
 *
 * The packets enqueued into the subqueue are staged up to a certain
 * amount before the ifnet.if_start on the subqueue is called.  In this
 * way, the driver can avoid writing to hardware registers for every
 * packet; instead, the hardware registers are written once a certain
 * amount of packets has been put onto the hardware TX ring.  Measurements
 * on several modern NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) show
 * that aggregating the hardware register writes can save ~20% CPU time
 * when 18-byte UDP datagrams are transmitted at 1.48Mpps.  The performance
 * improvement from aggregating hardware register writes is also mentioned
 * in Luigi Rizzo's netmap paper (http://info.iet.unipi.it/~luigi/netmap/).
 *
 * Subqueue packets staging is performed for two entry points into drivers'
 * transmission function:
 * - Direct ifnet.if_start calling on the subqueue, i.e. ifsq_ifstart_try()
 * - ifnet.if_start scheduling on the subqueue, i.e. ifsq_ifstart_schedule()
 *
 * Subqueue packets staging will be stopped upon any of the following
 * conditions:
 * - If the count of packets enqueued on the current CPU is greater than or
 *   equal to ifsq_stage_cntmax. (XXX this should be per-interface)
 * - If the total length of packets enqueued on the current CPU is greater
 *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
 *   cut from the hardware's MTU mainly because a full TCP segment's size
 *   is usually less than the hardware's MTU.
 * - ifsq_ifstart_schedule() is not pending on the current CPU and
 *   ifnet.if_start subqueue interlock (ifaltq_subq.ifsq_started) is not
 *   released.
 * - The if_start_rollup(), which is registered as low priority netisr
 *   rollup function, is called; probably because no more work is pending
 *   for netisr.
 *
 * NOTE:
 * Currently subqueue packet staging is only performed in netisr threads.
 */
2970 int
2971 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
2972 {
2973 	struct ifaltq *ifq = &ifp->if_snd;
2974 	struct ifaltq_subque *ifsq;
2975 	int error, start = 0, len, mcast = 0, avoid_start = 0;
2976 	struct ifsubq_stage_head *head = NULL;
2977 	struct ifsubq_stage *stage = NULL;
2978 	struct globaldata *gd = mycpu;
2979 	struct thread *td = gd->gd_curthread;
2980 
2981 	crit_enter_quick(td);
2982 
2983 	ifsq = ifq_map_subq(ifq, gd->gd_cpuid);
2984 	ASSERT_ALTQ_SQ_NOT_SERIALIZED_HW(ifsq);
2985 
2986 	len = m->m_pkthdr.len;
2987 	if (m->m_flags & M_MCAST)
2988 		mcast = 1;
2989 
2990 	if (td->td_type == TD_TYPE_NETISR) {
2991 		head = &ifsubq_stage_heads[mycpuid];
2992 		stage = ifsq_get_stage(ifsq, mycpuid);
2993 
2994 		stage->stg_cnt++;
2995 		stage->stg_len += len;
2996 		if (stage->stg_cnt < ifsq_stage_cntmax &&
2997 		    stage->stg_len < (ifp->if_mtu - max_protohdr))
2998 			avoid_start = 1;
2999 	}
3000 
3001 	ALTQ_SQ_LOCK(ifsq);
3002 	error = ifsq_enqueue_locked(ifsq, m, pa);
3003 	if (error) {
3004 		IFNET_STAT_INC(ifp, oqdrops, 1);
3005 		if (!ifsq_data_ready(ifsq)) {
3006 			ALTQ_SQ_UNLOCK(ifsq);
3007 			crit_exit_quick(td);
3008 			return error;
3009 		}
3010 		avoid_start = 0;
3011 	}
3012 	if (!ifsq_is_started(ifsq)) {
3013 		if (avoid_start) {
3014 			ALTQ_SQ_UNLOCK(ifsq);
3015 
3016 			KKASSERT(!error);
3017 			if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
3018 				ifsq_stage_insert(head, stage);
3019 
3020 			IFNET_STAT_INC(ifp, obytes, len);
3021 			if (mcast)
3022 				IFNET_STAT_INC(ifp, omcasts, 1);
3023 			crit_exit_quick(td);
3024 			return error;
3025 		}
3026 
3027 		/*
3028 		 * Hold the subqueue interlock of ifnet.if_start
3029 		 */
3030 		ifsq_set_started(ifsq);
3031 		start = 1;
3032 	}
3033 	ALTQ_SQ_UNLOCK(ifsq);
3034 
3035 	if (!error) {
3036 		IFNET_STAT_INC(ifp, obytes, len);
3037 		if (mcast)
3038 			IFNET_STAT_INC(ifp, omcasts, 1);
3039 	}
3040 
3041 	if (stage != NULL) {
3042 		if (!start && (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)) {
3043 			KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
3044 			if (!avoid_start) {
3045 				ifsq_stage_remove(head, stage);
3046 				ifsq_ifstart_schedule(ifsq, 1);
3047 			}
3048 			crit_exit_quick(td);
3049 			return error;
3050 		}
3051 
3052 		if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED) {
3053 			ifsq_stage_remove(head, stage);
3054 		} else {
3055 			stage->stg_cnt = 0;
3056 			stage->stg_len = 0;
3057 		}
3058 	}
3059 
3060 	if (!start) {
3061 		crit_exit_quick(td);
3062 		return error;
3063 	}
3064 
3065 	ifsq_ifstart_try(ifsq, 0);
3066 
3067 	crit_exit_quick(td);
3068 	return error;
3069 }
3070 
3071 void *
3072 ifa_create(int size)
3073 {
3074 	struct ifaddr *ifa;
3075 	int i;
3076 
3077 	KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
3078 
3079 	ifa = kmalloc(size, M_IFADDR, M_INTWAIT | M_ZERO);
3080 	ifa->ifa_containers =
3081 	    kmalloc_cachealign(ncpus * sizeof(struct ifaddr_container),
3082 	        M_IFADDR, M_INTWAIT | M_ZERO);
3083 
3084 	ifa->ifa_ncnt = ncpus;
3085 	for (i = 0; i < ncpus; ++i) {
3086 		struct ifaddr_container *ifac = &ifa->ifa_containers[i];
3087 
3088 		ifac->ifa_magic = IFA_CONTAINER_MAGIC;
3089 		ifac->ifa = ifa;
3090 		ifac->ifa_refcnt = 1;
3091 	}
3092 #ifdef IFADDR_DEBUG
3093 	kprintf("alloc ifa %p %d\n", ifa, size);
3094 #endif
3095 	return ifa;
3096 }
3097 
3098 void
3099 ifac_free(struct ifaddr_container *ifac, int cpu_id)
3100 {
3101 	struct ifaddr *ifa = ifac->ifa;
3102 
3103 	KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
3104 	KKASSERT(ifac->ifa_refcnt == 0);
3105 	KASSERT(ifac->ifa_listmask == 0,
3106 		("ifa is still on %#x lists", ifac->ifa_listmask));
3107 
3108 	ifac->ifa_magic = IFA_CONTAINER_DEAD;
3109 
3110 #ifdef IFADDR_DEBUG_VERBOSE
3111 	kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
3112 #endif
3113 
3114 	KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
3115 		("invalid # of ifac, %d", ifa->ifa_ncnt));
3116 	if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
3117 #ifdef IFADDR_DEBUG
3118 		kprintf("free ifa %p\n", ifa);
3119 #endif
3120 		kfree(ifa->ifa_containers, M_IFADDR);
3121 		kfree(ifa, M_IFADDR);
3122 	}
3123 }
3124 
3125 static void
3126 ifa_iflink_dispatch(netmsg_t nmsg)
3127 {
3128 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3129 	struct ifaddr *ifa = msg->ifa;
3130 	struct ifnet *ifp = msg->ifp;
3131 	int cpu = mycpuid;
3132 	struct ifaddr_container *ifac;
3133 
3134 	crit_enter();
3135 
3136 	ifac = &ifa->ifa_containers[cpu];
3137 	ASSERT_IFAC_VALID(ifac);
3138 	KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
3139 		("ifaddr is on if_addrheads"));
3140 
3141 	ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
3142 	if (msg->tail)
3143 		TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
3144 	else
3145 		TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
3146 
3147 	crit_exit();
3148 
3149 	netisr_forwardmsg(&nmsg->base, cpu + 1);
3150 }
3151 
3152 void
3153 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
3154 {
3155 	struct netmsg_ifaddr msg;
3156 
3157 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3158 		    0, ifa_iflink_dispatch);
3159 	msg.ifa = ifa;
3160 	msg.ifp = ifp;
3161 	msg.tail = tail;
3162 
3163 	netisr_domsg(&msg.base, 0);
3164 }
3165 
3166 static void
3167 ifa_ifunlink_dispatch(netmsg_t nmsg)
3168 {
3169 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3170 	struct ifaddr *ifa = msg->ifa;
3171 	struct ifnet *ifp = msg->ifp;
3172 	int cpu = mycpuid;
3173 	struct ifaddr_container *ifac;
3174 
3175 	crit_enter();
3176 
3177 	ifac = &ifa->ifa_containers[cpu];
3178 	ASSERT_IFAC_VALID(ifac);
3179 	KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
3180 		("ifaddr is not on if_addrhead"));
3181 
3182 	TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
3183 	ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
3184 
3185 	crit_exit();
3186 
3187 	netisr_forwardmsg(&nmsg->base, cpu + 1);
3188 }
3189 
3190 void
3191 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
3192 {
3193 	struct netmsg_ifaddr msg;
3194 
3195 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3196 		    0, ifa_ifunlink_dispatch);
3197 	msg.ifa = ifa;
3198 	msg.ifp = ifp;
3199 
3200 	netisr_domsg(&msg.base, 0);
3201 }
3202 
3203 static void
3204 ifa_destroy_dispatch(netmsg_t nmsg)
3205 {
3206 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3207 
3208 	IFAFREE(msg->ifa);
3209 	netisr_forwardmsg(&nmsg->base, mycpuid + 1);
3210 }
3211 
3212 void
3213 ifa_destroy(struct ifaddr *ifa)
3214 {
3215 	struct netmsg_ifaddr msg;
3216 
3217 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3218 		    0, ifa_destroy_dispatch);
3219 	msg.ifa = ifa;
3220 
3221 	netisr_domsg(&msg.base, 0);
3222 }
3223 
3224 static void
3225 if_start_rollup(void)
3226 {
3227 	struct ifsubq_stage_head *head = &ifsubq_stage_heads[mycpuid];
3228 	struct ifsubq_stage *stage;
3229 
3230 	crit_enter();
3231 
3232 	while ((stage = TAILQ_FIRST(&head->stg_head)) != NULL) {
3233 		struct ifaltq_subque *ifsq = stage->stg_subq;
3234 		int is_sched = 0;
3235 
3236 		if (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)
3237 			is_sched = 1;
3238 		ifsq_stage_remove(head, stage);
3239 
3240 		if (is_sched) {
3241 			ifsq_ifstart_schedule(ifsq, 1);
3242 		} else {
3243 			int start = 0;
3244 
3245 			ALTQ_SQ_LOCK(ifsq);
3246 			if (!ifsq_is_started(ifsq)) {
3247 				/*
3248 				 * Hold the subqueue interlock of
3249 				 * ifnet.if_start
3250 				 */
3251 				ifsq_set_started(ifsq);
3252 				start = 1;
3253 			}
3254 			ALTQ_SQ_UNLOCK(ifsq);
3255 
3256 			if (start)
3257 				ifsq_ifstart_try(ifsq, 1);
3258 		}
3259 		KKASSERT((stage->stg_flags &
3260 		    (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
3261 	}
3262 
3263 	crit_exit();
3264 }
3265 
3266 static void
3267 ifnetinit(void *dummy __unused)
3268 {
3269 	int i;
3270 
3271 	for (i = 0; i < ncpus; ++i)
3272 		TAILQ_INIT(&ifsubq_stage_heads[i].stg_head);
3273 	netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
3274 }
3275 
3276 void
3277 if_register_com_alloc(u_char type,
3278     if_com_alloc_t *a, if_com_free_t *f)
3279 {
3280 
3281         KASSERT(if_com_alloc[type] == NULL,
3282             ("if_register_com_alloc: %d already registered", type));
3283         KASSERT(if_com_free[type] == NULL,
3284             ("if_register_com_alloc: %d free already registered", type));
3285 
3286         if_com_alloc[type] = a;
3287         if_com_free[type] = f;
3288 }
3289 
3290 void
3291 if_deregister_com_alloc(u_char type)
3292 {
3293 
3294         KASSERT(if_com_alloc[type] != NULL,
3295             ("if_deregister_com_alloc: %d not registered", type));
3296         KASSERT(if_com_free[type] != NULL,
3297             ("if_deregister_com_alloc: %d free not registered", type));
3298         if_com_alloc[type] = NULL;
3299         if_com_free[type] = NULL;
3300 }
3301 
3302 int
3303 if_ring_count2(int cnt, int cnt_max)
3304 {
3305 	int shift = 0;
3306 
3307 	KASSERT(cnt_max >= 1 && powerof2(cnt_max),
3308 	    ("invalid ring count max %d", cnt_max));
3309 
3310 	if (cnt <= 0)
3311 		cnt = cnt_max;
3312 	if (cnt > ncpus2)
3313 		cnt = ncpus2;
3314 	if (cnt > cnt_max)
3315 		cnt = cnt_max;
3316 
3317 	while ((1 << (shift + 1)) <= cnt)
3318 		++shift;
3319 	cnt = 1 << shift;
3320 
3321 	KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
3322 	    ("calculate cnt %d, ncpus2 %d, cnt max %d",
3323 	     cnt, ncpus2, cnt_max));
3324 	return cnt;
3325 }
3326 
3327 void
3328 ifq_set_maxlen(struct ifaltq *ifq, int len)
3329 {
3330 	ifq->altq_maxlen = len + (ncpus * ifsq_stage_cntmax);
3331 }
3332 
3333 int
3334 ifq_mapsubq_default(struct ifaltq *ifq __unused, int cpuid __unused)
3335 {
3336 	return ALTQ_SUBQ_INDEX_DEFAULT;
3337 }
3338 
3339 int
3340 ifq_mapsubq_mask(struct ifaltq *ifq, int cpuid)
3341 {
3342 	return (cpuid & ifq->altq_subq_mask);
3343 }
3344 
3345 static void
3346 ifsq_watchdog(void *arg)
3347 {
3348 	struct ifsubq_watchdog *wd = arg;
3349 	struct ifnet *ifp;
3350 
3351 	if (__predict_true(wd->wd_timer == 0 || --wd->wd_timer))
3352 		goto done;
3353 
3354 	ifp = ifsq_get_ifp(wd->wd_subq);
3355 	if (ifnet_tryserialize_all(ifp)) {
3356 		wd->wd_watchdog(wd->wd_subq);
3357 		ifnet_deserialize_all(ifp);
3358 	} else {
3359 		/* try again next timeout */
3360 		wd->wd_timer = 1;
3361 	}
3362 done:
3363 	ifsq_watchdog_reset(wd);
3364 }
3365 
3366 static void
3367 ifsq_watchdog_reset(struct ifsubq_watchdog *wd)
3368 {
3369 	callout_reset_bycpu(&wd->wd_callout, hz, ifsq_watchdog, wd,
3370 	    ifsq_get_cpuid(wd->wd_subq));
3371 }
3372 
3373 void
3374 ifsq_watchdog_init(struct ifsubq_watchdog *wd, struct ifaltq_subque *ifsq,
3375     ifsq_watchdog_t watchdog)
3376 {
3377 	callout_init_mp(&wd->wd_callout);
3378 	wd->wd_timer = 0;
3379 	wd->wd_subq = ifsq;
3380 	wd->wd_watchdog = watchdog;
3381 }
3382 
3383 void
3384 ifsq_watchdog_start(struct ifsubq_watchdog *wd)
3385 {
3386 	wd->wd_timer = 0;
3387 	ifsq_watchdog_reset(wd);
3388 }
3389 
3390 void
3391 ifsq_watchdog_stop(struct ifsubq_watchdog *wd)
3392 {
3393 	wd->wd_timer = 0;
3394 	callout_stop(&wd->wd_callout);
3395 }
3396 
3397 void
3398 ifnet_lock(void)
3399 {
3400 	KASSERT(curthread->td_type != TD_TYPE_NETISR,
3401 	    ("try holding ifnet lock in netisr"));
3402 	mtx_lock(&ifnet_mtx);
3403 }
3404 
3405 void
3406 ifnet_unlock(void)
3407 {
3408 	KASSERT(curthread->td_type != TD_TYPE_NETISR,
3409 	    ("try holding ifnet lock in netisr"));
3410 	mtx_unlock(&ifnet_mtx);
3411 }
3412 
3413 static struct ifnet_array *
3414 ifnet_array_alloc(int count)
3415 {
3416 	struct ifnet_array *arr;
3417 
3418 	arr = kmalloc(__offsetof(struct ifnet_array, ifnet_arr[count]),
3419 	    M_IFNET, M_WAITOK);
3420 	arr->ifnet_count = count;
3421 
3422 	return arr;
3423 }
3424 
3425 static void
3426 ifnet_array_free(struct ifnet_array *arr)
3427 {
3428 	if (arr == &ifnet_array0)
3429 		return;
3430 	kfree(arr, M_IFNET);
3431 }
3432 
3433 static struct ifnet_array *
3434 ifnet_array_add(struct ifnet *ifp, const struct ifnet_array *old_arr)
3435 {
3436 	struct ifnet_array *arr;
3437 	int count, i;
3438 
3439 	KASSERT(old_arr->ifnet_count >= 0,
3440 	    ("invalid ifnet array count %d", old_arr->ifnet_count));
3441 	count = old_arr->ifnet_count + 1;
3442 	arr = ifnet_array_alloc(count);
3443 
3444 	/*
3445 	 * Save the old ifnet array and append this ifp to the end of
3446 	 * the new ifnet array.
3447 	 */
3448 	for (i = 0; i < old_arr->ifnet_count; ++i) {
3449 		KASSERT(old_arr->ifnet_arr[i] != ifp,
3450 		    ("%s is already in ifnet array", ifp->if_xname));
3451 		arr->ifnet_arr[i] = old_arr->ifnet_arr[i];
3452 	}
3453 	KASSERT(i == count - 1,
3454 	    ("add %s, ifnet array index mismatch, should be %d, but got %d",
3455 	     ifp->if_xname, count - 1, i));
3456 	arr->ifnet_arr[i] = ifp;
3457 
3458 	return arr;
3459 }
3460 
3461 static struct ifnet_array *
3462 ifnet_array_del(struct ifnet *ifp, const struct ifnet_array *old_arr)
3463 {
3464 	struct ifnet_array *arr;
3465 	int count, i, idx, found = 0;
3466 
3467 	KASSERT(old_arr->ifnet_count > 0,
3468 	    ("invalid ifnet array count %d", old_arr->ifnet_count));
3469 	count = old_arr->ifnet_count - 1;
3470 	arr = ifnet_array_alloc(count);
3471 
3472 	/*
3473 	 * Save the old ifnet array, but skip this ifp.
3474 	 */
3475 	idx = 0;
3476 	for (i = 0; i < old_arr->ifnet_count; ++i) {
3477 		if (old_arr->ifnet_arr[i] == ifp) {
3478 			KASSERT(!found,
3479 			    ("dup %s is in ifnet array", ifp->if_xname));
3480 			found = 1;
3481 			continue;
3482 		}
3483 		KASSERT(idx < count,
3484 		    ("invalid ifnet array index %d, count %d", idx, count));
3485 		arr->ifnet_arr[idx] = old_arr->ifnet_arr[i];
3486 		++idx;
3487 	}
3488 	KASSERT(found, ("%s is not in ifnet array", ifp->if_xname));
3489 	KASSERT(idx == count,
3490 	    ("del %s, ifnet array count mismatch, should be %d, but got %d ",
3491 	     ifp->if_xname, count, idx));
3492 
3493 	return arr;
3494 }
3495 
3496 const struct ifnet_array *
3497 ifnet_array_get(void)
3498 {
3499 	const struct ifnet_array *ret;
3500 
3501 	KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
3502 	ret = ifnet_array;
3503 	/* Make sure 'ret' is really used. */
3504 	cpu_ccfence();
3505 	return (ret);
3506 }
3507 
3508 int
3509 ifnet_array_isempty(void)
3510 {
3511 	KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
3512 	if (ifnet_array->ifnet_count == 0)
3513 		return 1;
3514 	else
3515 		return 0;
3516 }
3517 
3518 void
3519 ifa_marker_init(struct ifaddr_marker *mark, struct ifnet *ifp)
3520 {
3521 	struct ifaddr *ifa;
3522 
3523 	memset(mark, 0, sizeof(*mark));
3524 	ifa = &mark->ifa;
3525 
3526 	mark->ifac.ifa = ifa;
3527 
3528 	ifa->ifa_addr = &mark->addr;
3529 	ifa->ifa_dstaddr = &mark->dstaddr;
3530 	ifa->ifa_netmask = &mark->netmask;
3531 	ifa->ifa_ifp = ifp;
3532 }
3533