xref: /dflybsd-src/sys/net/if.c (revision 5aa42fef418118d7414e8b76fbb5ae50738ffea0)
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. Neither the name of the University nor the names of its contributors
14  *    may be used to endorse or promote products derived from this software
15  *    without specific prior written permission.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  *
29  *	@(#)if.c	8.3 (Berkeley) 1/4/94
30  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
31  */
32 
33 #include "opt_compat.h"
34 #include "opt_inet6.h"
35 #include "opt_inet.h"
36 #include "opt_ifpoll.h"
37 
38 #include <sys/param.h>
39 #include <sys/malloc.h>
40 #include <sys/mbuf.h>
41 #include <sys/systm.h>
42 #include <sys/proc.h>
43 #include <sys/priv.h>
44 #include <sys/protosw.h>
45 #include <sys/socket.h>
46 #include <sys/socketvar.h>
47 #include <sys/socketops.h>
48 #include <sys/kernel.h>
49 #include <sys/ktr.h>
50 #include <sys/mutex.h>
51 #include <sys/sockio.h>
52 #include <sys/syslog.h>
53 #include <sys/sysctl.h>
54 #include <sys/domain.h>
55 #include <sys/thread.h>
56 #include <sys/serialize.h>
57 #include <sys/bus.h>
58 
59 #include <sys/thread2.h>
60 #include <sys/msgport2.h>
61 #include <sys/mutex2.h>
62 
63 #include <net/if.h>
64 #include <net/if_arp.h>
65 #include <net/if_dl.h>
66 #include <net/if_types.h>
67 #include <net/if_var.h>
68 #include <net/ifq_var.h>
69 #include <net/radix.h>
70 #include <net/route.h>
71 #include <net/if_clone.h>
72 #include <net/netisr2.h>
73 #include <net/netmsg2.h>
74 
75 #include <machine/atomic.h>
76 #include <machine/stdarg.h>
77 #include <machine/smp.h>
78 
79 #if defined(INET) || defined(INET6)
80 /*XXX*/
81 #include <netinet/in.h>
82 #include <netinet/in_var.h>
83 #include <netinet/if_ether.h>
84 #ifdef INET6
85 #include <netinet6/in6_var.h>
86 #include <netinet6/in6_ifattach.h>
87 #endif
88 #endif
89 
90 #if defined(COMPAT_43)
91 #include <emulation/43bsd/43bsd_socket.h>
92 #endif /* COMPAT_43 */
93 
94 struct netmsg_ifaddr {
95 	struct netmsg_base base;
96 	struct ifaddr	*ifa;
97 	struct ifnet	*ifp;
98 	int		tail;
99 };
100 
101 struct ifsubq_stage_head {
102 	TAILQ_HEAD(, ifsubq_stage)	stg_head;
103 } __cachealign;
104 
105 /*
106  * System initialization
107  */
108 static void	if_attachdomain(void *);
109 static void	if_attachdomain1(struct ifnet *);
110 static int	ifconf(u_long, caddr_t, struct ucred *);
111 static void	ifinit(void *);
112 static void	ifnetinit(void *);
113 static void	if_slowtimo(void *);
114 static void	link_rtrequest(int, struct rtentry *);
115 static int	if_rtdel(struct radix_node *, void *);
116 static void	if_slowtimo_dispatch(netmsg_t);
117 
118 /* Helper functions */
119 static void	ifsq_watchdog_reset(struct ifsubq_watchdog *);
120 static int	if_delmulti_serialized(struct ifnet *, struct sockaddr *);
121 static struct ifnet_array *ifnet_array_alloc(int);
122 static void	ifnet_array_free(struct ifnet_array *);
123 static struct ifnet_array *ifnet_array_add(struct ifnet *,
124 		    const struct ifnet_array *);
125 static struct ifnet_array *ifnet_array_del(struct ifnet *,
126 		    const struct ifnet_array *);
127 
128 #ifdef INET6
129 /*
130  * XXX: declared here to avoid including many inet6-related files;
131  * this should probably be generalized.
132  */
133 extern void	nd6_setmtu(struct ifnet *);
134 #endif
135 
136 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
137 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
138 
139 static int ifsq_stage_cntmax = 4;
140 TUNABLE_INT("net.link.stage_cntmax", &ifsq_stage_cntmax);
141 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
142     &ifsq_stage_cntmax, 0, "ifq staging packet count max");
143 
144 static int if_stats_compat = 0;
145 SYSCTL_INT(_net_link, OID_AUTO, stats_compat, CTLFLAG_RW,
146     &if_stats_compat, 0, "Maintain old-style ifnet stats for compatibility");
147 
148 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
149 /* Must be after netisr_init */
150 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
151 
152 static  if_com_alloc_t *if_com_alloc[256];
153 static  if_com_free_t *if_com_free[256];
154 
155 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
156 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
157 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
158 
159 int			ifqmaxlen = IFQ_MAXLEN;
160 struct ifnethead	ifnetlist = TAILQ_HEAD_INITIALIZER(ifnetlist);
161 
162 static struct ifnet_array	ifnet_array0;
163 static struct ifnet_array	*ifnet_array = &ifnet_array0;
164 
165 static struct callout		if_slowtimo_timer;
166 static struct netmsg_base	if_slowtimo_netmsg;
167 
168 int			if_index = 0;
169 struct ifnet		**ifindex2ifnet = NULL;
170 static struct thread	ifnet_threads[MAXCPU];
171 static struct mtx	ifnet_mtx = MTX_INITIALIZER;
172 
173 static struct ifsubq_stage_head	ifsubq_stage_heads[MAXCPU];
174 
175 #ifdef notyet
176 #define IFQ_KTR_STRING		"ifq=%p"
177 #define IFQ_KTR_ARGS	struct ifaltq *ifq
178 #ifndef KTR_IFQ
179 #define KTR_IFQ			KTR_ALL
180 #endif
181 KTR_INFO_MASTER(ifq);
182 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
183 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
184 #define logifq(name, arg)	KTR_LOG(ifq_ ## name, arg)
185 
186 #define IF_START_KTR_STRING	"ifp=%p"
187 #define IF_START_KTR_ARGS	struct ifnet *ifp
188 #ifndef KTR_IF_START
189 #define KTR_IF_START		KTR_ALL
190 #endif
191 KTR_INFO_MASTER(if_start);
192 KTR_INFO(KTR_IF_START, if_start, run, 0,
193 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
194 KTR_INFO(KTR_IF_START, if_start, sched, 1,
195 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
196 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
197 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
198 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
199 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
200 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
201 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
202 #define logifstart(name, arg)	KTR_LOG(if_start_ ## name, arg)
203 #endif
204 
205 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
206 
207 /*
208  * Network interface utility routines.
209  *
210  * Routines with ifa_ifwith* names take sockaddr *'s as
211  * parameters.
212  */
213 /* ARGSUSED*/
214 static void
215 ifinit(void *dummy)
216 {
217 	struct ifnet *ifp;
218 
219 	callout_init_mp(&if_slowtimo_timer);
220 	netmsg_init(&if_slowtimo_netmsg, NULL, &netisr_adone_rport,
221 	    MSGF_PRIORITY, if_slowtimo_dispatch);
222 
223 	/* XXX is this necessary? */
224 	ifnet_lock();
225 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
226 		if (ifp->if_snd.altq_maxlen == 0) {
227 			if_printf(ifp, "XXX: driver didn't set altq_maxlen\n");
228 			ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
229 		}
230 	}
231 	ifnet_unlock();
232 
233 	/* Start if_slowtimo */
234 	lwkt_sendmsg(netisr_cpuport(0), &if_slowtimo_netmsg.lmsg);
235 }
236 
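/*
 * Resend this cpu's if_start message to the local netisr, but only if
 * the previous one has been replied to (MSGF_DONE), so that at most
 * one such message per cpu is ever in flight.  This runs on the
 * subqueue owner cpu, either directly or via an IPI sent from
 * ifsq_ifstart_schedule().
 */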
237 static void
238 ifsq_ifstart_ipifunc(void *arg)
239 {
240 	struct ifaltq_subque *ifsq = arg;
241 	struct lwkt_msg *lmsg = ifsq_get_ifstart_lmsg(ifsq, mycpuid);
242 
243 	crit_enter();
244 	if (lmsg->ms_flags & MSGF_DONE)
245 		lwkt_sendmsg_oncpu(netisr_cpuport(mycpuid), lmsg);
246 	crit_exit();
247 }
248 
249 static __inline void
250 ifsq_stage_remove(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
251 {
252 	KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
253 	TAILQ_REMOVE(&head->stg_head, stage, stg_link);
254 	stage->stg_flags &= ~(IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED);
255 	stage->stg_cnt = 0;
256 	stage->stg_len = 0;
257 }
258 
259 static __inline void
260 ifsq_stage_insert(struct ifsubq_stage_head *head, struct ifsubq_stage *stage)
261 {
262 	KKASSERT((stage->stg_flags &
263 	    (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
264 	stage->stg_flags |= IFSQ_STAGE_FLAG_QUED;
265 	TAILQ_INSERT_TAIL(&head->stg_head, stage, stg_link);
266 }
267 
268 /*
269  * Schedule ifnet.if_start on the subqueue owner CPU
270  */
271 static void
272 ifsq_ifstart_schedule(struct ifaltq_subque *ifsq, int force)
273 {
274 	int cpu;
275 
276 	if (!force && curthread->td_type == TD_TYPE_NETISR &&
277 	    ifsq_stage_cntmax > 0) {
278 		struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
279 
280 		stage->stg_cnt = 0;
281 		stage->stg_len = 0;
282 		if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
283 			ifsq_stage_insert(&ifsubq_stage_heads[mycpuid], stage);
284 		stage->stg_flags |= IFSQ_STAGE_FLAG_SCHED;
285 		return;
286 	}
287 
288 	cpu = ifsq_get_cpuid(ifsq);
289 	if (cpu != mycpuid)
290 		lwkt_send_ipiq(globaldata_find(cpu), ifsq_ifstart_ipifunc, ifsq);
291 	else
292 		ifsq_ifstart_ipifunc(ifsq);
293 }
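
/*
 * Staging note: when ifsq_ifstart_schedule() is called from a netisr
 * thread with staging enabled (net.link.stage_cntmax > 0), the request
 * is only recorded on the per-cpu staging queue with
 * IFSQ_STAGE_FLAG_SCHED set; the actual message/IPI dispatch is
 * deferred to a per-cpu rollup that runs once the netisr has drained
 * its current work (the rollup itself is not shown in this portion of
 * the file).  This batches the scheduling and avoids one message or
 * IPI per enqueued packet.
 */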
294 
295 /*
296  * NOTE:
297  * This function releases the ifnet.if_start subqueue interlock
298  * if ifnet.if_start for the subqueue does not need to be scheduled.
299  */
300 static __inline int
301 ifsq_ifstart_need_schedule(struct ifaltq_subque *ifsq, int running)
302 {
303 	if (!running || ifsq_is_empty(ifsq)
304 #ifdef ALTQ
305 	    || ifsq->ifsq_altq->altq_tbr != NULL
306 #endif
307 	) {
308 		ALTQ_SQ_LOCK(ifsq);
309 		/*
310 		 * The ifnet.if_start subqueue interlock is released if:
311 		 * 1) Hardware cannot accept any packets, because
312 		 *    o  the interface is marked down
313 		 *    o  the hardware queue is full (ifsq_is_oactive)
314 		 *    In the second case, a hardware interrupt or
315 		 *    polling(4) will call/schedule ifnet.if_start on
316 		 *    the subqueue once the hardware queue is ready.
317 		 * 2) There is no packet in the subqueue.
318 		 *    Further ifq_dispatch or ifq_handoff will call/
319 		 *    schedule ifnet.if_start on the subqueue.
320 		 * 3) TBR is used and it does not allow further
321 		 *    dequeueing.
322 		 *    TBR callout will call ifnet.if_start on the
323 		 *    subqueue.
324 		 */
325 		if (!running || !ifsq_data_ready(ifsq)) {
326 			ifsq_clr_started(ifsq);
327 			ALTQ_SQ_UNLOCK(ifsq);
328 			return 0;
329 		}
330 		ALTQ_SQ_UNLOCK(ifsq);
331 	}
332 	return 1;
333 }
334 
335 static void
336 ifsq_ifstart_dispatch(netmsg_t msg)
337 {
338 	struct lwkt_msg *lmsg = &msg->base.lmsg;
339 	struct ifaltq_subque *ifsq = lmsg->u.ms_resultp;
340 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
341 	struct globaldata *gd = mycpu;
342 	int running = 0, need_sched;
343 
344 	crit_enter_gd(gd);
345 
346 	lwkt_replymsg(lmsg, 0);	/* reply ASAP */
347 
348 	if (gd->gd_cpuid != ifsq_get_cpuid(ifsq)) {
349 		/*
350 		 * We need to chase the subqueue owner CPU change.
351 		 */
352 		ifsq_ifstart_schedule(ifsq, 1);
353 		crit_exit_gd(gd);
354 		return;
355 	}
356 
357 	ifsq_serialize_hw(ifsq);
358 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
359 		ifp->if_start(ifp, ifsq);
360 		if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
361 			running = 1;
362 	}
363 	need_sched = ifsq_ifstart_need_schedule(ifsq, running);
364 	ifsq_deserialize_hw(ifsq);
365 
366 	if (need_sched) {
367 		/*
368 		 * More data needs to be transmitted; ifnet.if_start is
369 		 * scheduled on the subqueue owner CPU, and we keep going.
370 		 * NOTE: ifnet.if_start subqueue interlock is not released.
371 		 */
372 		ifsq_ifstart_schedule(ifsq, 0);
373 	}
374 
375 	crit_exit_gd(gd);
376 }
377 
378 /* Device driver ifnet.if_start helper function */
379 void
380 ifsq_devstart(struct ifaltq_subque *ifsq)
381 {
382 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
383 	int running = 0;
384 
385 	ASSERT_ALTQ_SQ_SERIALIZED_HW(ifsq);
386 
387 	ALTQ_SQ_LOCK(ifsq);
388 	if (ifsq_is_started(ifsq) || !ifsq_data_ready(ifsq)) {
389 		ALTQ_SQ_UNLOCK(ifsq);
390 		return;
391 	}
392 	ifsq_set_started(ifsq);
393 	ALTQ_SQ_UNLOCK(ifsq);
394 
395 	ifp->if_start(ifp, ifsq);
396 
397 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
398 		running = 1;
399 
400 	if (ifsq_ifstart_need_schedule(ifsq, running)) {
401 		/*
402 		 * More data needs to be transmitted; ifnet.if_start is
403 		 * scheduled on ifnet's CPU, and we keep going.
404 		 * NOTE: ifnet.if_start interlock is not released.
405 		 */
406 		ifsq_ifstart_schedule(ifsq, 0);
407 	}
408 }
409 
410 void
411 if_devstart(struct ifnet *ifp)
412 {
413 	ifsq_devstart(ifq_get_subq_default(&ifp->if_snd));
414 }
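
/*
 * Illustrative use only (hypothetical driver code, not from this
 * file): a driver's TX-completion path, running with the hardware
 * queue serialized, typically reclaims descriptors, clears the
 * oactive state and restarts transmission:
 *
 *	ifsq_clr_oactive(ifsq);
 *	if (!ifsq_is_empty(ifsq))
 *		ifsq_devstart(ifsq);
 *
 * or simply if_devstart(ifp) on single-queue devices.
 */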
415 
416 /* Device driver ifnet.if_start schedule helper function */
417 void
418 ifsq_devstart_sched(struct ifaltq_subque *ifsq)
419 {
420 	ifsq_ifstart_schedule(ifsq, 1);
421 }
422 
423 void
424 if_devstart_sched(struct ifnet *ifp)
425 {
426 	ifsq_devstart_sched(ifq_get_subq_default(&ifp->if_snd));
427 }
428 
429 static void
430 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
431 {
432 	lwkt_serialize_enter(ifp->if_serializer);
433 }
434 
435 static void
436 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
437 {
438 	lwkt_serialize_exit(ifp->if_serializer);
439 }
440 
441 static int
442 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
443 {
444 	return lwkt_serialize_try(ifp->if_serializer);
445 }
446 
447 #ifdef INVARIANTS
448 static void
449 if_default_serialize_assert(struct ifnet *ifp,
450 			    enum ifnet_serialize slz __unused,
451 			    boolean_t serialized)
452 {
453 	if (serialized)
454 		ASSERT_SERIALIZED(ifp->if_serializer);
455 	else
456 		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
457 }
458 #endif
459 
460 /*
461  * Attach an interface to the list of "active" interfaces.
462  *
463  * The serializer is optional.
464  */
465 void
466 if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
467 {
468 	unsigned socksize, ifasize;
469 	int namelen, masklen;
470 	struct sockaddr_dl *sdl, *sdl_addr;
471 	struct ifaddr *ifa;
472 	struct ifaltq *ifq;
473 	struct ifnet **old_ifindex2ifnet = NULL;
474 	struct ifnet_array *old_ifnet_array;
475 	int i, q;
476 
477 	static int if_indexlim = 8;
478 
479 	if (ifp->if_serialize != NULL) {
480 		KASSERT(ifp->if_deserialize != NULL &&
481 			ifp->if_tryserialize != NULL &&
482 			ifp->if_serialize_assert != NULL,
483 			("serialize functions are partially setup"));
484 
485 		/*
486 		 * If the device supplies serialize functions,
487 		 * then clear if_serializer to catch any invalid
488 		 * usage of this field.
489 		 */
490 		KASSERT(serializer == NULL,
491 			("both serialize functions and default serializer "
492 			 "are supplied"));
493 		ifp->if_serializer = NULL;
494 	} else {
495 		KASSERT(ifp->if_deserialize == NULL &&
496 			ifp->if_tryserialize == NULL &&
497 			ifp->if_serialize_assert == NULL,
498 			("serialize functions are partially setup"));
499 		ifp->if_serialize = if_default_serialize;
500 		ifp->if_deserialize = if_default_deserialize;
501 		ifp->if_tryserialize = if_default_tryserialize;
502 #ifdef INVARIANTS
503 		ifp->if_serialize_assert = if_default_serialize_assert;
504 #endif
505 
506 		/*
507 		 * The serializer can be passed in from the device,
508 		 * allowing the same serializer to be used for both
509 		 * the interrupt interlock and the device queue.
510 		 * If not specified, the ifnet structure will use an
511 		 * embedded serializer.
512 		 */
513 		if (serializer == NULL) {
514 			serializer = &ifp->if_default_serializer;
515 			lwkt_serialize_init(serializer);
516 		}
517 		ifp->if_serializer = serializer;
518 	}
519 
520 	mtx_init(&ifp->if_ioctl_mtx);
521 
522 	/*
523 	 * XXX -
524 	 * The old code would work if the interface passed a pre-existing
525 	 * chain of ifaddrs to this code.  We don't trust our callers to
526 	 * properly initialize the tailq, however, so we no longer allow
527 	 * this unlikely case.
528 	 */
529 	ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
530 				    M_IFADDR, M_WAITOK | M_ZERO);
531 	for (i = 0; i < ncpus; ++i)
532 		TAILQ_INIT(&ifp->if_addrheads[i]);
533 
534 	TAILQ_INIT(&ifp->if_multiaddrs);
535 	TAILQ_INIT(&ifp->if_groups);
536 	getmicrotime(&ifp->if_lastchange);
537 
538 	/*
539 	 * Create a link-level name for this device.
540 	 */
541 	namelen = strlen(ifp->if_xname);
542 	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
543 	socksize = masklen + ifp->if_addrlen;
544 	if (socksize < sizeof(*sdl))
545 		socksize = sizeof(*sdl);
546 	socksize = RT_ROUNDUP(socksize);
547 	ifasize = sizeof(struct ifaddr) + 2 * socksize;
548 	ifa = ifa_create(ifasize, M_WAITOK);
549 	sdl = sdl_addr = (struct sockaddr_dl *)(ifa + 1);
550 	sdl->sdl_len = socksize;
551 	sdl->sdl_family = AF_LINK;
552 	bcopy(ifp->if_xname, sdl->sdl_data, namelen);
553 	sdl->sdl_nlen = namelen;
554 	sdl->sdl_type = ifp->if_type;
555 	ifp->if_lladdr = ifa;
556 	ifa->ifa_ifp = ifp;
557 	ifa->ifa_rtrequest = link_rtrequest;
558 	ifa->ifa_addr = (struct sockaddr *)sdl;
559 	sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
560 	ifa->ifa_netmask = (struct sockaddr *)sdl;
561 	sdl->sdl_len = masklen;
562 	while (namelen != 0)
563 		sdl->sdl_data[--namelen] = 0xff;
564 	ifa_iflink(ifa, ifp, 0 /* Insert head */);
565 
566 	ifp->if_data_pcpu = kmalloc_cachealign(
567 	    ncpus * sizeof(struct ifdata_pcpu), M_DEVBUF, M_WAITOK | M_ZERO);
568 
569 	if (ifp->if_mapsubq == NULL)
570 		ifp->if_mapsubq = ifq_mapsubq_default;
571 
572 	ifq = &ifp->if_snd;
573 	ifq->altq_type = 0;
574 	ifq->altq_disc = NULL;
575 	ifq->altq_flags &= ALTQF_CANTCHANGE;
576 	ifq->altq_tbr = NULL;
577 	ifq->altq_ifp = ifp;
578 
579 	if (ifq->altq_subq_cnt <= 0)
580 		ifq->altq_subq_cnt = 1;
581 	ifq->altq_subq = kmalloc_cachealign(
582 	    ifq->altq_subq_cnt * sizeof(struct ifaltq_subque),
583 	    M_DEVBUF, M_WAITOK | M_ZERO);
584 
585 	if (ifq->altq_maxlen == 0) {
586 		if_printf(ifp, "driver didn't set altq_maxlen\n");
587 		ifq_set_maxlen(ifq, ifqmaxlen);
588 	}
589 
590 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
591 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
592 
593 		ALTQ_SQ_LOCK_INIT(ifsq);
594 		ifsq->ifsq_index = q;
595 
596 		ifsq->ifsq_altq = ifq;
597 		ifsq->ifsq_ifp = ifp;
598 
599 		ifsq->ifsq_maxlen = ifq->altq_maxlen;
600 		ifsq->ifsq_maxbcnt = ifsq->ifsq_maxlen * MCLBYTES;
601 		ifsq->ifsq_prepended = NULL;
602 		ifsq->ifsq_started = 0;
603 		ifsq->ifsq_hw_oactive = 0;
604 		ifsq_set_cpuid(ifsq, 0);
605 		if (ifp->if_serializer != NULL)
606 			ifsq_set_hw_serialize(ifsq, ifp->if_serializer);
607 
608 		ifsq->ifsq_stage =
609 		    kmalloc_cachealign(ncpus * sizeof(struct ifsubq_stage),
610 		    M_DEVBUF, M_WAITOK | M_ZERO);
611 		for (i = 0; i < ncpus; ++i)
612 			ifsq->ifsq_stage[i].stg_subq = ifsq;
613 
614 		ifsq->ifsq_ifstart_nmsg =
615 		    kmalloc(ncpus * sizeof(struct netmsg_base),
616 		    M_LWKTMSG, M_WAITOK);
617 		for (i = 0; i < ncpus; ++i) {
618 			netmsg_init(&ifsq->ifsq_ifstart_nmsg[i], NULL,
619 			    &netisr_adone_rport, 0, ifsq_ifstart_dispatch);
620 			ifsq->ifsq_ifstart_nmsg[i].lmsg.u.ms_resultp = ifsq;
621 		}
622 	}
623 	ifq_set_classic(ifq);
624 
625 	/*
626 	 * Install this ifp into ifindex2ifnet, the ifnet queue and the
627 	 * ifnet array after it is set up.
628 	 *
629 	 * Protect ifindex2ifnet, ifnet queue and ifnet array changes
630 	 * with the ifnet lock, so that non-netisr threads get a
631 	 * consistent view.
632 	 */
633 	ifnet_lock();
634 
635 	/* Don't update if_index until ifindex2ifnet is set up */
636 	ifp->if_index = if_index + 1;
637 	sdl_addr->sdl_index = ifp->if_index;
638 
639 	/*
640 	 * Install this ifp into ifindex2ifnet
641 	 */
642 	if (ifindex2ifnet == NULL || ifp->if_index >= if_indexlim) {
643 		unsigned int n;
644 		struct ifnet **q;
645 
646 		/*
647 		 * Grow ifindex2ifnet
648 		 */
649 		if_indexlim <<= 1;
650 		n = if_indexlim * sizeof(*q);
651 		q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
652 		if (ifindex2ifnet != NULL) {
653 			bcopy(ifindex2ifnet, q, n/2);
654 			/* Free old ifindex2ifnet after sync all netisrs */
655 			old_ifindex2ifnet = ifindex2ifnet;
656 		}
657 		ifindex2ifnet = q;
658 	}
659 	ifindex2ifnet[ifp->if_index] = ifp;
660 	/*
661 	 * Update if_index after this ifp is installed into ifindex2ifnet,
662 	 * so that netisrs could get a consistent view of ifindex2ifnet.
663 	 */
664 	cpu_sfence();
665 	if_index = ifp->if_index;
666 
667 	/*
668 	 * Install this ifp into ifnet array.
669 	 */
670 	/* Free old ifnet array after sync all netisrs */
671 	old_ifnet_array = ifnet_array;
672 	ifnet_array = ifnet_array_add(ifp, old_ifnet_array);
673 
674 	/*
675 	 * Install this ifp into ifnet queue.
676 	 */
677 	TAILQ_INSERT_TAIL(&ifnetlist, ifp, if_link);
678 
679 	ifnet_unlock();
680 
681 	/*
682 	 * Sync all netisrs so that the old ifindex2ifnet and ifnet array
683 	 * are no longer accessed and we can free them safely later on.
684 	 */
685 	netmsg_service_sync();
686 	if (old_ifindex2ifnet != NULL)
687 		kfree(old_ifindex2ifnet, M_IFADDR);
688 	ifnet_array_free(old_ifnet_array);
689 
690 	if (!SLIST_EMPTY(&domains))
691 		if_attachdomain1(ifp);
692 
693 	/* Announce the interface. */
694 	EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
695 	devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);
696 	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
697 }
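
/*
 * Illustrative use only (hypothetical driver code): most ethernet
 * drivers do not call if_attach() directly but reach it through
 * ether_ifattach(), passing the serializer that also interlocks
 * their interrupt handler:
 *
 *	ether_ifattach(ifp, sc->sc_enaddr, serializer);
 *
 * Passing a NULL serializer makes if_attach() fall back to the
 * embedded default serializer as described above.
 */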
698 
699 static void
700 if_attachdomain(void *dummy)
701 {
702 	struct ifnet *ifp;
703 
704 	ifnet_lock();
705 	TAILQ_FOREACH(ifp, &ifnetlist, if_link)
706 		if_attachdomain1(ifp);
707 	ifnet_unlock();
708 }
709 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
710 	if_attachdomain, NULL);
711 
712 static void
713 if_attachdomain1(struct ifnet *ifp)
714 {
715 	struct domain *dp;
716 
717 	crit_enter();
718 
719 	/* address family dependent data region */
720 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
721 	SLIST_FOREACH(dp, &domains, dom_next)
722 		if (dp->dom_ifattach)
723 			ifp->if_afdata[dp->dom_family] =
724 				(*dp->dom_ifattach)(ifp);
725 	crit_exit();
726 }
727 
728 /*
729  * Purge all addresses whose type is _not_ AF_LINK
730  */
731 static void
732 if_purgeaddrs_nolink_dispatch(netmsg_t nmsg)
733 {
734 	struct lwkt_msg *lmsg = &nmsg->lmsg;
735 	struct ifnet *ifp = lmsg->u.ms_resultp;
736 	struct ifaddr_container *ifac, *next;
737 
738 	KASSERT(&curthread->td_msgport == netisr_cpuport(0),
739 	    ("not in netisr0"));
740 
741 	/*
742 	 * The ifaddr processing in the following loop may block;
743 	 * however, this function is called in netisr0, in which
744 	 * ifaddr list changes happen, so the blocking of the
745 	 * ifaddr processing is harmless here.
746 	 */
747 	TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
748 			      ifa_link, next) {
749 		struct ifaddr *ifa = ifac->ifa;
750 
751 		/* Ignore marker */
752 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
753 			continue;
754 
755 		/* Leave link ifaddr as it is */
756 		if (ifa->ifa_addr->sa_family == AF_LINK)
757 			continue;
758 #ifdef INET
759 		/* XXX: Ugly!! ad hoc just for INET */
760 		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
761 			struct ifaliasreq ifr;
762 #ifdef IFADDR_DEBUG_VERBOSE
763 			int i;
764 
765 			kprintf("purge in4 addr %p: ", ifa);
766 			for (i = 0; i < ncpus; ++i)
767 				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
768 			kprintf("\n");
769 #endif
770 
771 			bzero(&ifr, sizeof ifr);
772 			ifr.ifra_addr = *ifa->ifa_addr;
773 			if (ifa->ifa_dstaddr)
774 				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
775 			if (in_control(SIOCDIFADDR, (caddr_t)&ifr, ifp,
776 				       NULL) == 0)
777 				continue;
778 		}
779 #endif /* INET */
780 #ifdef INET6
781 		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
782 #ifdef IFADDR_DEBUG_VERBOSE
783 			int i;
784 
785 			kprintf("purge in6 addr %p: ", ifa);
786 			for (i = 0; i < ncpus; ++i)
787 				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
788 			kprintf("\n");
789 #endif
790 
791 			in6_purgeaddr(ifa);
792 			/* ifp_addrhead is already updated */
793 			continue;
794 		}
795 #endif /* INET6 */
796 		ifa_ifunlink(ifa, ifp);
797 		ifa_destroy(ifa);
798 	}
799 
800 	lwkt_replymsg(lmsg, 0);
801 }
802 
803 void
804 if_purgeaddrs_nolink(struct ifnet *ifp)
805 {
806 	struct netmsg_base nmsg;
807 	struct lwkt_msg *lmsg = &nmsg.lmsg;
808 
809 	ASSERT_CANDOMSG_NETISR0(curthread);
810 
811 	netmsg_init(&nmsg, NULL, &curthread->td_msgport, 0,
812 	    if_purgeaddrs_nolink_dispatch);
813 	lmsg->u.ms_resultp = ifp;
814 	lwkt_domsg(netisr_cpuport(0), lmsg, 0);
815 }
816 
817 static void
818 ifq_stage_detach_handler(netmsg_t nmsg)
819 {
820 	struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
821 	int q;
822 
823 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
824 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
825 		struct ifsubq_stage *stage = ifsq_get_stage(ifsq, mycpuid);
826 
827 		if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED)
828 			ifsq_stage_remove(&ifsubq_stage_heads[mycpuid], stage);
829 	}
830 	lwkt_replymsg(&nmsg->lmsg, 0);
831 }
832 
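/*
 * Tear down this ifq's per-cpu staging state.  The staging heads are
 * strictly per-cpu, so the removal must be performed in each netisr
 * in turn; a synchronous message is sent to every cpu, one after
 * another.
 */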
833 static void
834 ifq_stage_detach(struct ifaltq *ifq)
835 {
836 	struct netmsg_base base;
837 	int cpu;
838 
839 	netmsg_init(&base, NULL, &curthread->td_msgport, 0,
840 	    ifq_stage_detach_handler);
841 	base.lmsg.u.ms_resultp = ifq;
842 
843 	for (cpu = 0; cpu < ncpus; ++cpu)
844 		lwkt_domsg(netisr_cpuport(cpu), &base.lmsg, 0);
845 }
846 
847 struct netmsg_if_rtdel {
848 	struct netmsg_base	base;
849 	struct ifnet		*ifp;
850 };
851 
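/*
 * Delete all routes referencing the detaching ifp from this cpu's
 * routing tables, then forward the message to the next cpu, so that
 * every per-cpu routing table is walked exactly once.
 */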
852 static void
853 if_rtdel_dispatch(netmsg_t msg)
854 {
855 	struct netmsg_if_rtdel *rmsg = (void *)msg;
856 	int i, nextcpu, cpu;
857 
858 	cpu = mycpuid;
859 	for (i = 1; i <= AF_MAX; i++) {
860 		struct radix_node_head	*rnh;
861 
862 		if ((rnh = rt_tables[cpu][i]) == NULL)
863 			continue;
864 		rnh->rnh_walktree(rnh, if_rtdel, rmsg->ifp);
865 	}
866 
867 	nextcpu = cpu + 1;
868 	if (nextcpu < ncpus)
869 		lwkt_forwardmsg(netisr_cpuport(nextcpu), &rmsg->base.lmsg);
870 	else
871 		lwkt_replymsg(&rmsg->base.lmsg, 0);
872 }
873 
874 /*
875  * Detach an interface, removing it from the
876  * list of "active" interfaces.
877  */
878 void
879 if_detach(struct ifnet *ifp)
880 {
881 	struct ifnet_array *old_ifnet_array;
882 	struct netmsg_if_rtdel msg;
883 	struct domain *dp;
884 	int q;
885 
886 	/* Announce that the interface is gone. */
887 	EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
888 	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
889 	devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);
890 
891 	/*
892 	 * Remove this ifp from ifindex2ifnet, the ifnet queue and the
893 	 * ifnet array before it is whacked.
894 	 *
895 	 * Protect ifindex2ifnet, ifnet queue and ifnet array changes
896 	 * with the ifnet lock, so that non-netisr threads get a
897 	 * consistent view.
898 	 */
899 	ifnet_lock();
900 
901 	/*
902 	 * Remove this ifp from ifindex2ifnet and maybe decrement if_index.
903 	 */
904 	ifindex2ifnet[ifp->if_index] = NULL;
905 	while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
906 		if_index--;
907 
908 	/*
909 	 * Remove this ifp from ifnet queue.
910 	 */
911 	TAILQ_REMOVE(&ifnetlist, ifp, if_link);
912 
913 	/*
914 	 * Remove this ifp from ifnet array.
915 	 */
916 	/* Free old ifnet array after sync all netisrs */
917 	old_ifnet_array = ifnet_array;
918 	ifnet_array = ifnet_array_del(ifp, old_ifnet_array);
919 
920 	ifnet_unlock();
921 
922 	/*
923 	 * Sync all netisrs so that the old ifnet array is no longer
924 	 * accessed and we can free it safely later on.
925 	 */
926 	netmsg_service_sync();
927 	ifnet_array_free(old_ifnet_array);
928 
929 	/*
930 	 * Remove routes and flush queues.
931 	 */
932 	crit_enter();
933 #ifdef IFPOLL_ENABLE
934 	if (ifp->if_flags & IFF_NPOLLING)
935 		ifpoll_deregister(ifp);
936 #endif
937 	if_down(ifp);
938 
939 #ifdef ALTQ
940 	if (ifq_is_enabled(&ifp->if_snd))
941 		altq_disable(&ifp->if_snd);
942 	if (ifq_is_attached(&ifp->if_snd))
943 		altq_detach(&ifp->if_snd);
944 #endif
945 
946 	/*
947 	 * Clean up all addresses.
948 	 */
949 	ifp->if_lladdr = NULL;
950 
951 	if_purgeaddrs_nolink(ifp);
952 	if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
953 		struct ifaddr *ifa;
954 
955 		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
956 		KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
957 			("non-link ifaddr is left on if_addrheads"));
958 
959 		ifa_ifunlink(ifa, ifp);
960 		ifa_destroy(ifa);
961 		KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
962 			("there are still ifaddrs left on if_addrheads"));
963 	}
964 
965 #ifdef INET
966 	/*
967 	 * Remove all IPv4 kernel structures related to ifp.
968 	 */
969 	in_ifdetach(ifp);
970 #endif
971 
972 #ifdef INET6
973 	/*
974 	 * Remove all IPv6 kernel structs related to ifp.  This should be done
975 	 * before removing routing entries below, since IPv6 interface direct
976 	 * routes are expected to be removed by the IPv6-specific kernel API.
977 	 * Otherwise, the kernel will detect the inconsistency and complain.
978 	 */
979 	in6_ifdetach(ifp);
980 #endif
981 
982 	/*
983 	 * Delete all remaining routes using this interface
984 	 */
985 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, MSGF_PRIORITY,
986 	    if_rtdel_dispatch);
987 	msg.ifp = ifp;
988 	rt_domsg_global(&msg.base);
989 
990 	SLIST_FOREACH(dp, &domains, dom_next)
991 		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
992 			(*dp->dom_ifdetach)(ifp,
993 				ifp->if_afdata[dp->dom_family]);
994 
995 	kfree(ifp->if_addrheads, M_IFADDR);
996 
997 	lwkt_synchronize_ipiqs("if_detach");
998 	ifq_stage_detach(&ifp->if_snd);
999 
1000 	for (q = 0; q < ifp->if_snd.altq_subq_cnt; ++q) {
1001 		struct ifaltq_subque *ifsq = &ifp->if_snd.altq_subq[q];
1002 
1003 		kfree(ifsq->ifsq_ifstart_nmsg, M_LWKTMSG);
1004 		kfree(ifsq->ifsq_stage, M_DEVBUF);
1005 	}
1006 	kfree(ifp->if_snd.altq_subq, M_DEVBUF);
1007 
1008 	kfree(ifp->if_data_pcpu, M_DEVBUF);
1009 
1010 	crit_exit();
1011 }
1012 
1013 /*
1014  * Create interface group without members
1015  */
1016 struct ifg_group *
1017 if_creategroup(const char *groupname)
1018 {
1019 	struct ifg_group	*ifg = NULL;
1020 
1021 	if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
1022 	    M_TEMP, M_NOWAIT)) == NULL)
1023 		return (NULL);
1024 
1025 	strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
1026 	ifg->ifg_refcnt = 0;
1027 	ifg->ifg_carp_demoted = 0;
1028 	TAILQ_INIT(&ifg->ifg_members);
1029 #if NPF > 0
1030 	pfi_attach_ifgroup(ifg);
1031 #endif
1032 	TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
1033 
1034 	return (ifg);
1035 }
1036 
1037 /*
1038  * Add a group to an interface
1039  */
1040 int
1041 if_addgroup(struct ifnet *ifp, const char *groupname)
1042 {
1043 	struct ifg_list		*ifgl;
1044 	struct ifg_group	*ifg = NULL;
1045 	struct ifg_member	*ifgm;
1046 
1047 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
1048 	    groupname[strlen(groupname) - 1] <= '9')
1049 		return (EINVAL);
1050 
1051 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1052 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
1053 			return (EEXIST);
1054 
1055 	if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
1056 		return (ENOMEM);
1057 
1058 	if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
1059 		kfree(ifgl, M_TEMP);
1060 		return (ENOMEM);
1061 	}
1062 
1063 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1064 		if (!strcmp(ifg->ifg_group, groupname))
1065 			break;
1066 
1067 	if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
1068 		kfree(ifgl, M_TEMP);
1069 		kfree(ifgm, M_TEMP);
1070 		return (ENOMEM);
1071 	}
1072 
1073 	ifg->ifg_refcnt++;
1074 	ifgl->ifgl_group = ifg;
1075 	ifgm->ifgm_ifp = ifp;
1076 
1077 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
1078 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
1079 
1080 #if NPF > 0
1081 	pfi_group_change(groupname);
1082 #endif
1083 
1084 	return (0);
1085 }
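
/*
 * Illustrative use only: cloned interfaces typically join the group
 * named after their interface class at creation time, e.g. a
 * hypothetical
 *
 *	if_addgroup(ifp, "tap");
 *
 * The trailing-digit check above guarantees that group names can
 * never collide with unit names such as "tap0".
 */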
1086 
1087 /*
1088  * Remove a group from an interface
1089  */
1090 int
1091 if_delgroup(struct ifnet *ifp, const char *groupname)
1092 {
1093 	struct ifg_list		*ifgl;
1094 	struct ifg_member	*ifgm;
1095 
1096 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1097 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
1098 			break;
1099 	if (ifgl == NULL)
1100 		return (ENOENT);
1101 
1102 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
1103 
1104 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
1105 		if (ifgm->ifgm_ifp == ifp)
1106 			break;
1107 
1108 	if (ifgm != NULL) {
1109 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
1110 		kfree(ifgm, M_TEMP);
1111 	}
1112 
1113 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
1114 		TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
1115 #if NPF > 0
1116 		pfi_detach_ifgroup(ifgl->ifgl_group);
1117 #endif
1118 		kfree(ifgl->ifgl_group, M_TEMP);
1119 	}
1120 
1121 	kfree(ifgl, M_TEMP);
1122 
1123 #if NPF > 0
1124 	pfi_group_change(groupname);
1125 #endif
1126 
1127 	return (0);
1128 }
1129 
1130 /*
1131  * Stores all groups from an interface in memory pointed
1132  * to by data
1133  */
1134 int
1135 if_getgroup(caddr_t data, struct ifnet *ifp)
1136 {
1137 	int			 len, error;
1138 	struct ifg_list		*ifgl;
1139 	struct ifg_req		 ifgrq, *ifgp;
1140 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1141 
1142 	if (ifgr->ifgr_len == 0) {
1143 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
1144 			ifgr->ifgr_len += sizeof(struct ifg_req);
1145 		return (0);
1146 	}
1147 
1148 	len = ifgr->ifgr_len;
1149 	ifgp = ifgr->ifgr_groups;
1150 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
1151 		if (len < sizeof(ifgrq))
1152 			return (EINVAL);
1153 		bzero(&ifgrq, sizeof ifgrq);
1154 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
1155 		    sizeof(ifgrq.ifgrq_group));
1156 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1157 		    sizeof(struct ifg_req))))
1158 			return (error);
1159 		len -= sizeof(ifgrq);
1160 		ifgp++;
1161 	}
1162 
1163 	return (0);
1164 }
1165 
1166 /*
1167  * Stores all members of a group in memory pointed to by data
1168  */
1169 int
1170 if_getgroupmembers(caddr_t data)
1171 {
1172 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1173 	struct ifg_group	*ifg;
1174 	struct ifg_member	*ifgm;
1175 	struct ifg_req		 ifgrq, *ifgp;
1176 	int			 len, error;
1177 
1178 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1179 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1180 			break;
1181 	if (ifg == NULL)
1182 		return (ENOENT);
1183 
1184 	if (ifgr->ifgr_len == 0) {
1185 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1186 			ifgr->ifgr_len += sizeof(ifgrq);
1187 		return (0);
1188 	}
1189 
1190 	len = ifgr->ifgr_len;
1191 	ifgp = ifgr->ifgr_groups;
1192 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1193 		if (len < sizeof(ifgrq))
1194 			return (EINVAL);
1195 		bzero(&ifgrq, sizeof ifgrq);
1196 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1197 		    sizeof(ifgrq.ifgrq_member));
1198 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1199 		    sizeof(struct ifg_req))))
1200 			return (error);
1201 		len -= sizeof(ifgrq);
1202 		ifgp++;
1203 	}
1204 
1205 	return (0);
1206 }
1207 
1208 /*
1209  * Delete Routes for a Network Interface
1210  *
1211  * Called for each routing entry via the rnh->rnh_walktree() call above
1212  * to delete all route entries referencing a detaching network interface.
1213  *
1214  * Arguments:
1215  *	rn	pointer to node in the routing table
1216  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
1217  *
1218  * Returns:
1219  *	0	successful
1220  *	errno	failed - reason indicated
1221  *
1222  */
1223 static int
1224 if_rtdel(struct radix_node *rn, void *arg)
1225 {
1226 	struct rtentry	*rt = (struct rtentry *)rn;
1227 	struct ifnet	*ifp = arg;
1228 	int		err;
1229 
1230 	if (rt->rt_ifp == ifp) {
1231 
1232 		/*
1233 		 * Protect (sorta) against walktree recursion problems
1234 		 * with cloned routes
1235 		 */
1236 		if (!(rt->rt_flags & RTF_UP))
1237 			return (0);
1238 
1239 		err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1240 				rt_mask(rt), rt->rt_flags,
1241 				NULL);
1242 		if (err) {
1243 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
1244 		}
1245 	}
1246 
1247 	return (0);
1248 }
1249 
1250 /*
1251  * Locate an interface based on a complete address.
1252  */
1253 struct ifaddr *
1254 ifa_ifwithaddr(struct sockaddr *addr)
1255 {
1256 	const struct ifnet_array *arr;
1257 	int i;
1258 
1259 	arr = ifnet_array_get();
1260 	for (i = 0; i < arr->ifnet_count; ++i) {
1261 		struct ifnet *ifp = arr->ifnet_arr[i];
1262 		struct ifaddr_container *ifac;
1263 
1264 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1265 			struct ifaddr *ifa = ifac->ifa;
1266 
1267 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1268 				continue;
1269 			if (sa_equal(addr, ifa->ifa_addr))
1270 				return (ifa);
1271 			if ((ifp->if_flags & IFF_BROADCAST) &&
1272 			    ifa->ifa_broadaddr &&
1273 			    /* IPv6 doesn't have broadcast */
1274 			    ifa->ifa_broadaddr->sa_len != 0 &&
1275 			    sa_equal(ifa->ifa_broadaddr, addr))
1276 				return (ifa);
1277 		}
1278 	}
1279 	return (NULL);
1280 }
1281 /*
1282  * Locate the point to point interface with a given destination address.
1283  */
1284 struct ifaddr *
1285 ifa_ifwithdstaddr(struct sockaddr *addr)
1286 {
1287 	const struct ifnet_array *arr;
1288 	int i;
1289 
1290 	arr = ifnet_array_get();
1291 	for (i = 0; i < arr->ifnet_count; ++i) {
1292 		struct ifnet *ifp = arr->ifnet_arr[i];
1293 		struct ifaddr_container *ifac;
1294 
1295 		if (!(ifp->if_flags & IFF_POINTOPOINT))
1296 			continue;
1297 
1298 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1299 			struct ifaddr *ifa = ifac->ifa;
1300 
1301 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1302 				continue;
1303 			if (ifa->ifa_dstaddr &&
1304 			    sa_equal(addr, ifa->ifa_dstaddr))
1305 				return (ifa);
1306 		}
1307 	}
1308 	return (NULL);
1309 }
1310 
1311 /*
1312  * Find an interface on a specific network.  If several match,
1313  * the most specific one found is chosen.
1314  */
1315 struct ifaddr *
1316 ifa_ifwithnet(struct sockaddr *addr)
1317 {
1318 	struct ifaddr *ifa_maybe = NULL;
1319 	u_int af = addr->sa_family;
1320 	char *addr_data = addr->sa_data, *cplim;
1321 	const struct ifnet_array *arr;
1322 	int i;
1323 
1324 	/*
1325 	 * AF_LINK addresses can be looked up directly by their index number,
1326 	 * so do that if we can.
1327 	 */
1328 	if (af == AF_LINK) {
1329 		struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;
1330 
1331 		if (sdl->sdl_index && sdl->sdl_index <= if_index)
1332 			return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
1333 	}
1334 
1335 	/*
1336  * Scan through each interface, looking for ones that have
1337 	 * addresses in this address family.
1338 	 */
1339 	arr = ifnet_array_get();
1340 	for (i = 0; i < arr->ifnet_count; ++i) {
1341 		struct ifnet *ifp = arr->ifnet_arr[i];
1342 		struct ifaddr_container *ifac;
1343 
1344 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1345 			struct ifaddr *ifa = ifac->ifa;
1346 			char *cp, *cp2, *cp3;
1347 
1348 			if (ifa->ifa_addr->sa_family != af)
1349 next:				continue;
1350 			if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
1351 				/*
1352 				 * This is a bit broken as it doesn't
1353 				 * take into account that the remote end may
1354 				 * be a single node in the network we are
1355 				 * looking for.
1356 				 * The trouble is that we don't know the
1357 				 * netmask for the remote end.
1358 				 */
1359 				if (ifa->ifa_dstaddr != NULL &&
1360 				    sa_equal(addr, ifa->ifa_dstaddr))
1361 					return (ifa);
1362 			} else {
1363 				/*
1364 				 * if we have a special address handler,
1365 				 * then use it instead of the generic one.
1366 				 */
1367 				if (ifa->ifa_claim_addr) {
1368 					if ((*ifa->ifa_claim_addr)(ifa, addr)) {
1369 						return (ifa);
1370 					} else {
1371 						continue;
1372 					}
1373 				}
1374 
1375 				/*
1376 				 * Scan all the bits in the ifa's address.
1377 				 * If a bit disagrees with what we are
1378 				 * looking for, mask it with the netmask
1379 				 * to see if it really matters.
1380 				 * (A byte at a time)
1381 				 */
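				/*
				 * Worked example (illustrative):
				 * matching 192.0.2.77 against an ifa
				 * with address 192.0.2.1 and netmask
				 * 255.255.255.0 XORs the last address
				 * byte to a non-zero value, but the
				 * zero mask byte discards it; every
				 * masked byte agrees, so the loop falls
				 * through and the prefix matches.
				 */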
1382 				if (ifa->ifa_netmask == NULL)
1383 					continue;
1384 				cp = addr_data;
1385 				cp2 = ifa->ifa_addr->sa_data;
1386 				cp3 = ifa->ifa_netmask->sa_data;
1387 				cplim = ifa->ifa_netmask->sa_len +
1388 					(char *)ifa->ifa_netmask;
1389 				while (cp3 < cplim)
1390 					if ((*cp++ ^ *cp2++) & *cp3++)
1391 						goto next; /* next address! */
1392 				/*
1393 				 * If the netmask of what we just found
1394 				 * is more specific than what we had before
1395 				 * (if we had one) then remember the new one
1396 				 * before continuing to search
1397 				 * for an even better one.
1398 				 */
1399 				if (ifa_maybe == NULL ||
1400 				    rn_refines((char *)ifa->ifa_netmask,
1401 					       (char *)ifa_maybe->ifa_netmask))
1402 					ifa_maybe = ifa;
1403 			}
1404 		}
1405 	}
1406 	return (ifa_maybe);
1407 }
1408 
1409 /*
1410  * Find an interface address specific to an interface best matching
1411  * a given address.
1412  */
1413 struct ifaddr *
1414 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1415 {
1416 	struct ifaddr_container *ifac;
1417 	char *cp, *cp2, *cp3;
1418 	char *cplim;
1419 	struct ifaddr *ifa_maybe = NULL;
1420 	u_int af = addr->sa_family;
1421 
1422 	if (af >= AF_MAX)
1423 		return (0);
1424 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1425 		struct ifaddr *ifa = ifac->ifa;
1426 
1427 		if (ifa->ifa_addr->sa_family != af)
1428 			continue;
1429 		if (ifa_maybe == NULL)
1430 			ifa_maybe = ifa;
1431 		if (ifa->ifa_netmask == NULL) {
1432 			if (sa_equal(addr, ifa->ifa_addr) ||
1433 			    (ifa->ifa_dstaddr != NULL &&
1434 			     sa_equal(addr, ifa->ifa_dstaddr)))
1435 				return (ifa);
1436 			continue;
1437 		}
1438 		if (ifp->if_flags & IFF_POINTOPOINT) {
1439 			if (sa_equal(addr, ifa->ifa_dstaddr))
1440 				return (ifa);
1441 		} else {
1442 			cp = addr->sa_data;
1443 			cp2 = ifa->ifa_addr->sa_data;
1444 			cp3 = ifa->ifa_netmask->sa_data;
1445 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1446 			for (; cp3 < cplim; cp3++)
1447 				if ((*cp++ ^ *cp2++) & *cp3)
1448 					break;
1449 			if (cp3 == cplim)
1450 				return (ifa);
1451 		}
1452 	}
1453 	return (ifa_maybe);
1454 }
1455 
1456 /*
1457  * Default action when installing a route with a Link Level gateway.
1458  * Look up an appropriate real ifa to point to.
1459  * This should be moved to /sys/net/link.c eventually.
1460  */
1461 static void
1462 link_rtrequest(int cmd, struct rtentry *rt)
1463 {
1464 	struct ifaddr *ifa;
1465 	struct sockaddr *dst;
1466 	struct ifnet *ifp;
1467 
1468 	if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1469 	    (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1470 		return;
1471 	ifa = ifaof_ifpforaddr(dst, ifp);
1472 	if (ifa != NULL) {
1473 		IFAFREE(rt->rt_ifa);
1474 		IFAREF(ifa);
1475 		rt->rt_ifa = ifa;
1476 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1477 			ifa->ifa_rtrequest(cmd, rt);
1478 	}
1479 }
1480 
1481 struct netmsg_ifroute {
1482 	struct netmsg_base	base;
1483 	struct ifnet		*ifp;
1484 	int			flag;
1485 	int			fam;
1486 };
1487 
1488 /*
1489  * Mark an interface down and notify protocols of the transition.
1490  */
1491 static void
1492 if_unroute_dispatch(netmsg_t nmsg)
1493 {
1494 	struct netmsg_ifroute *msg = (struct netmsg_ifroute *)nmsg;
1495 	struct ifnet *ifp = msg->ifp;
1496 	int flag = msg->flag, fam = msg->fam;
1497 	struct ifaddr_container *ifac;
1498 
1499 	ifp->if_flags &= ~flag;
1500 	getmicrotime(&ifp->if_lastchange);
1501 	/*
1502 	 * The ifaddr processing in the following loop may block;
1503 	 * however, this function is called in netisr0, in which
1504 	 * ifaddr list changes happen, so the blocking of the
1505 	 * ifaddr processing is harmless here.
1506 	 */
1507 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1508 		struct ifaddr *ifa = ifac->ifa;
1509 
1510 		/* Ignore marker */
1511 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
1512 			continue;
1513 
1514 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1515 			kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1516 	}
1517 	ifq_purge_all(&ifp->if_snd);
1518 	rt_ifmsg(ifp);
1519 
1520 	lwkt_replymsg(&nmsg->lmsg, 0);
1521 }
1522 
1523 void
1524 if_unroute(struct ifnet *ifp, int flag, int fam)
1525 {
1526 	struct netmsg_ifroute msg;
1527 
1528 	ASSERT_CANDOMSG_NETISR0(curthread);
1529 
1530 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, 0,
1531 	    if_unroute_dispatch);
1532 	msg.ifp = ifp;
1533 	msg.flag = flag;
1534 	msg.fam = fam;
1535 	lwkt_domsg(netisr_cpuport(0), &msg.base.lmsg, 0);
1536 }
1537 
1538 /*
1539  * Mark an interface up and notify protocols of the transition.
1540  */
1541 static void
1542 if_route_dispatch(netmsg_t nmsg)
1543 {
1544 	struct netmsg_ifroute *msg = (struct netmsg_ifroute *)nmsg;
1545 	struct ifnet *ifp = msg->ifp;
1546 	int flag = msg->flag, fam = msg->fam;
1547 	struct ifaddr_container *ifac;
1548 
1549 	ifq_purge_all(&ifp->if_snd);
1550 	ifp->if_flags |= flag;
1551 	getmicrotime(&ifp->if_lastchange);
1552 	/*
1553 	 * The ifaddr processing in the following loop may block;
1554 	 * however, this function is called in netisr0, in which
1555 	 * ifaddr list changes happen, so the blocking of the
1556 	 * ifaddr processing is harmless here.
1557 	 */
1558 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1559 		struct ifaddr *ifa = ifac->ifa;
1560 
1561 		/* Ignore marker */
1562 		if (ifa->ifa_addr->sa_family == AF_UNSPEC)
1563 			continue;
1564 
1565 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1566 			kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1567 	}
1568 	rt_ifmsg(ifp);
1569 #ifdef INET6
1570 	in6_if_up(ifp);
1571 #endif
1572 
1573 	lwkt_replymsg(&nmsg->lmsg, 0);
1574 }
1575 
1576 void
1577 if_route(struct ifnet *ifp, int flag, int fam)
1578 {
1579 	struct netmsg_ifroute msg;
1580 
1581 	ASSERT_CANDOMSG_NETISR0(curthread);
1582 
1583 	netmsg_init(&msg.base, NULL, &curthread->td_msgport, 0,
1584 	    if_route_dispatch);
1585 	msg.ifp = ifp;
1586 	msg.flag = flag;
1587 	msg.fam = fam;
1588 	lwkt_domsg(netisr_cpuport(0), &msg.base.lmsg, 0);
1589 }
1590 
1591 /*
1592  * Mark an interface down and notify protocols of the transition.  An
1593  * interface going down is also considered to be a synchronizing event.
1594  * We must ensure that all packet processing related to the interface
1595  * has completed before we return so e.g. the caller can free the ifnet
1596  * structure that the mbufs may be referencing.
1597  *
1598  * NOTE: must be called at splnet or equivalent.
1599  */
1600 void
1601 if_down(struct ifnet *ifp)
1602 {
1603 	if_unroute(ifp, IFF_UP, AF_UNSPEC);
1604 	netmsg_service_sync();
1605 }
1606 
1607 /*
1608  * Mark an interface up and notify protocols of
1609  * the transition.
1610  * NOTE: must be called at splnet or equivalent.
1611  */
1612 void
1613 if_up(struct ifnet *ifp)
1614 {
1615 	if_route(ifp, IFF_UP, AF_UNSPEC);
1616 }
1617 
1618 /*
1619  * Process a link state change.
1620  * NOTE: must be called at splsoftnet or equivalent.
1621  */
1622 void
1623 if_link_state_change(struct ifnet *ifp)
1624 {
1625 	int link_state = ifp->if_link_state;
1626 
1627 	rt_ifmsg(ifp);
1628 	devctl_notify("IFNET", ifp->if_xname,
1629 	    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1630 }
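
/*
 * Illustrative use only (hypothetical driver code): a driver's link
 * state handler updates if_link_state first and then announces the
 * change:
 *
 *	ifp->if_link_state = LINK_STATE_UP;
 *	if_link_state_change(ifp);
 */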
1631 
1632 /*
1633  * Handle the interface watchdog timers.  Called from
1634  * softclock, this decrements each timer (if set) and
1635  * calls the appropriate interface routine on expiration.
1636  */
1637 static void
1638 if_slowtimo_dispatch(netmsg_t nmsg)
1639 {
1640 	struct globaldata *gd = mycpu;
1641 	const struct ifnet_array *arr;
1642 	int i;
1643 
1644 	KASSERT(&curthread->td_msgport == netisr_cpuport(0),
1645 	    ("not in netisr0"));
1646 
1647 	crit_enter_gd(gd);
1648 	lwkt_replymsg(&nmsg->lmsg, 0);  /* reply ASAP */
1649 	crit_exit_gd(gd);
1650 
1651 	arr = ifnet_array_get();
1652 	for (i = 0; i < arr->ifnet_count; ++i) {
1653 		struct ifnet *ifp = arr->ifnet_arr[i];
1654 
1655 		crit_enter_gd(gd);
1656 
1657 		if (if_stats_compat) {
1658 			IFNET_STAT_GET(ifp, ipackets, ifp->if_ipackets);
1659 			IFNET_STAT_GET(ifp, ierrors, ifp->if_ierrors);
1660 			IFNET_STAT_GET(ifp, opackets, ifp->if_opackets);
1661 			IFNET_STAT_GET(ifp, oerrors, ifp->if_oerrors);
1662 			IFNET_STAT_GET(ifp, collisions, ifp->if_collisions);
1663 			IFNET_STAT_GET(ifp, ibytes, ifp->if_ibytes);
1664 			IFNET_STAT_GET(ifp, obytes, ifp->if_obytes);
1665 			IFNET_STAT_GET(ifp, imcasts, ifp->if_imcasts);
1666 			IFNET_STAT_GET(ifp, omcasts, ifp->if_omcasts);
1667 			IFNET_STAT_GET(ifp, iqdrops, ifp->if_iqdrops);
1668 			IFNET_STAT_GET(ifp, noproto, ifp->if_noproto);
1669 		}
1670 
1671 		if (ifp->if_timer == 0 || --ifp->if_timer) {
1672 			crit_exit_gd(gd);
1673 			continue;
1674 		}
1675 		if (ifp->if_watchdog) {
1676 			if (ifnet_tryserialize_all(ifp)) {
1677 				(*ifp->if_watchdog)(ifp);
1678 				ifnet_deserialize_all(ifp);
1679 			} else {
1680 				/* try again next timeout */
1681 				++ifp->if_timer;
1682 			}
1683 		}
1684 
1685 		crit_exit_gd(gd);
1686 	}
1687 
1688 	callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1689 }
1690 
1691 static void
1692 if_slowtimo(void *arg __unused)
1693 {
1694 	struct lwkt_msg *lmsg = &if_slowtimo_netmsg.lmsg;
1695 
1696 	KASSERT(mycpuid == 0, ("not on cpu0"));
1697 	crit_enter();
1698 	if (lmsg->ms_flags & MSGF_DONE)
1699 		lwkt_sendmsg_oncpu(netisr_cpuport(0), lmsg);
1700 	crit_exit();
1701 }
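
/*
 * Illustrative use only (hypothetical driver code): a driver arms the
 * watchdog when it hands frames to the hardware and disarms it once
 * the hardware has drained:
 *
 *	ifp->if_timer = 5;	(in foo_start(), ~5 seconds of grace)
 *	...
 *	ifp->if_timer = 0;	(in foo_txeof(), all descriptors done)
 *
 * When the countdown in if_slowtimo_dispatch() above hits zero,
 * if_watchdog is invoked with the ifnet fully serialized.
 */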
1702 
1703 /*
1704  * Map interface name to
1705  * interface structure pointer.
1706  */
1707 struct ifnet *
1708 ifunit(const char *name)
1709 {
1710 	struct ifnet *ifp;
1711 
1712 	/*
1713 	 * Search all the interfaces for this name/number
1714 	 */
1715 	KASSERT(mtx_owned(&ifnet_mtx), ("ifnet is not locked"));
1716 
1717 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
1718 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1719 			break;
1720 	}
1721 	return (ifp);
1722 }
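
/*
 * Illustrative use only: callers must hold the ifnet lock across both
 * the lookup and any use of the returned pointer, e.g.
 *
 *	ifnet_lock();
 *	ifp = ifunit("em0");	(the name "em0" is just an example)
 *	if (ifp != NULL)
 *		...
 *	ifnet_unlock();
 */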
1723 
1724 struct ifnet *
1725 ifunit_netisr(const char *name)
1726 {
1727 	const struct ifnet_array *arr;
1728 	int i;
1729 
1730 	/*
1731 	 * Search all the interfaces for this name/number
1732 	 */
1733 
1734 	arr = ifnet_array_get();
1735 	for (i = 0; i < arr->ifnet_count; ++i) {
1736 		struct ifnet *ifp = arr->ifnet_arr[i];
1737 
1738 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1739 			return ifp;
1740 	}
1741 	return NULL;
1742 }
1743 
1744 /*
1745  * Interface ioctls.
1746  */
1747 int
1748 ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
1749 {
1750 	struct ifnet *ifp;
1751 	struct ifreq *ifr;
1752 	struct ifstat *ifs;
1753 	int error;
1754 	short oif_flags;
1755 	int new_flags;
1756 #ifdef COMPAT_43
1757 	int ocmd;
1758 #endif
1759 	size_t namelen, onamelen;
1760 	char new_name[IFNAMSIZ];
1761 	struct ifaddr *ifa;
1762 	struct sockaddr_dl *sdl;
1763 
1764 	switch (cmd) {
1765 	case SIOCGIFCONF:
1766 	case OSIOCGIFCONF:
1767 		return (ifconf(cmd, data, cred));
1768 	default:
1769 		break;
1770 	}
1771 
1772 	ifr = (struct ifreq *)data;
1773 
1774 	switch (cmd) {
1775 	case SIOCIFCREATE:
1776 	case SIOCIFCREATE2:
1777 		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1778 			return (error);
1779 		return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
1780 			cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
1781 	case SIOCIFDESTROY:
1782 		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
1783 			return (error);
1784 		return (if_clone_destroy(ifr->ifr_name));
1785 	case SIOCIFGCLONERS:
1786 		return (if_clone_list((struct if_clonereq *)data));
1787 	default:
1788 		break;
1789 	}
1790 
1791 	/*
1792 	 * Nominal ioctl through the interface: look up the ifp and obtain a
1793 	 * lock to serialize the ifconfig ioctl operation.
1794 	 */
1795 	ifnet_lock();
1796 
1797 	ifp = ifunit(ifr->ifr_name);
1798 	if (ifp == NULL) {
1799 		ifnet_unlock();
1800 		return (ENXIO);
1801 	}
1802 	error = 0;
1803 
1804 	switch (cmd) {
1805 	case SIOCGIFINDEX:
1806 		ifr->ifr_index = ifp->if_index;
1807 		break;
1808 
1809 	case SIOCGIFFLAGS:
1810 		ifr->ifr_flags = ifp->if_flags;
1811 		ifr->ifr_flagshigh = ifp->if_flags >> 16;
1812 		break;
1813 
1814 	case SIOCGIFCAP:
1815 		ifr->ifr_reqcap = ifp->if_capabilities;
1816 		ifr->ifr_curcap = ifp->if_capenable;
1817 		break;
1818 
1819 	case SIOCGIFMETRIC:
1820 		ifr->ifr_metric = ifp->if_metric;
1821 		break;
1822 
1823 	case SIOCGIFMTU:
1824 		ifr->ifr_mtu = ifp->if_mtu;
1825 		break;
1826 
1827 	case SIOCGIFTSOLEN:
1828 		ifr->ifr_tsolen = ifp->if_tsolen;
1829 		break;
1830 
1831 	case SIOCGIFDATA:
1832 		error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
1833 				sizeof(ifp->if_data));
1834 		break;
1835 
1836 	case SIOCGIFPHYS:
1837 		ifr->ifr_phys = ifp->if_physical;
1838 		break;
1839 
1840 	case SIOCGIFPOLLCPU:
1841 		ifr->ifr_pollcpu = -1;
1842 		break;
1843 
1844 	case SIOCSIFPOLLCPU:
1845 		break;
1846 
1847 	case SIOCSIFFLAGS:
1848 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1849 		if (error)
1850 			break;
1851 		new_flags = (ifr->ifr_flags & 0xffff) |
1852 		    (ifr->ifr_flagshigh << 16);
1853 		if (ifp->if_flags & IFF_SMART) {
1854 			/* Smart drivers twiddle their own routes */
1855 		} else if (ifp->if_flags & IFF_UP &&
1856 		    (new_flags & IFF_UP) == 0) {
1857 			crit_enter();
1858 			if_down(ifp);
1859 			crit_exit();
1860 		} else if (new_flags & IFF_UP &&
1861 		    (ifp->if_flags & IFF_UP) == 0) {
1862 			crit_enter();
1863 			if_up(ifp);
1864 			crit_exit();
1865 		}
1866 
1867 #ifdef IFPOLL_ENABLE
1868 		if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
1869 			if (new_flags & IFF_NPOLLING)
1870 				ifpoll_register(ifp);
1871 			else
1872 				ifpoll_deregister(ifp);
1873 		}
1874 #endif
1875 
1876 		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
1877 			(new_flags &~ IFF_CANTCHANGE);
1878 		if (new_flags & IFF_PPROMISC) {
1879 			/* Permanently promiscuous mode requested */
1880 			ifp->if_flags |= IFF_PROMISC;
1881 		} else if (ifp->if_pcount == 0) {
1882 			ifp->if_flags &= ~IFF_PROMISC;
1883 		}
1884 		if (ifp->if_ioctl) {
1885 			ifnet_serialize_all(ifp);
1886 			ifp->if_ioctl(ifp, cmd, data, cred);
1887 			ifnet_deserialize_all(ifp);
1888 		}
1889 		getmicrotime(&ifp->if_lastchange);
1890 		break;
1891 
1892 	case SIOCSIFCAP:
1893 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1894 		if (error)
1895 			break;
1896 		if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
1897 			error = EINVAL;
1898 			break;
1899 		}
1900 		ifnet_serialize_all(ifp);
1901 		ifp->if_ioctl(ifp, cmd, data, cred);
1902 		ifnet_deserialize_all(ifp);
1903 		break;
1904 
1905 	case SIOCSIFNAME:
1906 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1907 		if (error)
1908 			break;
1909 		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
1910 		if (error)
1911 			break;
1912 		if (new_name[0] == '\0') {
1913 			error = EINVAL;
1914 			break;
1915 		}
1916 		if (ifunit(new_name) != NULL) {
1917 			error = EEXIST;
1918 			break;
1919 		}
1920 
1921 		EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);
1922 
1923 		/* Announce the departure of the interface. */
1924 		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
1925 
1926 		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
1927 		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
1928 		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
1929 		namelen = strlen(new_name);
1930 		onamelen = sdl->sdl_nlen;
1931 		/*
1932 		 * Move the address if needed.  This is safe because we
1933 		 * allocate space for a name of length IFNAMSIZ when we
1934 		 * create this in if_attach().
1935 		 */
1936 		if (namelen != onamelen) {
1937 			bcopy(sdl->sdl_data + onamelen,
1938 			    sdl->sdl_data + namelen, sdl->sdl_alen);
1939 		}
1940 		bcopy(new_name, sdl->sdl_data, namelen);
1941 		sdl->sdl_nlen = namelen;
1942 		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
1943 		bzero(sdl->sdl_data, onamelen);
1944 		while (namelen != 0)
1945 			sdl->sdl_data[--namelen] = 0xff;
1946 
1947 		EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
1948 
1949 		/* Announce the return of the interface. */
1950 		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
1951 		break;
1952 
1953 	case SIOCSIFMETRIC:
1954 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1955 		if (error)
1956 			break;
1957 		ifp->if_metric = ifr->ifr_metric;
1958 		getmicrotime(&ifp->if_lastchange);
1959 		break;
1960 
1961 	case SIOCSIFPHYS:
1962 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1963 		if (error)
1964 			break;
1965 		if (ifp->if_ioctl == NULL) {
1966 			error = EOPNOTSUPP;
1967 			break;
1968 		}
1969 		ifnet_serialize_all(ifp);
1970 		error = ifp->if_ioctl(ifp, cmd, data, cred);
1971 		ifnet_deserialize_all(ifp);
1972 		if (error == 0)
1973 			getmicrotime(&ifp->if_lastchange);
1974 		break;
1975 
1976 	case SIOCSIFMTU:
1977 	{
1978 		u_long oldmtu = ifp->if_mtu;
1979 
1980 		error = priv_check_cred(cred, PRIV_ROOT, 0);
1981 		if (error)
1982 			break;
1983 		if (ifp->if_ioctl == NULL) {
1984 			error = EOPNOTSUPP;
1985 			break;
1986 		}
1987 		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
1988 			error = EINVAL;
1989 			break;
1990 		}
1991 		ifnet_serialize_all(ifp);
1992 		error = ifp->if_ioctl(ifp, cmd, data, cred);
1993 		ifnet_deserialize_all(ifp);
1994 		if (error == 0) {
1995 			getmicrotime(&ifp->if_lastchange);
1996 			rt_ifmsg(ifp);
1997 		}
1998 		/*
1999 		 * If the link MTU changed, do network layer specific procedure.
2000 		 */
2001 		if (ifp->if_mtu != oldmtu) {
2002 #ifdef INET6
2003 			nd6_setmtu(ifp);
2004 #endif
2005 		}
2006 		break;
2007 	}
2008 
2009 	case SIOCSIFTSOLEN:
2010 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2011 		if (error)
2012 			break;
2013 
2014 		/* XXX need driver supplied upper limit */
2015 		if (ifr->ifr_tsolen <= 0) {
2016 			error = EINVAL;
2017 			break;
2018 		}
2019 		ifp->if_tsolen = ifr->ifr_tsolen;
2020 		break;
2021 
2022 	case SIOCADDMULTI:
2023 	case SIOCDELMULTI:
2024 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2025 		if (error)
2026 			break;
2027 
2028 		/* Don't allow group membership on non-multicast interfaces. */
2029 		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
2030 			error = EOPNOTSUPP;
2031 			break;
2032 		}
2033 
2034 		/* Don't let users screw up protocols' entries. */
2035 		if (ifr->ifr_addr.sa_family != AF_LINK) {
2036 			error = EINVAL;
2037 			break;
2038 		}
2039 
2040 		if (cmd == SIOCADDMULTI) {
2041 			struct ifmultiaddr *ifma;
2042 			error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
2043 		} else {
2044 			error = if_delmulti(ifp, &ifr->ifr_addr);
2045 		}
2046 		if (error == 0)
2047 			getmicrotime(&ifp->if_lastchange);
2048 		break;
2049 
2050 	case SIOCSIFPHYADDR:
2051 	case SIOCDIFPHYADDR:
2052 #ifdef INET6
2053 	case SIOCSIFPHYADDR_IN6:
2054 #endif
2055 	case SIOCSLIFPHYADDR:
2056 	case SIOCSIFMEDIA:
2057 	case SIOCSIFGENERIC:
2058 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2059 		if (error)
2060 			break;
2061 		if (ifp->if_ioctl == NULL) {
2062 			error = EOPNOTSUPP;
2063 			break;
2064 		}
2065 		ifnet_serialize_all(ifp);
2066 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2067 		ifnet_deserialize_all(ifp);
2068 		if (error == 0)
2069 			getmicrotime(&ifp->if_lastchange);
2070 		break;
2071 
2072 	case SIOCGIFSTATUS:
2073 		ifs = (struct ifstat *)data;
2074 		ifs->ascii[0] = '\0';
2075 		/* fall through */
2076 	case SIOCGIFPSRCADDR:
2077 	case SIOCGIFPDSTADDR:
2078 	case SIOCGLIFPHYADDR:
2079 	case SIOCGIFMEDIA:
2080 	case SIOCGIFGENERIC:
2081 		if (ifp->if_ioctl == NULL) {
2082 			error = EOPNOTSUPP;
2083 			break;
2084 		}
2085 		ifnet_serialize_all(ifp);
2086 		error = ifp->if_ioctl(ifp, cmd, data, cred);
2087 		ifnet_deserialize_all(ifp);
2088 		break;
2089 
2090 	case SIOCSIFLLADDR:
2091 		error = priv_check_cred(cred, PRIV_ROOT, 0);
2092 		if (error)
2093 			break;
2094 		error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
2095 				     ifr->ifr_addr.sa_len);
2096 		EVENTHANDLER_INVOKE(iflladdr_event, ifp);
2097 		break;
2098 
2099 	default:
2100 		oif_flags = ifp->if_flags;
2101 		if (so->so_proto == NULL) {
2102 			error = EOPNOTSUPP;
2103 			break;
2104 		}
2105 #ifndef COMPAT_43
2106 		error = so_pru_control_direct(so, cmd, data, ifp);
2107 #else
2108 		ocmd = cmd;
2109 
2110 		switch (cmd) {
2111 		case SIOCSIFDSTADDR:
2112 		case SIOCSIFADDR:
2113 		case SIOCSIFBRDADDR:
2114 		case SIOCSIFNETMASK:
2115 #if BYTE_ORDER != BIG_ENDIAN
2116 			if (ifr->ifr_addr.sa_family == 0 &&
2117 			    ifr->ifr_addr.sa_len < 16) {
2118 				ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
2119 				ifr->ifr_addr.sa_len = 16;
2120 			}
2121 #else
2122 			if (ifr->ifr_addr.sa_len == 0)
2123 				ifr->ifr_addr.sa_len = 16;
2124 #endif
2125 			break;
2126 		case OSIOCGIFADDR:
2127 			cmd = SIOCGIFADDR;
2128 			break;
2129 		case OSIOCGIFDSTADDR:
2130 			cmd = SIOCGIFDSTADDR;
2131 			break;
2132 		case OSIOCGIFBRDADDR:
2133 			cmd = SIOCGIFBRDADDR;
2134 			break;
2135 		case OSIOCGIFNETMASK:
2136 			cmd = SIOCGIFNETMASK;
2137 			break;
2138 		default:
2139 			break;
2140 		}
2141 
2142 		error = so_pru_control_direct(so, cmd, data, ifp);
2143 
2144 		switch (ocmd) {
2145 		case OSIOCGIFADDR:
2146 		case OSIOCGIFDSTADDR:
2147 		case OSIOCGIFBRDADDR:
2148 		case OSIOCGIFNETMASK:
2149 			*(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
2150 			break;
2151 		}
2152 #endif /* COMPAT_43 */
2153 
2154 		if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
2155 #ifdef INET6
2156 			DELAY(100);	/* XXX: temporary workaround for fxp issue */
2157 			if (ifp->if_flags & IFF_UP) {
2158 				crit_enter();
2159 				in6_if_up(ifp);
2160 				crit_exit();
2161 			}
2162 #endif
2163 		}
2164 		break;
2165 	}
2166 
2167 	ifnet_unlock();
2168 	return (error);
2169 }
2170 
2171 /*
2172  * Set/clear promiscuous mode on interface ifp based on the truth value
2173  * of pswitch.  The calls are reference counted so that only the first
2174  * "on" request actually has an effect, as does the final "off" request.
2175  * Results are undefined if the "off" and "on" requests are not matched.
2176  */
2177 int
2178 ifpromisc(struct ifnet *ifp, int pswitch)
2179 {
2180 	struct ifreq ifr;
2181 	int error;
2182 	int oldflags;
2183 
2184 	oldflags = ifp->if_flags;
2185 	if (ifp->if_flags & IFF_PPROMISC) {
2186 		/* Do nothing if device is in permanently promiscuous mode */
2187 		ifp->if_pcount += pswitch ? 1 : -1;
2188 		return (0);
2189 	}
2190 	if (pswitch) {
2191 		/*
2192 		 * If the device is not configured up, we cannot put it in
2193 		 * promiscuous mode.
2194 		 */
2195 		if ((ifp->if_flags & IFF_UP) == 0)
2196 			return (ENETDOWN);
2197 		if (ifp->if_pcount++ != 0)
2198 			return (0);
2199 		ifp->if_flags |= IFF_PROMISC;
2200 		log(LOG_INFO, "%s: promiscuous mode enabled\n",
2201 		    ifp->if_xname);
2202 	} else {
2203 		if (--ifp->if_pcount > 0)
2204 			return (0);
2205 		ifp->if_flags &= ~IFF_PROMISC;
2206 		log(LOG_INFO, "%s: promiscuous mode disabled\n",
2207 		    ifp->if_xname);
2208 	}
2209 	ifr.ifr_flags = ifp->if_flags;
2210 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
2211 	ifnet_serialize_all(ifp);
2212 	error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
2213 	ifnet_deserialize_all(ifp);
2214 	if (error == 0)
2215 		rt_ifmsg(ifp);
2216 	else
2217 		ifp->if_flags = oldflags;
2218 	return error;
2219 }
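
/*
 * A minimal usage sketch (hypothetical consumer, not compiled): each "on"
 * request must eventually be matched by an "off" request.  Only the first
 * "on" and the last "off" actually reach the driver.
 */
#if 0
static int
example_tap_open(struct ifnet *ifp)
{
	/* First "on" request sets IFF_PROMISC and notifies the driver. */
	return ifpromisc(ifp, 1);
}

static void
example_tap_close(struct ifnet *ifp)
{
	/* Final matching "off" request clears IFF_PROMISC again. */
	ifpromisc(ifp, 0);
}
#endif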
2220 
2221 /*
2222  * Return the interface configuration
2223  * of the system.  The list may be used
2224  * in later ioctls (above) to get
2225  * other information.
2226  */
2227 static int
2228 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
2229 {
2230 	struct ifconf *ifc = (struct ifconf *)data;
2231 	struct ifnet *ifp;
2232 	struct sockaddr *sa;
2233 	struct ifreq ifr, *ifrp;
2234 	int space = ifc->ifc_len, error = 0;
2235 
2236 	ifrp = ifc->ifc_req;
2237 
2238 	ifnet_lock();
2239 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
2240 		struct ifaddr_container *ifac, *ifac_mark;
2241 		struct ifaddr_marker mark;
2242 		struct ifaddrhead *head;
2243 		int addrs;
2244 
2245 		if (space <= sizeof ifr)
2246 			break;
2247 
2248 		/*
2249 		 * Zero the stack declared structure first to prevent
2250 		 * memory disclosure.
2251 		 */
2252 		bzero(&ifr, sizeof(ifr));
2253 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
2254 		    >= sizeof(ifr.ifr_name)) {
2255 			error = ENAMETOOLONG;
2256 			break;
2257 		}
2258 
2259 		/*
2260 		 * Add a marker, since copyout() could block and during that
2261 		 * period the list could be changed.  Inserting the marker at
2262 		 * the head of the list will not trouble code that assumes the
2263 		 * first element of the list is AF_LINK; the marker will be
2264 		 * moved to the next position w/o blocking.
2265 		 */
2266 		ifa_marker_init(&mark, ifp);
2267 		ifac_mark = &mark.ifac;
2268 		head = &ifp->if_addrheads[mycpuid];
2269 
2270 		addrs = 0;
2271 		TAILQ_INSERT_HEAD(head, ifac_mark, ifa_link);
2272 		while ((ifac = TAILQ_NEXT(ifac_mark, ifa_link)) != NULL) {
2273 			struct ifaddr *ifa = ifac->ifa;
2274 
2275 			TAILQ_REMOVE(head, ifac_mark, ifa_link);
2276 			TAILQ_INSERT_AFTER(head, ifac, ifac_mark, ifa_link);
2277 
2278 			/* Ignore marker */
2279 			if (ifa->ifa_addr->sa_family == AF_UNSPEC)
2280 				continue;
2281 
2282 			if (space <= sizeof ifr)
2283 				break;
2284 			sa = ifa->ifa_addr;
2285 			if (cred->cr_prison &&
2286 			    prison_if(cred, sa))
2287 				continue;
2288 			addrs++;
2289 			/*
2290 			 * Keep a reference on this ifaddr, so that it will
2291 			 * not be destroyed when its address is copied to
2292 			 * the userland, which could block.
2293 			 */
2294 			IFAREF(ifa);
2295 #ifdef COMPAT_43
2296 			if (cmd == OSIOCGIFCONF) {
2297 				struct osockaddr *osa =
2298 					 (struct osockaddr *)&ifr.ifr_addr;
2299 				ifr.ifr_addr = *sa;
2300 				osa->sa_family = sa->sa_family;
2301 				error = copyout(&ifr, ifrp, sizeof ifr);
2302 				ifrp++;
2303 			} else
2304 #endif
2305 			if (sa->sa_len <= sizeof(*sa)) {
2306 				ifr.ifr_addr = *sa;
2307 				error = copyout(&ifr, ifrp, sizeof ifr);
2308 				ifrp++;
2309 			} else {
2310 				if (space < (sizeof ifr) + sa->sa_len -
2311 					    sizeof(*sa)) {
2312 					IFAFREE(ifa);
2313 					break;
2314 				}
2315 				space -= sa->sa_len - sizeof(*sa);
2316 				error = copyout(&ifr, ifrp,
2317 						sizeof ifr.ifr_name);
2318 				if (error == 0)
2319 					error = copyout(sa, &ifrp->ifr_addr,
2320 							sa->sa_len);
2321 				ifrp = (struct ifreq *)
2322 					(sa->sa_len + (caddr_t)&ifrp->ifr_addr);
2323 			}
2324 			IFAFREE(ifa);
2325 			if (error)
2326 				break;
2327 			space -= sizeof ifr;
2328 		}
2329 		TAILQ_REMOVE(head, ifac_mark, ifa_link);
2330 		if (error)
2331 			break;
2332 		if (!addrs) {
2333 			bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2334 			error = copyout(&ifr, ifrp, sizeof ifr);
2335 			if (error)
2336 				break;
2337 			space -= sizeof ifr;
2338 			ifrp++;
2339 		}
2340 	}
2341 	ifnet_unlock();
2342 
2343 	ifc->ifc_len -= space;
2344 	return (error);
2345 }
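
/*
 * A minimal userland sketch (hypothetical program, not compiled) of the
 * consumer side of SIOCGIFCONF: the caller supplies a buffer, and the
 * kernel shrinks ifc_len to the number of bytes actually filled in.
 */
#if 0
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <net/if.h>
#include <stdio.h>
#include <unistd.h>

static void
example_dump_ifconf(void)
{
	char buf[8192];
	struct ifconf ifc;
	int s = socket(AF_INET, SOCK_DGRAM, 0);

	if (s < 0)
		return;
	ifc.ifc_len = sizeof(buf);
	ifc.ifc_buf = buf;
	if (ioctl(s, SIOCGIFCONF, &ifc) == 0) {
		/* ifc.ifc_len now holds the bytes used by the ifreq list. */
		printf("got %d bytes of ifreqs\n", ifc.ifc_len);
	}
	close(s);
}
#endif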
2346 
2347 /*
2348  * Just like ifpromisc(), but for all-multicast-reception mode.
2349  */
2350 int
2351 if_allmulti(struct ifnet *ifp, int onswitch)
2352 {
2353 	int error = 0;
2354 	struct ifreq ifr;
2355 
2356 	crit_enter();
2357 
2358 	if (onswitch) {
2359 		if (ifp->if_amcount++ == 0) {
2360 			ifp->if_flags |= IFF_ALLMULTI;
2361 			ifr.ifr_flags = ifp->if_flags;
2362 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2363 			ifnet_serialize_all(ifp);
2364 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2365 					      NULL);
2366 			ifnet_deserialize_all(ifp);
2367 		}
2368 	} else {
2369 		if (ifp->if_amcount > 1) {
2370 			ifp->if_amcount--;
2371 		} else {
2372 			ifp->if_amcount = 0;
2373 			ifp->if_flags &= ~IFF_ALLMULTI;
2374 			ifr.ifr_flags = ifp->if_flags;
2375 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2376 			ifnet_serialize_all(ifp);
2377 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2378 					      NULL);
2379 			ifnet_deserialize_all(ifp);
2380 		}
2381 	}
2382 
2383 	crit_exit();
2384 
2385 	if (error == 0)
2386 		rt_ifmsg(ifp);
2387 	return error;
2388 }
2389 
2390 /*
2391  * Add a multicast listenership to the interface in question.
2392  * The link layer provides a routine which converts the group address to its link-layer form, if needed.
2393  */
2394 int
2395 if_addmulti_serialized(struct ifnet *ifp, struct sockaddr *sa,
2396     struct ifmultiaddr **retifma)
2397 {
2398 	struct sockaddr *llsa, *dupsa;
2399 	int error;
2400 	struct ifmultiaddr *ifma;
2401 
2402 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2403 
2404 	/*
2405 	 * If the matching multicast address already exists
2406 	 * then don't add a new one, just add a reference
2407 	 */
2408 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2409 		if (sa_equal(sa, ifma->ifma_addr)) {
2410 			ifma->ifma_refcount++;
2411 			if (retifma)
2412 				*retifma = ifma;
2413 			return 0;
2414 		}
2415 	}
2416 
2417 	/*
2418 	 * Give the link layer a chance to accept/reject it, and also
2419 	 * find out which AF_LINK address this maps to, if it isn't one
2420 	 * already.
2421 	 */
2422 	if (ifp->if_resolvemulti) {
2423 		error = ifp->if_resolvemulti(ifp, &llsa, sa);
2424 		if (error)
2425 			return error;
2426 	} else {
2427 		llsa = NULL;
2428 	}
2429 
2430 	ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2431 	dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
2432 	bcopy(sa, dupsa, sa->sa_len);
2433 
2434 	ifma->ifma_addr = dupsa;
2435 	ifma->ifma_lladdr = llsa;
2436 	ifma->ifma_ifp = ifp;
2437 	ifma->ifma_refcount = 1;
2438 	ifma->ifma_protospec = NULL;
2439 	rt_newmaddrmsg(RTM_NEWMADDR, ifma);
2440 
2441 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2442 	if (retifma)
2443 		*retifma = ifma;
2444 
2445 	if (llsa != NULL) {
2446 		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
2447 			if (sa_equal(ifma->ifma_addr, llsa))
2448 				break;
2449 		}
2450 		if (ifma) {
2451 			ifma->ifma_refcount++;
2452 		} else {
2453 			ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
2454 			dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
2455 			bcopy(llsa, dupsa, llsa->sa_len);
2456 			ifma->ifma_addr = dupsa;
2457 			ifma->ifma_ifp = ifp;
2458 			ifma->ifma_refcount = 1;
2459 			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
2460 		}
2461 	}
2462 	/*
2463 	 * We are certain we have added something, so call down to the
2464 	 * interface to let it know about it.
2465 	 */
2466 	if (ifp->if_ioctl)
2467 		ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
2468 
2469 	return 0;
2470 }
2471 
2472 int
2473 if_addmulti(struct ifnet *ifp, struct sockaddr *sa,
2474     struct ifmultiaddr **retifma)
2475 {
2476 	int error;
2477 
2478 	ifnet_serialize_all(ifp);
2479 	error = if_addmulti_serialized(ifp, sa, retifma);
2480 	ifnet_deserialize_all(ifp);
2481 
2482 	return error;
2483 }
2484 
2485 /*
2486  * Remove a reference to a multicast address on this interface.  Yell
2487  * if the request does not match an existing membership.
2488  */
2489 static int
2490 if_delmulti_serialized(struct ifnet *ifp, struct sockaddr *sa)
2491 {
2492 	struct ifmultiaddr *ifma;
2493 
2494 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2495 
2496 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2497 		if (sa_equal(sa, ifma->ifma_addr))
2498 			break;
2499 	if (ifma == NULL)
2500 		return ENOENT;
2501 
2502 	if (ifma->ifma_refcount > 1) {
2503 		ifma->ifma_refcount--;
2504 		return 0;
2505 	}
2506 
2507 	rt_newmaddrmsg(RTM_DELMADDR, ifma);
2508 	sa = ifma->ifma_lladdr;
2509 	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2510 	/*
2511 	 * Make sure the interface driver is notified
2512 	 * in the case of a link layer mcast group being left.
2513 	 */
2514 	if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL)
2515 		ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2516 	kfree(ifma->ifma_addr, M_IFMADDR);
2517 	kfree(ifma, M_IFMADDR);
2518 	if (sa == NULL)
2519 		return 0;
2520 
2521 	/*
2522 	 * Now look for the link-layer address which corresponds to
2523 	 * this network address.  It had been squirreled away in
2524 	 * ifma->ifma_lladdr for this purpose (so we don't have
2525 	 * to call ifp->if_resolvemulti() again), and we saved that
2526 	 * value in sa above.  If something nasty deleted the
2527 	 * link-layer address out from underneath us, we can deal because
2528 	 * the address we stored is not the same as the one which was
2529 	 * in the record for the link-layer address.  (So we don't complain
2530 	 * in that case.)
2531 	 */
2532 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2533 		if (sa_equal(sa, ifma->ifma_addr))
2534 			break;
2535 	if (ifma == NULL)
2536 		return 0;
2537 
2538 	if (ifma->ifma_refcount > 1) {
2539 		ifma->ifma_refcount--;
2540 		return 0;
2541 	}
2542 
2543 	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
2544 	ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
2545 	kfree(ifma->ifma_addr, M_IFMADDR);
2546 	kfree(sa, M_IFMADDR);
2547 	kfree(ifma, M_IFMADDR);
2548 
2549 	return 0;
2550 }
2551 
2552 int
2553 if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
2554 {
2555 	int error;
2556 
2557 	ifnet_serialize_all(ifp);
2558 	error = if_delmulti_serialized(ifp, sa);
2559 	ifnet_deserialize_all(ifp);
2560 
2561 	return error;
2562 }
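
/*
 * A minimal sketch (hypothetical caller, not compiled) of the
 * reference-counted pairing above: adding the same address twice only
 * bumps the refcount, so two matching deletes are needed before the
 * membership is actually torn down.
 */
#if 0
static int
example_join_leave(struct ifnet *ifp, struct sockaddr *group)
{
	struct ifmultiaddr *ifma;
	int error;

	error = if_addmulti(ifp, group, &ifma);	/* refcount 1 */
	if (error)
		return error;
	error = if_addmulti(ifp, group, &ifma);	/* refcount 2 */
	if (error == 0) {
		if_delmulti(ifp, group);	/* refcount back to 1 */
		if_delmulti(ifp, group);	/* membership removed */
	}
	return error;
}
#endif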
2563 
2564 /*
2565  * Delete all multicast group membership for an interface.
2566  * Should be used to quickly flush all multicast filters.
2567  */
2568 void
2569 if_delallmulti_serialized(struct ifnet *ifp)
2570 {
2571 	struct ifmultiaddr *ifma, mark;
2572 	struct sockaddr sa;
2573 
2574 	ASSERT_IFNET_SERIALIZED_ALL(ifp);
2575 
2576 	bzero(&sa, sizeof(sa));
2577 	sa.sa_family = AF_UNSPEC;
2578 	sa.sa_len = sizeof(sa);
2579 
2580 	bzero(&mark, sizeof(mark));
2581 	mark.ifma_addr = &sa;
2582 
2583 	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, &mark, ifma_link);
2584 	while ((ifma = TAILQ_NEXT(&mark, ifma_link)) != NULL) {
2585 		TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2586 		TAILQ_INSERT_AFTER(&ifp->if_multiaddrs, ifma, &mark,
2587 		    ifma_link);
2588 
2589 		if (ifma->ifma_addr->sa_family == AF_UNSPEC)
2590 			continue;
2591 
2592 		if_delmulti_serialized(ifp, ifma->ifma_addr);
2593 	}
2594 	TAILQ_REMOVE(&ifp->if_multiaddrs, &mark, ifma_link);
2595 }
2596 
2597 
2598 /*
2599  * Set the link layer address on an interface.
2600  *
2601  * At this time we only support certain types of interfaces,
2602  * and we don't allow the length of the address to change.
2603  */
2604 int
2605 if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
2606 {
2607 	struct sockaddr_dl *sdl;
2608 	struct ifreq ifr;
2609 
2610 	sdl = IF_LLSOCKADDR(ifp);
2611 	if (sdl == NULL)
2612 		return (EINVAL);
2613 	if (len != sdl->sdl_alen)	/* don't allow length to change */
2614 		return (EINVAL);
2615 	switch (ifp->if_type) {
2616 	case IFT_ETHER:			/* these types use struct arpcom */
2617 	case IFT_XETHER:
2618 	case IFT_L2VLAN:
2619 	case IFT_IEEE8023ADLAG:
2620 		bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
2621 		bcopy(lladdr, LLADDR(sdl), len);
2622 		break;
2623 	default:
2624 		return (ENODEV);
2625 	}
2626 	/*
2627 	 * If the interface is already up, we need
2628 	 * to re-init it in order to reprogram its
2629 	 * address filter.
2630 	 */
2631 	ifnet_serialize_all(ifp);
2632 	if ((ifp->if_flags & IFF_UP) != 0) {
2633 #ifdef INET
2634 		struct ifaddr_container *ifac;
2635 #endif
2636 
2637 		ifp->if_flags &= ~IFF_UP;
2638 		ifr.ifr_flags = ifp->if_flags;
2639 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
2640 		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2641 			      NULL);
2642 		ifp->if_flags |= IFF_UP;
2643 		ifr.ifr_flags = ifp->if_flags;
2644 		ifr.ifr_flagshigh = ifp->if_flags >> 16;
2645 		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2646 			      NULL);
2647 #ifdef INET
2648 		/*
2649 		 * Also send gratuitous ARPs to notify other nodes about
2650 		 * the address change.
2651 		 */
2652 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
2653 			struct ifaddr *ifa = ifac->ifa;
2654 
2655 			if (ifa->ifa_addr != NULL &&
2656 			    ifa->ifa_addr->sa_family == AF_INET)
2657 				arp_gratuitous(ifp, ifa);
2658 		}
2659 #endif
2660 	}
2661 	ifnet_deserialize_all(ifp);
2662 	return (0);
2663 }
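
/*
 * A minimal sketch (hypothetical driver code, not compiled): changing the
 * MAC of an ethernet-style interface.  The new address must have the same
 * length as the old one, or EINVAL is returned; if the interface is up it
 * is re-initialized and gratuitous ARPs are sent as described above.
 */
#if 0
static int
example_set_mac(struct ifnet *ifp)
{
	/* A locally administered example address. */
	static const u_char mac[6] = { 0x02, 0x00, 0x00, 0x00, 0x00, 0x01 };

	return if_setlladdr(ifp, mac, sizeof(mac));
}
#endif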
2664 
2665 struct ifmultiaddr *
2666 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2667 {
2668 	struct ifmultiaddr *ifma;
2669 
2670 	/* TODO: need ifnet_serialize_main */
2671 	ifnet_serialize_all(ifp);
2672 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2673 		if (sa_equal(ifma->ifma_addr, sa))
2674 			break;
2675 	ifnet_deserialize_all(ifp);
2676 
2677 	return ifma;
2678 }
2679 
2680 /*
2681  * This function locates the first real ethernet MAC from a network
2682  * card and loads it into node, returning 0 on success or ENOENT if
2683  * no suitable interfaces were found.  It is used by the uuid code to
2684  * generate a unique 6-byte number.
2685  */
2686 int
2687 if_getanyethermac(uint16_t *node, int minlen)
2688 {
2689 	struct ifnet *ifp;
2690 	struct sockaddr_dl *sdl;
2691 
2692 	ifnet_lock();
2693 	TAILQ_FOREACH(ifp, &ifnetlist, if_link) {
2694 		if (ifp->if_type != IFT_ETHER)
2695 			continue;
2696 		sdl = IF_LLSOCKADDR(ifp);
2697 		if (sdl->sdl_alen < minlen)
2698 			continue;
2699 		bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2700 		      minlen);
2701 		ifnet_unlock();
2702 		return(0);
2703 		return (0);
2704 	ifnet_unlock();
2705 	return (ENOENT);
2706 }
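
/*
 * A minimal sketch of the uuid-style use mentioned above (hypothetical
 * caller, not compiled): fetch the first available ethernet MAC as the
 * 6-byte node field, zeroing it when no suitable interface exists yet.
 */
#if 0
static void
example_uuid_node(uint16_t node[3])
{
	if (if_getanyethermac(node, 6) != 0) {
		/* No ethernet interface yet; caller must synthesize one. */
		node[0] = node[1] = node[2] = 0;
	}
}
#endif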
2707 
2708 /*
2709  * The name argument must be a pointer to storage which will last as
2710  * long as the interface does.  For physical devices, the result of
2711  * device_get_name(dev) is a good choice and for pseudo-devices a
2712  * static string works well.
2713  */
2714 void
2715 if_initname(struct ifnet *ifp, const char *name, int unit)
2716 {
2717 	ifp->if_dname = name;
2718 	ifp->if_dunit = unit;
2719 	if (unit != IF_DUNIT_NONE)
2720 		ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2721 	else
2722 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
2723 }
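
/*
 * A minimal sketch (hypothetical attach code, not compiled): a physical
 * device passes device_get_name()/device_get_unit(), while a pseudo-device
 * passes a static string, optionally with IF_DUNIT_NONE to suppress the
 * unit number.
 */
#if 0
static void
example_initname(struct ifnet *ifp, device_t dev)
{
	/* Physical device: name and unit come from the device. */
	if_initname(ifp, device_get_name(dev), device_get_unit(dev));

	/* Pseudo-device: static string, no unit number in the name. */
	if_initname(ifp, "examplepseudo", IF_DUNIT_NONE);
}
#endif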
2724 
2725 int
2726 if_printf(struct ifnet *ifp, const char *fmt, ...)
2727 {
2728 	__va_list ap;
2729 	int retval;
2730 
2731 	retval = kprintf("%s: ", ifp->if_xname);
2732 	__va_start(ap, fmt);
2733 	retval += kvprintf(fmt, ap);
2734 	__va_end(ap);
2735 	return (retval);
2736 }
2737 
2738 struct ifnet *
2739 if_alloc(uint8_t type)
2740 {
2741 	struct ifnet *ifp;
2742 	size_t size;
2743 
2744 	/*
2745 	 * XXX temporary hack until arpcom is setup in if_l2com
2746 	 */
2747 	if (type == IFT_ETHER)
2748 		size = sizeof(struct arpcom);
2749 	else
2750 		size = sizeof(struct ifnet);
2751 
2752 	ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2753 
2754 	ifp->if_type = type;
2755 
2756 	if (if_com_alloc[type] != NULL) {
2757 		ifp->if_l2com = if_com_alloc[type](type, ifp);
2758 		if (ifp->if_l2com == NULL) {
2759 			kfree(ifp, M_IFNET);
2760 			return (NULL);
2761 		}
2762 	}
2763 	return (ifp);
2764 }
2765 
2766 void
2767 if_free(struct ifnet *ifp)
2768 {
2769 	kfree(ifp, M_IFNET);
2770 }
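
/*
 * A minimal sketch of the allocation pairing (hypothetical driver code,
 * not compiled): if_alloc() returns either a bare ifnet or, for IFT_ETHER
 * via the arpcom hack above, a larger embedding structure; if_free()
 * releases it again on an attach failure.
 */
#if 0
static struct ifnet *
example_alloc(void)
{
	struct ifnet *ifp;

	ifp = if_alloc(IFT_ETHER);
	if (ifp == NULL)		/* if_com_alloc hook failed */
		return NULL;
	/* ... set up if_xname and methods, then attach; on failure: */
	/* if_free(ifp); */
	return ifp;
}
#endif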
2771 
2772 void
2773 ifq_set_classic(struct ifaltq *ifq)
2774 {
2775 	ifq_set_methods(ifq, ifq->altq_ifp->if_mapsubq,
2776 	    ifsq_classic_enqueue, ifsq_classic_dequeue, ifsq_classic_request);
2777 }
2778 
2779 void
2780 ifq_set_methods(struct ifaltq *ifq, altq_mapsubq_t mapsubq,
2781     ifsq_enqueue_t enqueue, ifsq_dequeue_t dequeue, ifsq_request_t request)
2782 {
2783 	int q;
2784 
2785 	KASSERT(mapsubq != NULL, ("mapsubq is not specified"));
2786 	KASSERT(enqueue != NULL, ("enqueue is not specified"));
2787 	KASSERT(dequeue != NULL, ("dequeue is not specified"));
2788 	KASSERT(request != NULL, ("request is not specified"));
2789 
2790 	ifq->altq_mapsubq = mapsubq;
2791 	for (q = 0; q < ifq->altq_subq_cnt; ++q) {
2792 		struct ifaltq_subque *ifsq = &ifq->altq_subq[q];
2793 
2794 		ifsq->ifsq_enqueue = enqueue;
2795 		ifsq->ifsq_dequeue = dequeue;
2796 		ifsq->ifsq_request = request;
2797 	}
2798 }
2799 
2800 static void
2801 ifsq_norm_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2802 {
2803 	m->m_nextpkt = NULL;
2804 	if (ifsq->ifsq_norm_tail == NULL)
2805 		ifsq->ifsq_norm_head = m;
2806 	else
2807 		ifsq->ifsq_norm_tail->m_nextpkt = m;
2808 	ifsq->ifsq_norm_tail = m;
2809 	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2810 }
2811 
2812 static void
2813 ifsq_prio_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m)
2814 {
2815 	m->m_nextpkt = NULL;
2816 	if (ifsq->ifsq_prio_tail == NULL)
2817 		ifsq->ifsq_prio_head = m;
2818 	else
2819 		ifsq->ifsq_prio_tail->m_nextpkt = m;
2820 	ifsq->ifsq_prio_tail = m;
2821 	ALTQ_SQ_CNTR_INC(ifsq, m->m_pkthdr.len);
2822 	ALTQ_SQ_PRIO_CNTR_INC(ifsq, m->m_pkthdr.len);
2823 }
2824 
2825 static struct mbuf *
2826 ifsq_norm_dequeue(struct ifaltq_subque *ifsq)
2827 {
2828 	struct mbuf *m;
2829 
2830 	m = ifsq->ifsq_norm_head;
2831 	if (m != NULL) {
2832 		if ((ifsq->ifsq_norm_head = m->m_nextpkt) == NULL)
2833 			ifsq->ifsq_norm_tail = NULL;
2834 		m->m_nextpkt = NULL;
2835 		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2836 	}
2837 	return m;
2838 }
2839 
2840 static struct mbuf *
2841 ifsq_prio_dequeue(struct ifaltq_subque *ifsq)
2842 {
2843 	struct mbuf *m;
2844 
2845 	m = ifsq->ifsq_prio_head;
2846 	if (m != NULL) {
2847 		if ((ifsq->ifsq_prio_head = m->m_nextpkt) == NULL)
2848 			ifsq->ifsq_prio_tail = NULL;
2849 		m->m_nextpkt = NULL;
2850 		ALTQ_SQ_CNTR_DEC(ifsq, m->m_pkthdr.len);
2851 		ALTQ_SQ_PRIO_CNTR_DEC(ifsq, m->m_pkthdr.len);
2852 	}
2853 	return m;
2854 }
2855 
2856 int
2857 ifsq_classic_enqueue(struct ifaltq_subque *ifsq, struct mbuf *m,
2858     struct altq_pktattr *pa __unused)
2859 {
2860 	M_ASSERTPKTHDR(m);
2861 	if (ifsq->ifsq_len >= ifsq->ifsq_maxlen ||
2862 	    ifsq->ifsq_bcnt >= ifsq->ifsq_maxbcnt) {
2863 		if ((m->m_flags & M_PRIO) &&
2864 		    ifsq->ifsq_prio_len < (ifsq->ifsq_maxlen / 2) &&
2865 		    ifsq->ifsq_prio_bcnt < (ifsq->ifsq_maxbcnt / 2)) {
2866 			struct mbuf *m_drop;
2867 
2868 			/*
2869 			 * Perform drop-head on normal queue
2870 			 */
2871 			m_drop = ifsq_norm_dequeue(ifsq);
2872 			if (m_drop != NULL) {
2873 				m_freem(m_drop);
2874 				ifsq_prio_enqueue(ifsq, m);
2875 				return 0;
2876 			}
2877 			/* XXX nothing could be dropped? */
2878 		}
2879 		m_freem(m);
2880 		return ENOBUFS;
2881 	} else {
2882 		if (m->m_flags & M_PRIO)
2883 			ifsq_prio_enqueue(ifsq, m);
2884 		else
2885 			ifsq_norm_enqueue(ifsq, m);
2886 		return 0;
2887 	}
2888 }
2889 
2890 struct mbuf *
2891 ifsq_classic_dequeue(struct ifaltq_subque *ifsq, int op)
2892 {
2893 	struct mbuf *m;
2894 
2895 	switch (op) {
2896 	case ALTDQ_POLL:
2897 		m = ifsq->ifsq_prio_head;
2898 		if (m == NULL)
2899 			m = ifsq->ifsq_norm_head;
2900 		break;
2901 
2902 	case ALTDQ_REMOVE:
2903 		m = ifsq_prio_dequeue(ifsq);
2904 		if (m == NULL)
2905 			m = ifsq_norm_dequeue(ifsq);
2906 		break;
2907 
2908 	default:
2909 		panic("unsupported ALTQ dequeue op: %d", op);
2910 	}
2911 	return m;
2912 }
2913 
2914 int
2915 ifsq_classic_request(struct ifaltq_subque *ifsq, int req, void *arg)
2916 {
2917 	switch (req) {
2918 	case ALTRQ_PURGE:
2919 		for (;;) {
2920 			struct mbuf *m;
2921 
2922 			m = ifsq_classic_dequeue(ifsq, ALTDQ_REMOVE);
2923 			if (m == NULL)
2924 				break;
2925 			m_freem(m);
2926 		}
2927 		break;
2928 
2929 	default:
2930 		panic("unsupported ALTQ request: %d", req);
2931 	}
2932 	return 0;
2933 }
2934 
2935 static void
2936 ifsq_ifstart_try(struct ifaltq_subque *ifsq, int force_sched)
2937 {
2938 	struct ifnet *ifp = ifsq_get_ifp(ifsq);
2939 	int running = 0, need_sched;
2940 
2941 	/*
2942 	 * Try to do direct ifnet.if_start on the subqueue first.  If there is
2943 	 * contention on the subqueue hardware serializer, ifnet.if_start on
2944 	 * the subqueue will be scheduled on the subqueue owner CPU.
2945 	 */
2946 	if (!ifsq_tryserialize_hw(ifsq)) {
2947 		/*
2948 		 * Subqueue hardware serializer contention happened,
2949 		 * ifnet.if_start on the subqueue is scheduled on
2950 		 * the subqueue owner CPU, and we keep going.
2951 		 */
2952 		ifsq_ifstart_schedule(ifsq, 1);
2953 		return;
2954 	}
2955 
2956 	if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq)) {
2957 		ifp->if_start(ifp, ifsq);
2958 		if ((ifp->if_flags & IFF_RUNNING) && !ifsq_is_oactive(ifsq))
2959 			running = 1;
2960 	}
2961 	need_sched = ifsq_ifstart_need_schedule(ifsq, running);
2962 
2963 	ifsq_deserialize_hw(ifsq);
2964 
2965 	if (need_sched) {
2966 		/*
2967 		 * More data needs to be transmitted; ifnet.if_start on the
2968 		 * subqueue is scheduled on the subqueue owner CPU, and we
2969 		 * keep going.
2970 		 * NOTE: ifnet.if_start subqueue interlock is not released.
2971 		 */
2972 		ifsq_ifstart_schedule(ifsq, force_sched);
2973 	}
2974 }
2975 
2976 /*
2977  * Subqueue packet staging mechanism:
2978  *
2979  * The packets enqueued into the subqueue are staged up to a certain amount
2980  * before ifnet.if_start on the subqueue is called.  In this way, the
2981  * driver could avoid writing to hardware registers upon every packet;
2982  * instead, hardware registers could be written when a certain number of
2983  * packets are put onto the hardware TX ring.  Measurement on several modern
2984  * NICs (emx(4), igb(4), bnx(4), bge(4), jme(4)) shows that this hardware
2985  * register write aggregation could save ~20% CPU time when 18-byte UDP
2986  * datagrams are transmitted at 1.48Mpps.  The performance improvement from
2987  * hardware register write aggregation is also mentioned in Luigi Rizzo's
2988  * netmap paper (http://info.iet.unipi.it/~luigi/netmap/).
2989  *
2990  * Subqueue packet staging is performed at two entry points into the
2991  * drivers' transmission function:
2992  * - Direct ifnet.if_start calling on the subqueue, i.e. ifsq_ifstart_try()
2993  * - ifnet.if_start scheduling on the subqueue, i.e. ifsq_ifstart_schedule()
2994  *
2995  * Subqueue packet staging will be stopped upon any of the following
2996  * conditions:
2997  * - If the count of packets enqueued on the current CPU is greater than or
2998  *   equal to ifsq_stage_cntmax. (XXX this should be per-interface)
2999  * - If the total length of packets enqueued on the current CPU is greater
3000  *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
3001  *   subtracted from the hardware's MTU mainly because a full TCP segment's
3002  *   size is usually less than the hardware's MTU.
3003  * - ifsq_ifstart_schedule() is not pending on the current CPU and the
3004  *   ifnet.if_start subqueue interlock (ifaltq_subq.ifsq_started) is not
3005  *   released.
3006  * - if_start_rollup(), which is registered as a low-priority netisr
3007  *   rollup function, is called, probably because no more work is pending
3008  *   for the netisr.
3009  *
3010  * NOTE:
3011  * Currently subqueue packet staging is only performed in netisr threads.
3012  */
3013 int
3014 ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
3015 {
3016 	struct ifaltq *ifq = &ifp->if_snd;
3017 	struct ifaltq_subque *ifsq;
3018 	int error, start = 0, len, mcast = 0, avoid_start = 0;
3019 	struct ifsubq_stage_head *head = NULL;
3020 	struct ifsubq_stage *stage = NULL;
3021 	struct globaldata *gd = mycpu;
3022 	struct thread *td = gd->gd_curthread;
3023 
3024 	crit_enter_quick(td);
3025 
3026 	ifsq = ifq_map_subq(ifq, gd->gd_cpuid);
3027 	ASSERT_ALTQ_SQ_NOT_SERIALIZED_HW(ifsq);
3028 
3029 	len = m->m_pkthdr.len;
3030 	if (m->m_flags & M_MCAST)
3031 		mcast = 1;
3032 
3033 	if (td->td_type == TD_TYPE_NETISR) {
3034 		head = &ifsubq_stage_heads[mycpuid];
3035 		stage = ifsq_get_stage(ifsq, mycpuid);
3036 
3037 		stage->stg_cnt++;
3038 		stage->stg_len += len;
3039 		if (stage->stg_cnt < ifsq_stage_cntmax &&
3040 		    stage->stg_len < (ifp->if_mtu - max_protohdr))
3041 			avoid_start = 1;
3042 	}
3043 
3044 	ALTQ_SQ_LOCK(ifsq);
3045 	error = ifsq_enqueue_locked(ifsq, m, pa);
3046 	if (error) {
3047 		if (!ifsq_data_ready(ifsq)) {
3048 			ALTQ_SQ_UNLOCK(ifsq);
3049 			crit_exit_quick(td);
3050 			return error;
3051 		}
3052 		avoid_start = 0;
3053 	}
3054 	if (!ifsq_is_started(ifsq)) {
3055 		if (avoid_start) {
3056 			ALTQ_SQ_UNLOCK(ifsq);
3057 
3058 			KKASSERT(!error);
3059 			if ((stage->stg_flags & IFSQ_STAGE_FLAG_QUED) == 0)
3060 				ifsq_stage_insert(head, stage);
3061 
3062 			IFNET_STAT_INC(ifp, obytes, len);
3063 			if (mcast)
3064 				IFNET_STAT_INC(ifp, omcasts, 1);
3065 			crit_exit_quick(td);
3066 			return error;
3067 		}
3068 
3069 		/*
3070 		 * Hold the subqueue interlock of ifnet.if_start
3071 		 */
3072 		ifsq_set_started(ifsq);
3073 		start = 1;
3074 	}
3075 	ALTQ_SQ_UNLOCK(ifsq);
3076 
3077 	if (!error) {
3078 		IFNET_STAT_INC(ifp, obytes, len);
3079 		if (mcast)
3080 			IFNET_STAT_INC(ifp, omcasts, 1);
3081 	}
3082 
3083 	if (stage != NULL) {
3084 		if (!start && (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)) {
3085 			KKASSERT(stage->stg_flags & IFSQ_STAGE_FLAG_QUED);
3086 			if (!avoid_start) {
3087 				ifsq_stage_remove(head, stage);
3088 				ifsq_ifstart_schedule(ifsq, 1);
3089 			}
3090 			crit_exit_quick(td);
3091 			return error;
3092 		}
3093 
3094 		if (stage->stg_flags & IFSQ_STAGE_FLAG_QUED) {
3095 			ifsq_stage_remove(head, stage);
3096 		} else {
3097 			stage->stg_cnt = 0;
3098 			stage->stg_len = 0;
3099 		}
3100 	}
3101 
3102 	if (!start) {
3103 		crit_exit_quick(td);
3104 		return error;
3105 	}
3106 
3107 	ifsq_ifstart_try(ifsq, 0);
3108 
3109 	crit_exit_quick(td);
3110 	return error;
3111 }
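
/*
 * A minimal sketch (hypothetical output path, not compiled): a protocol
 * hands a packet to the interface through ifq_dispatch(), which enqueues
 * it on the per-CPU mapped subqueue and either starts transmission
 * directly or, in a netisr thread, stages it as described above.
 */
#if 0
static int
example_output(struct ifnet *ifp, struct mbuf *m)
{
	struct altq_pktattr pktattr;

	/* Real callers run ALTQ classification first to fill pktattr. */
	return ifq_dispatch(ifp, m, &pktattr);
}
#endif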
3112 
3113 void *
3114 ifa_create(int size, int flags)
3115 {
3116 	struct ifaddr *ifa;
3117 	int i;
3118 
3119 	KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));
3120 
3121 	ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
3122 	if (ifa == NULL)
3123 		return NULL;
3124 
3125 	ifa->ifa_containers =
3126 	    kmalloc_cachealign(ncpus * sizeof(struct ifaddr_container),
3127 	        M_IFADDR, M_WAITOK | M_ZERO);
3128 	ifa->ifa_ncnt = ncpus;
3129 	for (i = 0; i < ncpus; ++i) {
3130 		struct ifaddr_container *ifac = &ifa->ifa_containers[i];
3131 
3132 		ifac->ifa_magic = IFA_CONTAINER_MAGIC;
3133 		ifac->ifa = ifa;
3134 		ifac->ifa_refcnt = 1;
3135 	}
3136 #ifdef IFADDR_DEBUG
3137 	kprintf("alloc ifa %p %d\n", ifa, size);
3138 #endif
3139 	return ifa;
3140 }
3141 
3142 void
3143 ifac_free(struct ifaddr_container *ifac, int cpu_id)
3144 {
3145 	struct ifaddr *ifa = ifac->ifa;
3146 
3147 	KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
3148 	KKASSERT(ifac->ifa_refcnt == 0);
3149 	KASSERT(ifac->ifa_listmask == 0,
3150 		("ifa is still on %#x lists", ifac->ifa_listmask));
3151 
3152 	ifac->ifa_magic = IFA_CONTAINER_DEAD;
3153 
3154 #ifdef IFADDR_DEBUG_VERBOSE
3155 	kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
3156 #endif
3157 
3158 	KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
3159 		("invalid # of ifac, %d", ifa->ifa_ncnt));
3160 	if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
3161 #ifdef IFADDR_DEBUG
3162 		kprintf("free ifa %p\n", ifa);
3163 #endif
3164 		kfree(ifa->ifa_containers, M_IFADDR);
3165 		kfree(ifa, M_IFADDR);
3166 	}
3167 }
3168 
3169 static void
3170 ifa_iflink_dispatch(netmsg_t nmsg)
3171 {
3172 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3173 	struct ifaddr *ifa = msg->ifa;
3174 	struct ifnet *ifp = msg->ifp;
3175 	int cpu = mycpuid;
3176 	struct ifaddr_container *ifac;
3177 
3178 	crit_enter();
3179 
3180 	ifac = &ifa->ifa_containers[cpu];
3181 	ASSERT_IFAC_VALID(ifac);
3182 	KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
3183 		("ifaddr is on if_addrheads"));
3184 
3185 	ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
3186 	if (msg->tail)
3187 		TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
3188 	else
3189 		TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
3190 
3191 	crit_exit();
3192 
3193 	ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
3194 }
3195 
3196 void
3197 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
3198 {
3199 	struct netmsg_ifaddr msg;
3200 
3201 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3202 		    0, ifa_iflink_dispatch);
3203 	msg.ifa = ifa;
3204 	msg.ifp = ifp;
3205 	msg.tail = tail;
3206 
3207 	ifa_domsg(&msg.base.lmsg, 0);
3208 }
3209 
3210 static void
3211 ifa_ifunlink_dispatch(netmsg_t nmsg)
3212 {
3213 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3214 	struct ifaddr *ifa = msg->ifa;
3215 	struct ifnet *ifp = msg->ifp;
3216 	int cpu = mycpuid;
3217 	struct ifaddr_container *ifac;
3218 
3219 	crit_enter();
3220 
3221 	ifac = &ifa->ifa_containers[cpu];
3222 	ASSERT_IFAC_VALID(ifac);
3223 	KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
3224 		("ifaddr is not on if_addrhead"));
3225 
3226 	TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
3227 	ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
3228 
3229 	crit_exit();
3230 
3231 	ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
3232 }
3233 
3234 void
3235 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
3236 {
3237 	struct netmsg_ifaddr msg;
3238 
3239 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3240 		    0, ifa_ifunlink_dispatch);
3241 	msg.ifa = ifa;
3242 	msg.ifp = ifp;
3243 
3244 	ifa_domsg(&msg.base.lmsg, 0);
3245 }
3246 
3247 static void
3248 ifa_destroy_dispatch(netmsg_t nmsg)
3249 {
3250 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
3251 
3252 	IFAFREE(msg->ifa);
3253 	ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
3254 }
3255 
3256 void
3257 ifa_destroy(struct ifaddr *ifa)
3258 {
3259 	struct netmsg_ifaddr msg;
3260 
3261 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
3262 		    0, ifa_destroy_dispatch);
3263 	msg.ifa = ifa;
3264 
3265 	ifa_domsg(&msg.base.lmsg, 0);
3266 }
3267 
3268 struct lwkt_port *
3269 ifnet_portfn(int cpu)
3270 {
3271 	return &ifnet_threads[cpu].td_msgport;
3272 }
3273 
3274 void
3275 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
3276 {
3277 	KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
3278 
3279 	if (next_cpu < ncpus)
3280 		lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
3281 	else
3282 		lwkt_replymsg(lmsg, 0);
3283 }
3284 
3285 int
3286 ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
3287 {
3288 	KKASSERT(cpu < ncpus);
3289 	return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
3290 }
3291 
3292 void
3293 ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
3294 {
3295 	KKASSERT(cpu < ncpus);
3296 	lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
3297 }
3298 
3299 /*
3300  * Generic netmsg service loop.  Some protocols may roll their own but all
3301  * must do the basic command dispatch function call done here.
3302  */
3303 static void
3304 ifnet_service_loop(void *arg __unused)
3305 {
3306 	netmsg_t msg;
3307 
3308 	while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
3309 		KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
3310 		msg->base.nm_dispatch(msg);
3311 	}
3312 }
3313 
3314 static void
3315 if_start_rollup(void)
3316 {
3317 	struct ifsubq_stage_head *head = &ifsubq_stage_heads[mycpuid];
3318 	struct ifsubq_stage *stage;
3319 
3320 	crit_enter();
3321 
3322 	while ((stage = TAILQ_FIRST(&head->stg_head)) != NULL) {
3323 		struct ifaltq_subque *ifsq = stage->stg_subq;
3324 		int is_sched = 0;
3325 
3326 		if (stage->stg_flags & IFSQ_STAGE_FLAG_SCHED)
3327 			is_sched = 1;
3328 		ifsq_stage_remove(head, stage);
3329 
3330 		if (is_sched) {
3331 			ifsq_ifstart_schedule(ifsq, 1);
3332 		} else {
3333 			int start = 0;
3334 
3335 			ALTQ_SQ_LOCK(ifsq);
3336 			if (!ifsq_is_started(ifsq)) {
3337 				/*
3338 				 * Hold the subqueue interlock of
3339 				 * ifnet.if_start
3340 				 */
3341 				ifsq_set_started(ifsq);
3342 				start = 1;
3343 			}
3344 			ALTQ_SQ_UNLOCK(ifsq);
3345 
3346 			if (start)
3347 				ifsq_ifstart_try(ifsq, 1);
3348 		}
3349 		KKASSERT((stage->stg_flags &
3350 		    (IFSQ_STAGE_FLAG_QUED | IFSQ_STAGE_FLAG_SCHED)) == 0);
3351 	}
3352 
3353 	crit_exit();
3354 }
3355 
3356 static void
3357 ifnetinit(void *dummy __unused)
3358 {
3359 	int i;
3360 
3361 	for (i = 0; i < ncpus; ++i) {
3362 		struct thread *thr = &ifnet_threads[i];
3363 
3364 		lwkt_create(ifnet_service_loop, NULL, NULL,
3365 			    thr, TDF_NOSTART|TDF_FORCE_SPINPORT|TDF_FIXEDCPU,
3366 			    i, "ifnet %d", i);
3367 		netmsg_service_port_init(&thr->td_msgport);
3368 		lwkt_schedule(thr);
3369 	}
3370 
3371 	for (i = 0; i < ncpus; ++i)
3372 		TAILQ_INIT(&ifsubq_stage_heads[i].stg_head);
3373 	netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
3374 }
3375 
3376 void
3377 if_register_com_alloc(u_char type,
3378     if_com_alloc_t *a, if_com_free_t *f)
3379 {
3380 
3381 	KASSERT(if_com_alloc[type] == NULL,
3382 	    ("if_register_com_alloc: %d already registered", type));
3383 	KASSERT(if_com_free[type] == NULL,
3384 	    ("if_register_com_alloc: %d free already registered", type));
3385 
3386 	if_com_alloc[type] = a;
3387 	if_com_free[type] = f;
3388 }
3389 
3390 void
3391 if_deregister_com_alloc(u_char type)
3392 {
3393 
3394 	KASSERT(if_com_alloc[type] != NULL,
3395 	    ("if_deregister_com_alloc: %d not registered", type));
3396 	KASSERT(if_com_free[type] != NULL,
3397 	    ("if_deregister_com_alloc: %d free not registered", type));
3398 	if_com_alloc[type] = NULL;
3399 	if_com_free[type] = NULL;
3400 }
3401 
3402 int
3403 if_ring_count2(int cnt, int cnt_max)
3404 {
3405 	int shift = 0;
3406 
3407 	KASSERT(cnt_max >= 1 && powerof2(cnt_max),
3408 	    ("invalid ring count max %d", cnt_max));
3409 
3410 	if (cnt <= 0)
3411 		cnt = cnt_max;
3412 	if (cnt > ncpus2)
3413 		cnt = ncpus2;
3414 	if (cnt > cnt_max)
3415 		cnt = cnt_max;
3416 
3417 	while ((1 << (shift + 1)) <= cnt)
3418 		++shift;
3419 	cnt = 1 << shift;
3420 
3421 	KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
3422 	    ("calculate cnt %d, ncpus2 %d, cnt max %d",
3423 	     cnt, ncpus2, cnt_max));
3424 	return cnt;
3425 }
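
/*
 * Worked example, assuming ncpus2 == 8 and cnt_max == 16:
 * if_ring_count2(0, 16) defaults to cnt_max, clamps to ncpus2, and yields 8;
 * if_ring_count2(6, 16) rounds down to the nearest power of 2, yielding 4;
 * if_ring_count2(32, 16) is clamped by ncpus2 before rounding, yielding 8.
 */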
3426 
3427 void
3428 ifq_set_maxlen(struct ifaltq *ifq, int len)
3429 {
3430 	ifq->altq_maxlen = len + (ncpus * ifsq_stage_cntmax);
3431 }
3432 
3433 int
3434 ifq_mapsubq_default(struct ifaltq *ifq __unused, int cpuid __unused)
3435 {
3436 	return ALTQ_SUBQ_INDEX_DEFAULT;
3437 }
3438 
3439 int
3440 ifq_mapsubq_mask(struct ifaltq *ifq, int cpuid)
3441 {
3442 	return (cpuid & ifq->altq_subq_mask);
3443 }
3444 
3445 static void
3446 ifsq_watchdog(void *arg)
3447 {
3448 	struct ifsubq_watchdog *wd = arg;
3449 	struct ifnet *ifp;
3450 
3451 	if (__predict_true(wd->wd_timer == 0 || --wd->wd_timer))
3452 		goto done;
3453 
3454 	ifp = ifsq_get_ifp(wd->wd_subq);
3455 	if (ifnet_tryserialize_all(ifp)) {
3456 		wd->wd_watchdog(wd->wd_subq);
3457 		ifnet_deserialize_all(ifp);
3458 	} else {
3459 		/* try again next timeout */
3460 		wd->wd_timer = 1;
3461 	}
3462 done:
3463 	ifsq_watchdog_reset(wd);
3464 }
3465 
3466 static void
3467 ifsq_watchdog_reset(struct ifsubq_watchdog *wd)
3468 {
3469 	callout_reset_bycpu(&wd->wd_callout, hz, ifsq_watchdog, wd,
3470 	    ifsq_get_cpuid(wd->wd_subq));
3471 }
3472 
3473 void
3474 ifsq_watchdog_init(struct ifsubq_watchdog *wd, struct ifaltq_subque *ifsq,
3475     ifsq_watchdog_t watchdog)
3476 {
3477 	callout_init_mp(&wd->wd_callout);
3478 	wd->wd_timer = 0;
3479 	wd->wd_subq = ifsq;
3480 	wd->wd_watchdog = watchdog;
3481 }
3482 
3483 void
3484 ifsq_watchdog_start(struct ifsubq_watchdog *wd)
3485 {
3486 	wd->wd_timer = 0;
3487 	ifsq_watchdog_reset(wd);
3488 }
3489 
3490 void
3491 ifsq_watchdog_stop(struct ifsubq_watchdog *wd)
3492 {
3493 	wd->wd_timer = 0;
3494 	callout_stop(&wd->wd_callout);
3495 }
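
/*
 * A minimal sketch (hypothetical driver, not compiled): tie a TX watchdog
 * to the default subqueue (ifq_get_subq_default() is assumed here).  The
 * driver arms wd_timer when it puts work on the ring; when it expires, the
 * callback runs with the interface serialized and may reset the hardware.
 */
#if 0
static struct ifsubq_watchdog example_wd;

static void
example_watchdog(struct ifaltq_subque *ifsq)
{
	if_printf(ifsq_get_ifp(ifsq), "transmit timeout, resetting\n");
	/* ... reset hardware and restart transmission ... */
}

static void
example_wd_attach(struct ifnet *ifp)
{
	ifsq_watchdog_init(&example_wd,
	    ifq_get_subq_default(&ifp->if_snd), example_watchdog);
	ifsq_watchdog_start(&example_wd);
}
#endif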
3496 
3497 void
3498 ifnet_lock(void)
3499 {
3500 	KASSERT(curthread->td_type != TD_TYPE_NETISR,
3501 	    ("try holding ifnet lock in netisr"));
3502 	mtx_lock(&ifnet_mtx);
3503 }
3504 
3505 void
3506 ifnet_unlock(void)
3507 {
3508 	KASSERT(curthread->td_type != TD_TYPE_NETISR,
3509 	    ("try holding ifnet lock in netisr"));
3510 	mtx_unlock(&ifnet_mtx);
3511 }
3512 
3513 static struct ifnet_array *
3514 ifnet_array_alloc(int count)
3515 {
3516 	struct ifnet_array *arr;
3517 
3518 	arr = kmalloc(__offsetof(struct ifnet_array, ifnet_arr[count]),
3519 	    M_IFNET, M_WAITOK);
3520 	arr->ifnet_count = count;
3521 
3522 	return arr;
3523 }
3524 
3525 static void
3526 ifnet_array_free(struct ifnet_array *arr)
3527 {
3528 	if (arr == &ifnet_array0)
3529 		return;
3530 	kfree(arr, M_IFNET);
3531 }
3532 
3533 static struct ifnet_array *
3534 ifnet_array_add(struct ifnet *ifp, const struct ifnet_array *old_arr)
3535 {
3536 	struct ifnet_array *arr;
3537 	int count, i;
3538 
3539 	KASSERT(old_arr->ifnet_count >= 0,
3540 	    ("invalid ifnet array count %d", old_arr->ifnet_count));
3541 	count = old_arr->ifnet_count + 1;
3542 	arr = ifnet_array_alloc(count);
3543 
3544 	/*
3545 	 * Save the old ifnet array and append this ifp to the end of
3546 	 * the new ifnet array.
3547 	 */
3548 	for (i = 0; i < old_arr->ifnet_count; ++i) {
3549 		KASSERT(old_arr->ifnet_arr[i] != ifp,
3550 		    ("%s is already in ifnet array", ifp->if_xname));
3551 		arr->ifnet_arr[i] = old_arr->ifnet_arr[i];
3552 	}
3553 	KASSERT(i == count - 1,
3554 	    ("add %s, ifnet array index mismatch, should be %d, but got %d",
3555 	     ifp->if_xname, count - 1, i));
3556 	arr->ifnet_arr[i] = ifp;
3557 
3558 	return arr;
3559 }
3560 
3561 static struct ifnet_array *
3562 ifnet_array_del(struct ifnet *ifp, const struct ifnet_array *old_arr)
3563 {
3564 	struct ifnet_array *arr;
3565 	int count, i, idx, found = 0;
3566 
3567 	KASSERT(old_arr->ifnet_count > 0,
3568 	    ("invalid ifnet array count %d", old_arr->ifnet_count));
3569 	count = old_arr->ifnet_count - 1;
3570 	arr = ifnet_array_alloc(count);
3571 
3572 	/*
3573 	 * Save the old ifnet array, but skip this ifp.
3574 	 */
3575 	idx = 0;
3576 	for (i = 0; i < old_arr->ifnet_count; ++i) {
3577 		if (old_arr->ifnet_arr[i] == ifp) {
3578 			KASSERT(!found,
3579 			    ("dup %s is in ifnet array", ifp->if_xname));
3580 			found = 1;
3581 			continue;
3582 		}
3583 		KASSERT(idx < count,
3584 		    ("invalid ifnet array index %d, count %d", idx, count));
3585 		arr->ifnet_arr[idx] = old_arr->ifnet_arr[i];
3586 		++idx;
3587 	}
3588 	KASSERT(found, ("%s is not in ifnet array", ifp->if_xname));
3589 	KASSERT(idx == count,
3590 	    ("del %s, ifnet array count mismatch, should be %d, but got %d ",
3591 	     ifp->if_xname, count, idx));
3592 
3593 	return arr;
3594 }
3595 
3596 const struct ifnet_array *
3597 ifnet_array_get(void)
3598 {
3599 	KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
3600 	return ifnet_array;
3601 }
3602 
3603 int
3604 ifnet_array_isempty(void)
3605 {
3606 	KASSERT(curthread->td_type == TD_TYPE_NETISR, ("not in netisr"));
3607 	if (ifnet_array->ifnet_count == 0)
3608 		return 1;
3609 	else
3610 		return 0;
3611 }
3612 
3613 void
3614 ifa_marker_init(struct ifaddr_marker *mark, struct ifnet *ifp)
3615 {
3616 	struct ifaddr *ifa;
3617 
3618 	memset(mark, 0, sizeof(*mark));
3619 	ifa = &mark->ifa;
3620 
3621 	mark->ifac.ifa = ifa;
3622 
3623 	ifa->ifa_addr = &mark->addr;
3624 	ifa->ifa_dstaddr = &mark->dstaddr;
3625 	ifa->ifa_netmask = &mark->netmask;
3626 	ifa->ifa_ifp = ifp;
3627 }
3628