xref: /dflybsd-src/sys/net/if.c (revision 211d4362597aee676ecea315377d5cb13da26bb5)
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)if.c	8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
35  */
36 
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_inet.h"
40 #include "opt_ifpoll.h"
41 
42 #include <sys/param.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/priv.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/socketops.h>
52 #include <sys/protosw.h>
53 #include <sys/kernel.h>
54 #include <sys/ktr.h>
55 #include <sys/mutex.h>
56 #include <sys/sockio.h>
57 #include <sys/syslog.h>
58 #include <sys/sysctl.h>
59 #include <sys/domain.h>
60 #include <sys/thread.h>
61 #include <sys/serialize.h>
62 #include <sys/bus.h>
63 
64 #include <sys/thread2.h>
65 #include <sys/msgport2.h>
66 #include <sys/mutex2.h>
67 
68 #include <net/if.h>
69 #include <net/if_arp.h>
70 #include <net/if_dl.h>
71 #include <net/if_types.h>
72 #include <net/if_var.h>
73 #include <net/ifq_var.h>
74 #include <net/radix.h>
75 #include <net/route.h>
76 #include <net/if_clone.h>
77 #include <net/netisr.h>
78 #include <net/netmsg2.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/stdarg.h>
82 #include <machine/smp.h>
83 
84 #if defined(INET) || defined(INET6)
85 /*XXX*/
86 #include <netinet/in.h>
87 #include <netinet/in_var.h>
88 #include <netinet/if_ether.h>
89 #ifdef INET6
90 #include <netinet6/in6_var.h>
91 #include <netinet6/in6_ifattach.h>
92 #endif
93 #endif
94 
95 #if defined(COMPAT_43)
96 #include <emulation/43bsd/43bsd_socket.h>
97 #endif /* COMPAT_43 */
98 
/*
 * Netisr message used to propagate an ifaddr operation to the
 * per-CPU ifaddr lists.
 */
struct netmsg_ifaddr {
	struct netmsg_base base;
	struct ifaddr	*ifa;	/* address being operated on */
	struct ifnet	*ifp;	/* owning interface */
	int		tail;	/* presumably insert-at-tail flag; confirm at use site */
};

/* Per-CPU head of the list of ifaltqs with staged (delayed) if_start */
struct ifaltq_stage_head {
	TAILQ_HEAD(, ifaltq_stage)	ifqs_head;
} __cachealign;
109 
/*
 * System initialization
 */
static void	if_attachdomain(void *);
static void	if_attachdomain1(struct ifnet *);
static int	ifconf(u_long, caddr_t, struct ucred *);
static void	ifinit(void *);
static void	ifnetinit(void *);
static void	if_slowtimo(void *);
static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
static int	if_rtdel(struct radix_node *, void *);

#ifdef INET6
/*
 * XXX: declare here to avoid to include many inet6 related files..
 * should be more generalized?
 */
extern void	nd6_setmtu(struct ifnet *);
#endif

/* sysctl roots: net.link and net.link.generic */
SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");

/* Max packets staged per CPU before if_start is actually scheduled */
static int ifq_stage_cntmax = 4;
TUNABLE_INT("net.link.stage_cntmax", &ifq_stage_cntmax);
SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
    &ifq_stage_cntmax, 0, "ifq staging packet count max");

SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
/* Must be after netisr_init */
SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)

/* Per-if_type common-structure alloc/free hooks, indexed by if_type */
static  if_com_alloc_t *if_com_alloc[256];
static  if_com_free_t *if_com_free[256];

MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");

int			ifqmaxlen = IFQ_MAXLEN;
struct ifnethead	ifnet = TAILQ_HEAD_INITIALIZER(ifnet);

/* Drives the periodic if_slowtimo() scan */
struct callout		if_slowtimo_timer;

/* Highest interface index in use; ifindex2ifnet maps index -> ifnet */
int			if_index = 0;
struct ifnet		**ifindex2ifnet = NULL;
static struct thread	ifnet_threads[MAXCPU];

/* Per-CPU staging lists (see struct ifaltq_stage_head above) */
static struct ifaltq_stage_head	ifq_stage_heads[MAXCPU];

/* KTR tracepoints for ifq enqueue/dequeue */
#define IFQ_KTR_STRING		"ifq=%p"
#define IFQ_KTR_ARGS	struct ifaltq *ifq
#ifndef KTR_IFQ
#define KTR_IFQ			KTR_ALL
#endif
KTR_INFO_MASTER(ifq);
KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
#define logifq(name, arg)	KTR_LOG(ifq_ ## name, arg)

/* KTR tracepoints for if_start scheduling decisions */
#define IF_START_KTR_STRING	"ifp=%p"
#define IF_START_KTR_ARGS	struct ifnet *ifp
#ifndef KTR_IF_START
#define KTR_IF_START		KTR_ALL
#endif
KTR_INFO_MASTER(if_start);
KTR_INFO(KTR_IF_START, if_start, run, 0,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, sched, 1,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, avoid, 2,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
#define logifstart(name, arg)	KTR_LOG(if_start_ ## name, arg)

/* All interface groups in the system */
TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
189 
190 /*
191  * Network interface utility routines.
192  *
193  * Routines with ifa_ifwith* names take sockaddr *'s as
194  * parameters.
195  */
196 /* ARGSUSED*/
197 void
198 ifinit(void *dummy)
199 {
200 	struct ifnet *ifp;
201 
202 	callout_init(&if_slowtimo_timer);
203 
204 	crit_enter();
205 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
206 		if (ifp->if_snd.ifq_maxlen == 0) {
207 			if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
208 			ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
209 		}
210 	}
211 	crit_exit();
212 
213 	if_slowtimo(0);
214 }
215 
216 static int
217 if_start_cpuid(struct ifnet *ifp)
218 {
219 	return ifp->if_cpuid;
220 }
221 
#ifdef IFPOLL_ENABLE
/*
 * ifnet-to-CPU mapping used when the driver supports polling(4):
 * prefer the polling CPU if one is assigned (>= 0), otherwise fall
 * back to the interface's configured CPU id.
 */
static int
if_start_cpuid_npoll(struct ifnet *ifp)
{
	int cpu = ifp->if_npoll_cpuid;

	return (cpu >= 0 ? cpu : ifp->if_cpuid);
}
#endif
234 
/*
 * Send this CPU's pre-allocated if_start netmsg to the local netisr
 * port.  If the message is still in flight (MSGF_DONE clear) an
 * if_start dispatch is already pending, so nothing needs to be done.
 * Runs either directly or as an IPI callback on the target CPU.
 */
static void
ifq_ifstart_ipifunc(void *arg)
{
	struct ifnet *ifp = arg;
	struct lwkt_msg *lmsg = &ifp->if_start_nmsg[mycpuid].lmsg;

	crit_enter();
	if (lmsg->ms_flags & MSGF_DONE)
		lwkt_sendmsg(netisr_portfn(mycpuid), lmsg);
	crit_exit();
}
246 
247 static __inline void
248 ifq_stage_remove(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
249 {
250 	KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
251 	TAILQ_REMOVE(&head->ifqs_head, stage, ifqs_link);
252 	stage->ifqs_flags &= ~(IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED);
253 	stage->ifqs_cnt = 0;
254 	stage->ifqs_len = 0;
255 }
256 
257 static __inline void
258 ifq_stage_insert(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
259 {
260 	KKASSERT((stage->ifqs_flags &
261 	    (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
262 	stage->ifqs_flags |= IFQ_STAGE_FLAG_QUED;
263 	TAILQ_INSERT_TAIL(&head->ifqs_head, stage, ifqs_link);
264 }
265 
/*
 * Schedule ifnet.if_start on ifnet's CPU
 *
 * If called from a netisr thread and staging is enabled (and not
 * forced), the schedule request is only recorded in this CPU's
 * staging entry; the actual dispatch happens later.  Otherwise the
 * dispatch netmsg is sent immediately, via IPI when the target CPU
 * is not the current one.
 */
static void
ifq_ifstart_schedule(struct ifaltq *ifq, int force)
{
	struct ifnet *ifp = ifq->altq_ifp;
	int cpu;

	if (!force && curthread->td_type == TD_TYPE_NETISR &&
	    ifq_stage_cntmax > 0) {
		struct ifaltq_stage *stage = ifq_get_stage(ifq, mycpuid);

		/* Counters restart; a schedule supersedes staged counts */
		stage->ifqs_cnt = 0;
		stage->ifqs_len = 0;
		if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
			ifq_stage_insert(&ifq_stage_heads[mycpuid], stage);
		stage->ifqs_flags |= IFQ_STAGE_FLAG_SCHED;
		return;
	}

	cpu = ifp->if_start_cpuid(ifp);
	if (cpu != mycpuid)
		lwkt_send_ipiq(globaldata_find(cpu), ifq_ifstart_ipifunc, ifp);
	else
		ifq_ifstart_ipifunc(ifp);
}
293 
/*
 * Decide whether ifnet.if_start must be (re)scheduled after a run.
 * Returns non-zero if it must; returns 0 after clearing the started
 * flag when no further work is possible.
 *
 * NOTE:
 * This function will release ifnet.if_start interlock,
 * if ifnet.if_start does not need to be scheduled
 */
static __inline int
ifq_ifstart_need_schedule(struct ifaltq *ifq, int running)
{
	if (!running || ifq_is_empty(ifq)
#ifdef ALTQ
	    || ifq->altq_tbr != NULL
#endif
	) {
		ALTQ_LOCK(ifq);
		/*
		 * ifnet.if_start interlock is released, if:
		 * 1) Hardware can not take any packets, due to
		 *    o  interface is marked down
		 *    o  hardware queue is full (ifq_is_oactive)
		 *    Under the second situation, hardware interrupt
		 *    or polling(4) will call/schedule ifnet.if_start
		 *    when hardware queue is ready
		 * 2) There is not packet in the ifnet.if_snd.
		 *    Further ifq_dispatch or ifq_handoff will call/
		 *    schedule ifnet.if_start
		 * 3) TBR is used and it does not allow further
		 *    dequeueing.
		 *    TBR callout will call ifnet.if_start
		 */
		if (!running || !ifq_data_ready(ifq)) {
			ifq_clr_started(ifq);
			ALTQ_UNLOCK(ifq);
			return 0;
		}
		ALTQ_UNLOCK(ifq);
	}
	return 1;
}
332 
/*
 * Netmsg handler that actually runs ifnet.if_start on the ifnet's
 * CPU.  Replies to the message immediately so the per-CPU netmsg can
 * be reused, chases any CPU change, then runs if_start under the TX
 * serializer and reschedules itself while data remains.
 */
static void
if_start_dispatch(netmsg_t msg)
{
	struct lwkt_msg *lmsg = &msg->base.lmsg;
	struct ifnet *ifp = lmsg->u.ms_resultp;
	struct ifaltq *ifq = &ifp->if_snd;
	int running = 0, need_sched;

	crit_enter();
	lwkt_replymsg(lmsg, 0);	/* reply ASAP */
	crit_exit();

	if (mycpuid != ifp->if_start_cpuid(ifp)) {
		/*
		 * We need to chase the ifnet CPU change.
		 */
		logifstart(chase_sched, ifp);
		ifq_ifstart_schedule(ifq, 1);
		return;
	}

	ifnet_serialize_tx(ifp);
	if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
		logifstart(run, ifp);
		ifp->if_start(ifp);
		/* Still able to transmit after the run? */
		if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
			running = 1;
	}
	need_sched = ifq_ifstart_need_schedule(ifq, running);
	ifnet_deserialize_tx(ifp);

	if (need_sched) {
		/*
		 * More data need to be transmitted, ifnet.if_start is
		 * scheduled on ifnet's CPU, and we keep going.
		 * NOTE: ifnet.if_start interlock is not released.
		 */
		logifstart(sched, ifp);
		ifq_ifstart_schedule(ifq, 0);
	}
}
374 
/*
 * Device driver ifnet.if_start helper function
 *
 * Runs if_start directly on the calling (TX-serialized) context if it
 * is not already started and data is ready; otherwise avoids the run.
 * Reschedules on the ifnet's CPU when more data remains afterwards.
 */
void
if_devstart(struct ifnet *ifp)
{
	struct ifaltq *ifq = &ifp->if_snd;
	int running = 0;

	ASSERT_IFNET_SERIALIZED_TX(ifp);

	/* Take the if_start interlock, or bail if it's held / no data */
	ALTQ_LOCK(ifq);
	if (ifq_is_started(ifq) || !ifq_data_ready(ifq)) {
		logifstart(avoid, ifp);
		ALTQ_UNLOCK(ifq);
		return;
	}
	ifq_set_started(ifq);
	ALTQ_UNLOCK(ifq);

	logifstart(run, ifp);
	ifp->if_start(ifp);

	if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
		running = 1;

	if (ifq_ifstart_need_schedule(ifq, running)) {
		/*
		 * More data need to be transmitted, ifnet.if_start is
		 * scheduled on ifnet's CPU, and we keep going.
		 * NOTE: ifnet.if_start interlock is not released.
		 */
		logifstart(sched, ifp);
		ifq_ifstart_schedule(ifq, 0);
	}
}
409 
410 /* Device driver ifnet.if_start schedule helper function */
411 void
412 if_devstart_sched(struct ifnet *ifp)
413 {
414 	ifq_ifstart_schedule(&ifp->if_snd, 1);
415 }
416 
417 static void
418 if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
419 {
420 	lwkt_serialize_enter(ifp->if_serializer);
421 }
422 
423 static void
424 if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
425 {
426 	lwkt_serialize_exit(ifp->if_serializer);
427 }
428 
429 static int
430 if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
431 {
432 	return lwkt_serialize_try(ifp->if_serializer);
433 }
434 
#ifdef INVARIANTS
/*
 * Default ifnet serialize-assert method: verify that the single
 * per-interface serializer is (or is not) currently held.
 */
static void
if_default_serialize_assert(struct ifnet *ifp,
			    enum ifnet_serialize slz __unused,
			    boolean_t serialized)
{
	if (!serialized)
		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
	else
		ASSERT_SERIALIZED(ifp->if_serializer);
}
#endif
447 
/*
 * Attach an interface to the list of "active" interfaces.
 *
 * The serializer is optional.  If non-NULL access to the interface
 * may be MPSAFE.
 *
 * Sets up the serialize methods, per-CPU if_start netmsgs and ifaddr
 * lists, assigns an interface index, creates the AF_LINK address, and
 * initializes the ALTQ send queue before announcing the interface.
 */
void
if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
{
	unsigned socksize, ifasize;
	int namelen, masklen;
	struct sockaddr_dl *sdl;
	struct ifaddr *ifa;
	struct ifaltq *ifq;
	int i;

	static int if_indexlim = 8;

	/*
	 * Either the driver supplies a complete set of serialize
	 * methods, or it gets the defaults which operate on a single
	 * serializer (supplied or embedded) -- never a mix.
	 */
	if (ifp->if_serialize != NULL) {
		KASSERT(ifp->if_deserialize != NULL &&
			ifp->if_tryserialize != NULL &&
			ifp->if_serialize_assert != NULL,
			("serialize functions are partially setup"));

		/*
		 * If the device supplies serialize functions,
		 * then clear if_serializer to catch any invalid
		 * usage of this field.
		 */
		KASSERT(serializer == NULL,
			("both serialize functions and default serializer "
			 "are supplied"));
		ifp->if_serializer = NULL;
	} else {
		KASSERT(ifp->if_deserialize == NULL &&
			ifp->if_tryserialize == NULL &&
			ifp->if_serialize_assert == NULL,
			("serialize functions are partially setup"));
		ifp->if_serialize = if_default_serialize;
		ifp->if_deserialize = if_default_deserialize;
		ifp->if_tryserialize = if_default_tryserialize;
#ifdef INVARIANTS
		ifp->if_serialize_assert = if_default_serialize_assert;
#endif

		/*
		 * The serializer can be passed in from the device,
		 * allowing the same serializer to be used for both
		 * the interrupt interlock and the device queue.
		 * If not specified, the netif structure will use an
		 * embedded serializer.
		 */
		if (serializer == NULL) {
			serializer = &ifp->if_default_serializer;
			lwkt_serialize_init(serializer);
		}
		ifp->if_serializer = serializer;
	}

	/* if_start initially runs on CPU 0 */
	ifp->if_start_cpuid = if_start_cpuid;
	ifp->if_cpuid = 0;

#ifdef IFPOLL_ENABLE
	/* Device is not in polling mode by default */
	ifp->if_npoll_cpuid = -1;
	if (ifp->if_npoll != NULL)
		ifp->if_start_cpuid = if_start_cpuid_npoll;
#endif

	/* One pre-initialized if_start dispatch netmsg per CPU */
	ifp->if_start_nmsg = kmalloc(ncpus * sizeof(*ifp->if_start_nmsg),
				     M_LWKTMSG, M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		netmsg_init(&ifp->if_start_nmsg[i], NULL, &netisr_adone_rport,
			    0, if_start_dispatch);
		ifp->if_start_nmsg[i].lmsg.u.ms_resultp = ifp;
	}

	/* Hold the ioctl mutex for the remainder of the attach */
	mtx_init(&ifp->if_ioctl_mtx);
	mtx_lock(&ifp->if_ioctl_mtx);

	TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
	ifp->if_index = ++if_index;

	/*
	 * XXX -
	 * The old code would work if the interface passed a pre-existing
	 * chain of ifaddrs to this code.  We don't trust our callers to
	 * properly initialize the tailq, however, so we no longer allow
	 * this unlikely case.
	 */
	ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
				    M_IFADDR, M_WAITOK | M_ZERO);
	for (i = 0; i < ncpus; ++i)
		TAILQ_INIT(&ifp->if_addrheads[i]);

	TAILQ_INIT(&ifp->if_prefixhead);
	TAILQ_INIT(&ifp->if_multiaddrs);
	TAILQ_INIT(&ifp->if_groups);
	getmicrotime(&ifp->if_lastchange);

	/* Double ifindex2ifnet[] whenever if_index outgrows it */
	if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
		unsigned int n;
		struct ifnet **q;

		if_indexlim <<= 1;

		/* grow ifindex2ifnet */
		n = if_indexlim * sizeof(*q);
		q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
		if (ifindex2ifnet) {
			/* n/2 is the old array size in bytes */
			bcopy(ifindex2ifnet, q, n/2);
			kfree(ifindex2ifnet, M_IFADDR);
		}
		ifindex2ifnet = q;
	}

	ifindex2ifnet[if_index] = ifp;

	/*
	 * create a Link Level name for this device
	 */
	namelen = strlen(ifp->if_xname);
	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1)))
	if (socksize < sizeof(*sdl))
		socksize = sizeof(*sdl);
	socksize = ROUNDUP(socksize);
#undef ROUNDUP
	/* One ifaddr followed by the AF_LINK address and its netmask */
	ifasize = sizeof(struct ifaddr) + 2 * socksize;
	ifa = ifa_create(ifasize, M_WAITOK);
	sdl = (struct sockaddr_dl *)(ifa + 1);
	sdl->sdl_len = socksize;
	sdl->sdl_family = AF_LINK;
	bcopy(ifp->if_xname, sdl->sdl_data, namelen);
	sdl->sdl_nlen = namelen;
	sdl->sdl_index = ifp->if_index;
	sdl->sdl_type = ifp->if_type;
	ifp->if_lladdr = ifa;
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)sdl;
	/* The netmask sockaddr_dl immediately follows the address */
	sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
	ifa->ifa_netmask = (struct sockaddr *)sdl;
	sdl->sdl_len = masklen;
	/* Netmask covers exactly the interface-name portion */
	while (namelen != 0)
		sdl->sdl_data[--namelen] = 0xff;
	ifa_iflink(ifa, ifp, 0 /* Insert head */);

	EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
	devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);

	/* Initialize the send queue with the classic (default) discipline */
	ifq = &ifp->if_snd;
	ifq->altq_type = 0;
	ifq->altq_disc = NULL;
	ifq->altq_flags &= ALTQF_CANTCHANGE;
	ifq->altq_tbr = NULL;
	ifq->altq_ifp = ifp;
	ifq->altq_started = 0;
	ifq->altq_prepended = NULL;
	ALTQ_LOCK_INIT(ifq);
	ifq_set_classic(ifq);

	/* Per-CPU packet staging state for this send queue */
	ifq->altq_stage =
	    kmalloc_cachealign(ncpus * sizeof(struct ifaltq_stage),
	    M_DEVBUF, M_WAITOK | M_ZERO);
	for (i = 0; i < ncpus; ++i)
		ifq->altq_stage[i].ifqs_altq = ifq;

	if (!SLIST_EMPTY(&domains))
		if_attachdomain1(ifp);

	/* Announce the interface. */
	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);

	mtx_unlock(&ifp->if_ioctl_mtx);
}
624 
/*
 * Run per-domain attach hooks on every existing interface; fired at
 * SI_SUB_PROTO_IFATTACHDOMAIN for interfaces attached before the
 * protocol domains were registered.
 */
static void
if_attachdomain(void *dummy)
{
	struct ifnet *ifp;

	crit_enter();
	TAILQ_FOREACH(ifp, &ifnet, if_list)
		if_attachdomain1(ifp);
	crit_exit();
}
SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
	if_attachdomain, NULL);
637 
/*
 * Initialize one interface's per-address-family data by invoking
 * each registered domain's dom_ifattach hook.
 */
static void
if_attachdomain1(struct ifnet *ifp)
{
	struct domain *dp;

	crit_enter();

	/* address family dependent data region */
	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
	SLIST_FOREACH(dp, &domains, dom_next)
		if (dp->dom_ifattach)
			ifp->if_afdata[dp->dom_family] =
				(*dp->dom_ifattach)(ifp);
	crit_exit();
}
653 
/*
 * Purge all addresses whose type is _not_ AF_LINK
 *
 * INET addresses are removed via the SIOCDIFADDR control path and
 * INET6 addresses via in6_purgeaddr(); anything else is unlinked and
 * destroyed directly.
 */
void
if_purgeaddrs_nolink(struct ifnet *ifp)
{
	struct ifaddr_container *ifac, *next;

	/* MUTABLE: entries may be removed while iterating */
	TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
			      ifa_link, next) {
		struct ifaddr *ifa = ifac->ifa;

		/* Leave link ifaddr as it is */
		if (ifa->ifa_addr->sa_family == AF_LINK)
			continue;
#ifdef INET
		/* XXX: Ugly!! ad hoc just for INET */
		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
			struct ifaliasreq ifr;
#ifdef IFADDR_DEBUG_VERBOSE
			int i;

			kprintf("purge in4 addr %p: ", ifa);
			for (i = 0; i < ncpus; ++i)
				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
			kprintf("\n");
#endif

			bzero(&ifr, sizeof ifr);
			ifr.ifra_addr = *ifa->ifa_addr;
			if (ifa->ifa_dstaddr)
				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
			/* On success in_control removed it; skip manual path */
			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
				       NULL) == 0)
				continue;
		}
#endif /* INET */
#ifdef INET6
		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
#ifdef IFADDR_DEBUG_VERBOSE
			int i;

			kprintf("purge in6 addr %p: ", ifa);
			for (i = 0; i < ncpus; ++i)
				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
			kprintf("\n");
#endif

			in6_purgeaddr(ifa);
			/* ifp_addrhead is already updated */
			continue;
		}
#endif /* INET6 */
		/* Fallback: unlink and destroy the address directly */
		ifa_ifunlink(ifa, ifp);
		ifa_destroy(ifa);
	}
}
711 
/*
 * Netmsg handler run on each CPU: remove the ifq's staging entry
 * from this CPU's staging list, if it is queued there.
 */
static void
ifq_stage_detach_handler(netmsg_t nmsg)
{
	struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
	struct ifaltq_stage *stage = ifq_get_stage(ifq, mycpuid);

	if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED)
		ifq_stage_remove(&ifq_stage_heads[mycpuid], stage);
	lwkt_replymsg(&nmsg->lmsg, 0);
}
722 
/*
 * Synchronously detach an ifq's staging state from every CPU by
 * sending ifq_stage_detach_handler to each netisr port in turn.
 */
static void
ifq_stage_detach(struct ifaltq *ifq)
{
	struct netmsg_base base;
	int cpu;

	netmsg_init(&base, NULL, &curthread->td_msgport, 0,
	    ifq_stage_detach_handler);
	base.lmsg.u.ms_resultp = ifq;

	/* lwkt_domsg blocks until each CPU has replied */
	for (cpu = 0; cpu < ncpus; ++cpu)
		lwkt_domsg(netisr_portfn(cpu), &base.lmsg, 0);
}
736 
/*
 * Detach an interface, removing it from the
 * list of "active" interfaces.
 *
 * Teardown order: event handlers, polling, if_down, ALTQ, addresses,
 * per-AF structures, routes on every CPU, announcements, domain
 * detach hooks, index table, and finally per-CPU resources.
 */
void
if_detach(struct ifnet *ifp)
{
	struct radix_node_head	*rnh;
	int i;
	int cpu, origcpu;
	struct domain *dp;

	EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);

	/*
	 * Remove routes and flush queues.
	 */
	crit_enter();
#ifdef IFPOLL_ENABLE
	if (ifp->if_flags & IFF_NPOLLING)
		ifpoll_deregister(ifp);
#endif
	if_down(ifp);

#ifdef ALTQ
	if (ifq_is_enabled(&ifp->if_snd))
		altq_disable(&ifp->if_snd);
	if (ifq_is_attached(&ifp->if_snd))
		altq_detach(&ifp->if_snd);
#endif

	/*
	 * Clean up all addresses.
	 */
	ifp->if_lladdr = NULL;

	if_purgeaddrs_nolink(ifp);
	if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
		struct ifaddr *ifa;

		/* Only the AF_LINK ifaddr may remain at this point */
		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
		KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
			("non-link ifaddr is left on if_addrheads"));

		ifa_ifunlink(ifa, ifp);
		ifa_destroy(ifa);
		KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
			("there are still ifaddrs left on if_addrheads"));
	}

#ifdef INET
	/*
	 * Remove all IPv4 kernel structures related to ifp.
	 */
	in_ifdetach(ifp);
#endif

#ifdef INET6
	/*
	 * Remove all IPv6 kernel structs related to ifp.  This should be done
	 * before removing routing entries below, since IPv6 interface direct
	 * routes are expected to be removed by the IPv6-specific kernel API.
	 * Otherwise, the kernel will detect some inconsistency and bark it.
	 */
	in6_ifdetach(ifp);
#endif

	/*
	 * Delete all remaining routes using this interface
	 * Unfortuneatly the only way to do this is to slog through
	 * the entire routing table looking for routes which point
	 * to this interface...oh well...
	 */
	origcpu = mycpuid;
	/* Routing tables are replicated per CPU; visit each in turn */
	for (cpu = 0; cpu < ncpus; cpu++) {
		lwkt_migratecpu(cpu);
		for (i = 1; i <= AF_MAX; i++) {
			if ((rnh = rt_tables[cpu][i]) == NULL)
				continue;
			rnh->rnh_walktree(rnh, if_rtdel, ifp);
		}
	}
	lwkt_migratecpu(origcpu);

	/* Announce that the interface is gone. */
	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
	devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);

	SLIST_FOREACH(dp, &domains, dom_next)
		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
			(*dp->dom_ifdetach)(ifp,
				ifp->if_afdata[dp->dom_family]);

	/*
	 * Remove interface from ifindex2ifp[] and maybe decrement if_index.
	 */
	ifindex2ifnet[ifp->if_index] = NULL;
	while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
		if_index--;

	TAILQ_REMOVE(&ifnet, ifp, if_link);
	kfree(ifp->if_addrheads, M_IFADDR);

	/* Drain in-flight IPIs before freeing per-CPU state */
	lwkt_synchronize_ipiqs("if_detach");
	ifq_stage_detach(&ifp->if_snd);

	kfree(ifp->if_start_nmsg, M_LWKTMSG);
	kfree(ifp->if_snd.altq_stage, M_DEVBUF);
	crit_exit();
}
847 
848 /*
849  * Create interface group without members
850  */
851 struct ifg_group *
852 if_creategroup(const char *groupname)
853 {
854         struct ifg_group        *ifg = NULL;
855 
856         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
857             M_TEMP, M_NOWAIT)) == NULL)
858                 return (NULL);
859 
860         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
861         ifg->ifg_refcnt = 0;
862         ifg->ifg_carp_demoted = 0;
863         TAILQ_INIT(&ifg->ifg_members);
864 #if NPF > 0
865         pfi_attach_ifgroup(ifg);
866 #endif
867         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
868 
869         return (ifg);
870 }
871 
872 /*
873  * Add a group to an interface
874  */
875 int
876 if_addgroup(struct ifnet *ifp, const char *groupname)
877 {
878 	struct ifg_list		*ifgl;
879 	struct ifg_group	*ifg = NULL;
880 	struct ifg_member	*ifgm;
881 
882 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
883 	    groupname[strlen(groupname) - 1] <= '9')
884 		return (EINVAL);
885 
886 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
887 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
888 			return (EEXIST);
889 
890 	if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
891 		return (ENOMEM);
892 
893 	if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
894 		kfree(ifgl, M_TEMP);
895 		return (ENOMEM);
896 	}
897 
898 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
899 		if (!strcmp(ifg->ifg_group, groupname))
900 			break;
901 
902 	if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
903 		kfree(ifgl, M_TEMP);
904 		kfree(ifgm, M_TEMP);
905 		return (ENOMEM);
906 	}
907 
908 	ifg->ifg_refcnt++;
909 	ifgl->ifgl_group = ifg;
910 	ifgm->ifgm_ifp = ifp;
911 
912 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
913 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
914 
915 #if NPF > 0
916 	pfi_group_change(groupname);
917 #endif
918 
919 	return (0);
920 }
921 
/*
 * Remove a group from an interface
 *
 * Returns ENOENT if the interface is not a member; destroys the
 * group itself when the last reference is dropped.
 */
int
if_delgroup(struct ifnet *ifp, const char *groupname)
{
	struct ifg_list		*ifgl;
	struct ifg_member	*ifgm;

	/* Find the interface's membership record */
	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
			break;
	if (ifgl == NULL)
		return (ENOENT);

	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);

	/* Remove this interface from the group's member list */
	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
		if (ifgm->ifgm_ifp == ifp)
			break;

	if (ifgm != NULL) {
		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
		kfree(ifgm, M_TEMP);
	}

	/* Last member gone: destroy the group itself */
	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
		TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
#if NPF > 0
		pfi_detach_ifgroup(ifgl->ifgl_group);
#endif
		kfree(ifgl->ifgl_group, M_TEMP);
	}

	kfree(ifgl, M_TEMP);

#if NPF > 0
	pfi_group_change(groupname);
#endif

	return (0);
}
964 
965 /*
966  * Stores all groups from an interface in memory pointed
967  * to by data
968  */
969 int
970 if_getgroup(caddr_t data, struct ifnet *ifp)
971 {
972 	int			 len, error;
973 	struct ifg_list		*ifgl;
974 	struct ifg_req		 ifgrq, *ifgp;
975 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
976 
977 	if (ifgr->ifgr_len == 0) {
978 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
979 			ifgr->ifgr_len += sizeof(struct ifg_req);
980 		return (0);
981 	}
982 
983 	len = ifgr->ifgr_len;
984 	ifgp = ifgr->ifgr_groups;
985 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
986 		if (len < sizeof(ifgrq))
987 			return (EINVAL);
988 		bzero(&ifgrq, sizeof ifgrq);
989 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
990 		    sizeof(ifgrq.ifgrq_group));
991 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
992 		    sizeof(struct ifg_req))))
993 			return (error);
994 		len -= sizeof(ifgrq);
995 		ifgp++;
996 	}
997 
998 	return (0);
999 }
1000 
/*
 * Stores all members of a group in memory pointed to by data
 *
 * A zero ifgr_len is a size query.  Returns ENOENT for an unknown
 * group, EINVAL if the buffer is too small, or a copyout error.
 */
int
if_getgroupmembers(caddr_t data)
{
	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
	struct ifg_group	*ifg;
	struct ifg_member	*ifgm;
	struct ifg_req		 ifgrq, *ifgp;
	int			 len, error;

	/* Look up the group by name */
	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
			break;
	if (ifg == NULL)
		return (ENOENT);

	/* Size query: report required buffer length only */
	if (ifgr->ifgr_len == 0) {
		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
			ifgr->ifgr_len += sizeof(ifgrq);
		return (0);
	}

	len = ifgr->ifgr_len;
	ifgp = ifgr->ifgr_groups;
	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
		if (len < sizeof(ifgrq))
			return (EINVAL);
		bzero(&ifgrq, sizeof ifgrq);
		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
		    sizeof(ifgrq.ifgrq_member));
		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
		    sizeof(struct ifg_req))))
			return (error);
		len -= sizeof(ifgrq);
		ifgp++;
	}

	return (0);
}
1042 
1043 /*
1044  * Delete Routes for a Network Interface
1045  *
1046  * Called for each routing entry via the rnh->rnh_walktree() call above
1047  * to delete all route entries referencing a detaching network interface.
1048  *
1049  * Arguments:
1050  *	rn	pointer to node in the routing table
1051  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
1052  *
1053  * Returns:
1054  *	0	successful
1055  *	errno	failed - reason indicated
1056  *
1057  */
1058 static int
1059 if_rtdel(struct radix_node *rn, void *arg)
1060 {
1061 	struct rtentry	*rt = (struct rtentry *)rn;
1062 	struct ifnet	*ifp = arg;
1063 	int		err;
1064 
1065 	if (rt->rt_ifp == ifp) {
1066 
1067 		/*
1068 		 * Protect (sorta) against walktree recursion problems
1069 		 * with cloned routes
1070 		 */
1071 		if (!(rt->rt_flags & RTF_UP))
1072 			return (0);
1073 
1074 		err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1075 				rt_mask(rt), rt->rt_flags,
1076 				NULL);
1077 		if (err) {
1078 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
1079 		}
1080 	}
1081 
1082 	return (0);
1083 }
1084 
1085 /*
1086  * Locate an interface based on a complete address.
1087  */
1088 struct ifaddr *
1089 ifa_ifwithaddr(struct sockaddr *addr)
1090 {
1091 	struct ifnet *ifp;
1092 
1093 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1094 		struct ifaddr_container *ifac;
1095 
1096 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1097 			struct ifaddr *ifa = ifac->ifa;
1098 
1099 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1100 				continue;
1101 			if (sa_equal(addr, ifa->ifa_addr))
1102 				return (ifa);
1103 			if ((ifp->if_flags & IFF_BROADCAST) &&
1104 			    ifa->ifa_broadaddr &&
1105 			    /* IPv6 doesn't have broadcast */
1106 			    ifa->ifa_broadaddr->sa_len != 0 &&
1107 			    sa_equal(ifa->ifa_broadaddr, addr))
1108 				return (ifa);
1109 		}
1110 	}
1111 	return (NULL);
1112 }
1113 /*
1114  * Locate the point to point interface with a given destination address.
1115  */
1116 struct ifaddr *
1117 ifa_ifwithdstaddr(struct sockaddr *addr)
1118 {
1119 	struct ifnet *ifp;
1120 
1121 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1122 		struct ifaddr_container *ifac;
1123 
1124 		if (!(ifp->if_flags & IFF_POINTOPOINT))
1125 			continue;
1126 
1127 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1128 			struct ifaddr *ifa = ifac->ifa;
1129 
1130 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1131 				continue;
1132 			if (ifa->ifa_dstaddr &&
1133 			    sa_equal(addr, ifa->ifa_dstaddr))
1134 				return (ifa);
1135 		}
1136 	}
1137 	return (NULL);
1138 }
1139 
1140 /*
1141  * Find an interface on a specific network.  If many, choice
1142  * is most specific found.
1143  */
struct ifaddr *
ifa_ifwithnet(struct sockaddr *addr)
{
	struct ifnet *ifp;
	struct ifaddr *ifa_maybe = NULL;	/* best (most specific) match so far */
	u_int af = addr->sa_family;
	char *addr_data = addr->sa_data, *cplim;

	/*
	 * AF_LINK addresses can be looked up directly by their index number,
	 * so do that if we can.
	 */
	if (af == AF_LINK) {
		struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;

		if (sdl->sdl_index && sdl->sdl_index <= if_index)
			return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
	}

	/*
	 * Scan through each interface, looking for ones that have
	 * addresses in this address family.
	 */
	TAILQ_FOREACH(ifp, &ifnet, if_link) {
		struct ifaddr_container *ifac;

		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ifa = ifac->ifa;
			char *cp, *cp2, *cp3;

			/*
			 * NOTE: the "next:" label binds to this continue;
			 * it is the target of the goto in the byte-compare
			 * loop below, advancing the inner loop.
			 */
			if (ifa->ifa_addr->sa_family != af)
next:				continue;
			if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
				/*
				 * This is a bit broken as it doesn't
				 * take into account that the remote end may
				 * be a single node in the network we are
				 * looking for.
				 * The trouble is that we don't know the
				 * netmask for the remote end.
				 */
				if (ifa->ifa_dstaddr != NULL &&
				    sa_equal(addr, ifa->ifa_dstaddr))
					return (ifa);
			} else {
				/*
				 * if we have a special address handler,
				 * then use it instead of the generic one.
				 */
				if (ifa->ifa_claim_addr) {
					if ((*ifa->ifa_claim_addr)(ifa, addr)) {
						return (ifa);
					} else {
						continue;
					}
				}

				/*
				 * Scan all the bits in the ifa's address.
				 * If a bit disagrees with what we are
				 * looking for, mask it with the netmask
				 * to see if it really matters.
				 * (A byte at a time)
				 */
				if (ifa->ifa_netmask == 0)
					continue;
				cp = addr_data;
				cp2 = ifa->ifa_addr->sa_data;
				cp3 = ifa->ifa_netmask->sa_data;
				cplim = ifa->ifa_netmask->sa_len +
					(char *)ifa->ifa_netmask;
				while (cp3 < cplim)
					if ((*cp++ ^ *cp2++) & *cp3++)
						goto next; /* next address! */
				/*
				 * If the netmask of what we just found
				 * is more specific than what we had before
				 * (if we had one) then remember the new one
				 * before continuing to search
				 * for an even better one.
				 */
				if (ifa_maybe == NULL ||
				    rn_refines((char *)ifa->ifa_netmask,
					       (char *)ifa_maybe->ifa_netmask))
					ifa_maybe = ifa;
			}
		}
	}
	return (ifa_maybe);
}
1234 
1235 /*
1236  * Find an interface address specific to an interface best matching
1237  * a given address.
1238  */
1239 struct ifaddr *
1240 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1241 {
1242 	struct ifaddr_container *ifac;
1243 	char *cp, *cp2, *cp3;
1244 	char *cplim;
1245 	struct ifaddr *ifa_maybe = NULL;
1246 	u_int af = addr->sa_family;
1247 
1248 	if (af >= AF_MAX)
1249 		return (0);
1250 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1251 		struct ifaddr *ifa = ifac->ifa;
1252 
1253 		if (ifa->ifa_addr->sa_family != af)
1254 			continue;
1255 		if (ifa_maybe == NULL)
1256 			ifa_maybe = ifa;
1257 		if (ifa->ifa_netmask == NULL) {
1258 			if (sa_equal(addr, ifa->ifa_addr) ||
1259 			    (ifa->ifa_dstaddr != NULL &&
1260 			     sa_equal(addr, ifa->ifa_dstaddr)))
1261 				return (ifa);
1262 			continue;
1263 		}
1264 		if (ifp->if_flags & IFF_POINTOPOINT) {
1265 			if (sa_equal(addr, ifa->ifa_dstaddr))
1266 				return (ifa);
1267 		} else {
1268 			cp = addr->sa_data;
1269 			cp2 = ifa->ifa_addr->sa_data;
1270 			cp3 = ifa->ifa_netmask->sa_data;
1271 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1272 			for (; cp3 < cplim; cp3++)
1273 				if ((*cp++ ^ *cp2++) & *cp3)
1274 					break;
1275 			if (cp3 == cplim)
1276 				return (ifa);
1277 		}
1278 	}
1279 	return (ifa_maybe);
1280 }
1281 
1282 /*
1283  * Default action when installing a route with a Link Level gateway.
1284  * Lookup an appropriate real ifa to point to.
1285  * This should be moved to /sys/net/link.c eventually.
1286  */
1287 static void
1288 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
1289 {
1290 	struct ifaddr *ifa;
1291 	struct sockaddr *dst;
1292 	struct ifnet *ifp;
1293 
1294 	if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1295 	    (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1296 		return;
1297 	ifa = ifaof_ifpforaddr(dst, ifp);
1298 	if (ifa != NULL) {
1299 		IFAFREE(rt->rt_ifa);
1300 		IFAREF(ifa);
1301 		rt->rt_ifa = ifa;
1302 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1303 			ifa->ifa_rtrequest(cmd, rt, info);
1304 	}
1305 }
1306 
1307 /*
1308  * Mark an interface down and notify protocols of
1309  * the transition.
 * NOTE: must be called at splnet or equivalent.
1311  */
1312 void
1313 if_unroute(struct ifnet *ifp, int flag, int fam)
1314 {
1315 	struct ifaddr_container *ifac;
1316 
1317 	ifp->if_flags &= ~flag;
1318 	getmicrotime(&ifp->if_lastchange);
1319 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1320 		struct ifaddr *ifa = ifac->ifa;
1321 
1322 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1323 			kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1324 	}
1325 	ifq_purge_all(&ifp->if_snd);
1326 	rt_ifmsg(ifp);
1327 }
1328 
1329 /*
1330  * Mark an interface up and notify protocols of
1331  * the transition.
 * NOTE: must be called at splnet or equivalent.
1333  */
1334 void
1335 if_route(struct ifnet *ifp, int flag, int fam)
1336 {
1337 	struct ifaddr_container *ifac;
1338 
1339 	ifq_purge_all(&ifp->if_snd);
1340 	ifp->if_flags |= flag;
1341 	getmicrotime(&ifp->if_lastchange);
1342 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1343 		struct ifaddr *ifa = ifac->ifa;
1344 
1345 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1346 			kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1347 	}
1348 	rt_ifmsg(ifp);
1349 #ifdef INET6
1350 	in6_if_up(ifp);
1351 #endif
1352 }
1353 
1354 /*
1355  * Mark an interface down and notify protocols of the transition.  An
1356  * interface going down is also considered to be a synchronizing event.
1357  * We must ensure that all packet processing related to the interface
1358  * has completed before we return so e.g. the caller can free the ifnet
1359  * structure that the mbufs may be referencing.
1360  *
 * NOTE: must be called at splnet or equivalent.
1362  */
void
if_down(struct ifnet *ifp)
{
	/* Clear IFF_UP and notify all protocol families. */
	if_unroute(ifp, IFF_UP, AF_UNSPEC);
	/*
	 * Synchronize with the network message threads so all in-flight
	 * packet processing referencing this ifp has drained before we
	 * return to the caller.
	 */
	netmsg_service_sync();
}
1369 
1370 /*
1371  * Mark an interface up and notify protocols of
1372  * the transition.
 * NOTE: must be called at splnet or equivalent.
1374  */
void
if_up(struct ifnet *ifp)
{
	/* Set IFF_UP and notify all protocol families. */
	if_route(ifp, IFF_UP, AF_UNSPEC);
}
1380 
1381 /*
1382  * Process a link state change.
1383  * NOTE: must be called at splsoftnet or equivalent.
1384  */
1385 void
1386 if_link_state_change(struct ifnet *ifp)
1387 {
1388 	int link_state = ifp->if_link_state;
1389 
1390 	rt_ifmsg(ifp);
1391 	devctl_notify("IFNET", ifp->if_xname,
1392 	    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1393 }
1394 
1395 /*
1396  * Handle interface watchdog timer routines.  Called
1397  * from softclock, we decrement timers (if set) and
1398  * call the appropriate interface routine on expiration.
1399  */
1400 static void
1401 if_slowtimo(void *arg)
1402 {
1403 	struct ifnet *ifp;
1404 
1405 	crit_enter();
1406 
1407 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1408 		if (ifp->if_timer == 0 || --ifp->if_timer)
1409 			continue;
1410 		if (ifp->if_watchdog) {
1411 			if (ifnet_tryserialize_all(ifp)) {
1412 				(*ifp->if_watchdog)(ifp);
1413 				ifnet_deserialize_all(ifp);
1414 			} else {
1415 				/* try again next timeout */
1416 				++ifp->if_timer;
1417 			}
1418 		}
1419 	}
1420 
1421 	crit_exit();
1422 
1423 	callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
1424 }
1425 
1426 /*
1427  * Map interface name to
1428  * interface structure pointer.
1429  */
1430 struct ifnet *
1431 ifunit(const char *name)
1432 {
1433 	struct ifnet *ifp;
1434 
1435 	/*
1436 	 * Search all the interfaces for this name/number
1437 	 */
1438 
1439 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1440 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1441 			break;
1442 	}
1443 	return (ifp);
1444 }
1445 
1446 
1447 /*
1448  * Map interface name in a sockaddr_dl to
1449  * interface structure pointer.
1450  */
1451 struct ifnet *
1452 if_withname(struct sockaddr *sa)
1453 {
1454 	char ifname[IFNAMSIZ+1];
1455 	struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1456 
1457 	if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1458 	     (sdl->sdl_nlen > IFNAMSIZ) )
1459 		return NULL;
1460 
1461 	/*
1462 	 * ifunit wants a null-terminated name.  It may not be null-terminated
1463 	 * in the sockaddr.  We don't want to change the caller's sockaddr,
1464 	 * and there might not be room to put the trailing null anyway, so we
1465 	 * make a local copy that we know we can null terminate safely.
1466 	 */
1467 
1468 	bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1469 	ifname[sdl->sdl_nlen] = '\0';
1470 	return ifunit(ifname);
1471 }
1472 
1473 
1474 /*
1475  * Interface ioctls.
1476  */
/*
 * Dispatch an interface ioctl on behalf of a socket.  Global requests
 * (SIOCGIFCONF, clone create/destroy/list) are handled before any
 * interface lookup; per-interface requests look up the ifp by name and
 * run under the interface's ioctl mutex.  Unrecognized commands fall
 * through to the socket protocol via so_pru_control_direct().
 */
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
{
	struct ifnet *ifp;
	struct ifreq *ifr;
	struct ifstat *ifs;
	int error;
	short oif_flags;
	int new_flags;
#ifdef COMPAT_43
	int ocmd;
#endif
	size_t namelen, onamelen;
	char new_name[IFNAMSIZ];
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;

	/* Whole-configuration dumps do not reference a single interface. */
	switch (cmd) {
	case SIOCGIFCONF:
	case OSIOCGIFCONF:
		return (ifconf(cmd, data, cred));
	default:
		break;
	}

	ifr = (struct ifreq *)data;

	/* Cloning operations also precede the per-interface lookup. */
	switch (cmd) {
	case SIOCIFCREATE:
	case SIOCIFCREATE2:
		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
			return (error);
		return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
		    	cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
	case SIOCIFDESTROY:
		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
			return (error);
		return (if_clone_destroy(ifr->ifr_name));
	case SIOCIFGCLONERS:
		return (if_clone_list((struct if_clonereq *)data));
	default:
		break;
	}

	/*
	 * Nominal ioctl through interface, lookup the ifp and obtain a
	 * lock to serialize the ifconfig ioctl operation.
	 */
	ifp = ifunit(ifr->ifr_name);
	if (ifp == NULL)
		return (ENXIO);
	error = 0;
	mtx_lock(&ifp->if_ioctl_mtx);

	switch (cmd) {
	case SIOCGIFINDEX:
		ifr->ifr_index = ifp->if_index;
		break;

	case SIOCGIFFLAGS:
		ifr->ifr_flags = ifp->if_flags;
		ifr->ifr_flagshigh = ifp->if_flags >> 16;
		break;

	case SIOCGIFCAP:
		ifr->ifr_reqcap = ifp->if_capabilities;
		ifr->ifr_curcap = ifp->if_capenable;
		break;

	case SIOCGIFMETRIC:
		ifr->ifr_metric = ifp->if_metric;
		break;

	case SIOCGIFMTU:
		ifr->ifr_mtu = ifp->if_mtu;
		break;

	case SIOCGIFDATA:
		error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
				sizeof(ifp->if_data));
		break;

	case SIOCGIFPHYS:
		ifr->ifr_phys = ifp->if_physical;
		break;

	case SIOCGIFPOLLCPU:
		/* Polling CPU is not reported here; -1 = none. */
		ifr->ifr_pollcpu = -1;
		break;

	case SIOCSIFPOLLCPU:
		break;

	case SIOCSIFFLAGS:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		/* Reassemble the 32-bit flags from the two 16-bit halves. */
		new_flags = (ifr->ifr_flags & 0xffff) |
		    (ifr->ifr_flagshigh << 16);
		if (ifp->if_flags & IFF_SMART) {
			/* Smart drivers twiddle their own routes */
		} else if (ifp->if_flags & IFF_UP &&
		    (new_flags & IFF_UP) == 0) {
			crit_enter();
			if_down(ifp);
			crit_exit();
		} else if (new_flags & IFF_UP &&
		    (ifp->if_flags & IFF_UP) == 0) {
			crit_enter();
			if_up(ifp);
			crit_exit();
		}

#ifdef IFPOLL_ENABLE
		/* Register/deregister polling when IFF_NPOLLING toggles. */
		if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
			if (new_flags & IFF_NPOLLING)
				ifpoll_register(ifp);
			else
				ifpoll_deregister(ifp);
		}
#endif

		/* Merge, preserving the bits userland may not change. */
		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
			(new_flags &~ IFF_CANTCHANGE);
		if (new_flags & IFF_PPROMISC) {
			/* Permanently promiscuous mode requested */
			ifp->if_flags |= IFF_PROMISC;
		} else if (ifp->if_pcount == 0) {
			ifp->if_flags &= ~IFF_PROMISC;
		}
		if (ifp->if_ioctl) {
			ifnet_serialize_all(ifp);
			ifp->if_ioctl(ifp, cmd, data, cred);
			ifnet_deserialize_all(ifp);
		}
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFCAP:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		/* Reject capability bits the hardware does not support. */
		if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
			error = EINVAL;
			break;
		}
		ifnet_serialize_all(ifp);
		ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		break;

	case SIOCSIFNAME:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
		if (error)
			break;
		if (new_name[0] == '\0') {
			error = EINVAL;
			break;
		}
		if (ifunit(new_name) != NULL) {
			error = EEXIST;
			break;
		}

		/* Rename = detach under old name, re-attach under new one. */
		EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);

		/* Announce the departure of the interface. */
		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);

		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
		/* XXX IFA_LOCK(ifa); */
		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
		namelen = strlen(new_name);
		onamelen = sdl->sdl_nlen;
		/*
		 * Move the address if needed.  This is safe because we
		 * allocate space for a name of length IFNAMSIZ when we
		 * create this in if_attach().
		 */
		if (namelen != onamelen) {
			bcopy(sdl->sdl_data + onamelen,
			    sdl->sdl_data + namelen, sdl->sdl_alen);
		}
		bcopy(new_name, sdl->sdl_data, namelen);
		sdl->sdl_nlen = namelen;
		/*
		 * The netmask entry mirrors the name with 0xff bytes;
		 * clear the old mask and rebuild it for the new length.
		 */
		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
		bzero(sdl->sdl_data, onamelen);
		while (namelen != 0)
			sdl->sdl_data[--namelen] = 0xff;
		/* XXX IFA_UNLOCK(ifa) */

		EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);

		/* Announce the return of the interface. */
		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
		break;

	case SIOCSIFMETRIC:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		ifp->if_metric = ifr->ifr_metric;
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFPHYS:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		if (ifp->if_ioctl == NULL) {
		        error = EOPNOTSUPP;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFMTU:
	{
		u_long oldmtu = ifp->if_mtu;

		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		if (ifp->if_ioctl == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
			error = EINVAL;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		if (error == 0) {
			getmicrotime(&ifp->if_lastchange);
			rt_ifmsg(ifp);
		}
		/*
		 * If the link MTU changed, do network layer specific procedure.
		 */
		if (ifp->if_mtu != oldmtu) {
#ifdef INET6
			nd6_setmtu(ifp);
#endif
		}
		break;
	}

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;

		/* Don't allow group membership on non-multicast interfaces. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EOPNOTSUPP;
			break;
		}

		/* Don't let users screw up protocols' entries. */
		if (ifr->ifr_addr.sa_family != AF_LINK) {
			error = EINVAL;
			break;
		}

		if (cmd == SIOCADDMULTI) {
			struct ifmultiaddr *ifma;
			error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
		} else {
			error = if_delmulti(ifp, &ifr->ifr_addr);
		}
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFPHYADDR:
	case SIOCDIFPHYADDR:
#ifdef INET6
	case SIOCSIFPHYADDR_IN6:
#endif
	case SIOCSLIFPHYADDR:
        case SIOCSIFMEDIA:
	case SIOCSIFGENERIC:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		if (ifp->if_ioctl == 0) {
			error = EOPNOTSUPP;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCGIFSTATUS:
		ifs = (struct ifstat *)data;
		ifs->ascii[0] = '\0';
		/* fall through */
	case SIOCGIFPSRCADDR:
	case SIOCGIFPDSTADDR:
	case SIOCGLIFPHYADDR:
	case SIOCGIFMEDIA:
	case SIOCGIFGENERIC:
		if (ifp->if_ioctl == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		break;

	case SIOCSIFLLADDR:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
				     ifr->ifr_addr.sa_len);
		EVENTHANDLER_INVOKE(iflladdr_event, ifp);
		break;

	default:
		/* Unknown command: hand it to the socket protocol. */
		oif_flags = ifp->if_flags;
		if (so->so_proto == 0) {
			error = EOPNOTSUPP;
			break;
		}
#ifndef COMPAT_43
		error = so_pru_control_direct(so, cmd, data, ifp);
#else
		ocmd = cmd;

		/* Translate old-style (4.3BSD) ioctls to their new form. */
		switch (cmd) {
		case SIOCSIFDSTADDR:
		case SIOCSIFADDR:
		case SIOCSIFBRDADDR:
		case SIOCSIFNETMASK:
#if BYTE_ORDER != BIG_ENDIAN
			if (ifr->ifr_addr.sa_family == 0 &&
			    ifr->ifr_addr.sa_len < 16) {
				ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
				ifr->ifr_addr.sa_len = 16;
			}
#else
			if (ifr->ifr_addr.sa_len == 0)
				ifr->ifr_addr.sa_len = 16;
#endif
			break;
		case OSIOCGIFADDR:
			cmd = SIOCGIFADDR;
			break;
		case OSIOCGIFDSTADDR:
			cmd = SIOCGIFDSTADDR;
			break;
		case OSIOCGIFBRDADDR:
			cmd = SIOCGIFBRDADDR;
			break;
		case OSIOCGIFNETMASK:
			cmd = SIOCGIFNETMASK;
			break;
		default:
			break;
		}

		error = so_pru_control_direct(so, cmd, data, ifp);

		/* Convert the result back to the old sockaddr layout. */
		switch (ocmd) {
		case OSIOCGIFADDR:
		case OSIOCGIFDSTADDR:
		case OSIOCGIFBRDADDR:
		case OSIOCGIFNETMASK:
			*(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
			break;
		}
#endif /* COMPAT_43 */

		if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
			DELAY(100);/* XXX: temporary workaround for fxp issue*/
			if (ifp->if_flags & IFF_UP) {
				crit_enter();
				in6_if_up(ifp);
				crit_exit();
			}
#endif
		}
		break;
	}

	mtx_unlock(&ifp->if_ioctl_mtx);
	return (error);
}
1882 
1883 /*
1884  * Set/clear promiscuous mode on interface ifp based on the truth value
1885  * of pswitch.  The calls are reference counted so that only the first
1886  * "on" request actually has an effect, as does the final "off" request.
1887  * Results are undefined if the "off" and "on" requests are not matched.
1888  */
1889 int
1890 ifpromisc(struct ifnet *ifp, int pswitch)
1891 {
1892 	struct ifreq ifr;
1893 	int error;
1894 	int oldflags;
1895 
1896 	oldflags = ifp->if_flags;
1897 	if (ifp->if_flags & IFF_PPROMISC) {
1898 		/* Do nothing if device is in permanently promiscuous mode */
1899 		ifp->if_pcount += pswitch ? 1 : -1;
1900 		return (0);
1901 	}
1902 	if (pswitch) {
1903 		/*
1904 		 * If the device is not configured up, we cannot put it in
1905 		 * promiscuous mode.
1906 		 */
1907 		if ((ifp->if_flags & IFF_UP) == 0)
1908 			return (ENETDOWN);
1909 		if (ifp->if_pcount++ != 0)
1910 			return (0);
1911 		ifp->if_flags |= IFF_PROMISC;
1912 		log(LOG_INFO, "%s: promiscuous mode enabled\n",
1913 		    ifp->if_xname);
1914 	} else {
1915 		if (--ifp->if_pcount > 0)
1916 			return (0);
1917 		ifp->if_flags &= ~IFF_PROMISC;
1918 		log(LOG_INFO, "%s: promiscuous mode disabled\n",
1919 		    ifp->if_xname);
1920 	}
1921 	ifr.ifr_flags = ifp->if_flags;
1922 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
1923 	ifnet_serialize_all(ifp);
1924 	error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
1925 	ifnet_deserialize_all(ifp);
1926 	if (error == 0)
1927 		rt_ifmsg(ifp);
1928 	else
1929 		ifp->if_flags = oldflags;
1930 	return error;
1931 }
1932 
1933 /*
1934  * Return interface configuration
1935  * of system.  List may be used
1936  * in later ioctl's (above) to get
1937  * other information.
1938  */
/*
 * Copy the interface configuration (name + address pairs) out to the
 * user buffer described by the struct ifconf.  Records that don't fit
 * are silently truncated; ifc_len is updated to the bytes actually
 * written.  Addresses larger than a struct sockaddr are written as
 * variable-length records, so ifrp does not advance uniformly.
 */
static int
ifconf(u_long cmd, caddr_t data, struct ucred *cred)
{
	struct ifconf *ifc = (struct ifconf *)data;
	struct ifnet *ifp;
	struct sockaddr *sa;
	struct ifreq ifr, *ifrp;
	int space = ifc->ifc_len, error = 0;

	ifrp = ifc->ifc_req;
	TAILQ_FOREACH(ifp, &ifnet, if_link) {
		struct ifaddr_container *ifac;
		int addrs;

		if (space <= sizeof ifr)
			break;

		/*
		 * Zero the stack declared structure first to prevent
		 * memory disclosure.
		 */
		bzero(&ifr, sizeof(ifr));
		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
		    >= sizeof(ifr.ifr_name)) {
			error = ENAMETOOLONG;
			break;
		}

		addrs = 0;
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ifa = ifac->ifa;

			if (space <= sizeof ifr)
				break;
			sa = ifa->ifa_addr;
			/* Jailed processes only see their own addresses. */
			if (cred->cr_prison &&
			    prison_if(cred, sa))
				continue;
			addrs++;
#ifdef COMPAT_43
			if (cmd == OSIOCGIFCONF) {
				/* Old layout: family stored as a short. */
				struct osockaddr *osa =
					 (struct osockaddr *)&ifr.ifr_addr;
				ifr.ifr_addr = *sa;
				osa->sa_family = sa->sa_family;
				error = copyout(&ifr, ifrp, sizeof ifr);
				ifrp++;
			} else
#endif
			if (sa->sa_len <= sizeof(*sa)) {
				/* Fixed-size record: sockaddr fits inline. */
				ifr.ifr_addr = *sa;
				error = copyout(&ifr, ifrp, sizeof ifr);
				ifrp++;
			} else {
				/*
				 * Oversized sockaddr: write the name, then
				 * the full sockaddr, and advance ifrp past
				 * the variable-length tail.
				 */
				if (space < (sizeof ifr) + sa->sa_len -
					    sizeof(*sa))
					break;
				space -= sa->sa_len - sizeof(*sa);
				error = copyout(&ifr, ifrp,
						sizeof ifr.ifr_name);
				if (error == 0)
					error = copyout(sa, &ifrp->ifr_addr,
							sa->sa_len);
				ifrp = (struct ifreq *)
					(sa->sa_len + (caddr_t)&ifrp->ifr_addr);
			}
			if (error)
				break;
			space -= sizeof ifr;
		}
		if (error)
			break;
		/* Address-less interface: emit one record with a zero addr. */
		if (!addrs) {
			bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
			error = copyout(&ifr, ifrp, sizeof ifr);
			if (error)
				break;
			space -= sizeof ifr;
			ifrp++;
		}
	}
	/* Report how much of the user buffer was consumed. */
	ifc->ifc_len -= space;
	return (error);
}
2023 
2024 /*
2025  * Just like if_promisc(), but for all-multicast-reception mode.
2026  */
2027 int
2028 if_allmulti(struct ifnet *ifp, int onswitch)
2029 {
2030 	int error = 0;
2031 	struct ifreq ifr;
2032 
2033 	crit_enter();
2034 
2035 	if (onswitch) {
2036 		if (ifp->if_amcount++ == 0) {
2037 			ifp->if_flags |= IFF_ALLMULTI;
2038 			ifr.ifr_flags = ifp->if_flags;
2039 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2040 			ifnet_serialize_all(ifp);
2041 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2042 					      NULL);
2043 			ifnet_deserialize_all(ifp);
2044 		}
2045 	} else {
2046 		if (ifp->if_amcount > 1) {
2047 			ifp->if_amcount--;
2048 		} else {
2049 			ifp->if_amcount = 0;
2050 			ifp->if_flags &= ~IFF_ALLMULTI;
2051 			ifr.ifr_flags = ifp->if_flags;
2052 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2053 			ifnet_serialize_all(ifp);
2054 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2055 					      NULL);
2056 			ifnet_deserialize_all(ifp);
2057 		}
2058 	}
2059 
2060 	crit_exit();
2061 
2062 	if (error == 0)
2063 		rt_ifmsg(ifp);
2064 	return error;
2065 }
2066 
2067 /*
2068  * Add a multicast listenership to the interface in question.
2069  * The link layer provides a routine which converts
2070  */
/*
 * Join 'sa' on 'ifp', bumping a refcount if the membership already
 * exists.  If the link layer resolves 'sa' to a separate AF_LINK
 * address, a second (link-layer) membership entry is created or
 * referenced as well.  On success *retifma (if non-NULL) points at
 * the network-layer entry.
 */
int
if_addmulti(
	struct ifnet *ifp,	/* interface to manipulate */
	struct sockaddr *sa,	/* address to add */
	struct ifmultiaddr **retifma)
{
	struct sockaddr *llsa, *dupsa;
	int error;
	struct ifmultiaddr *ifma;

	/*
	 * If the matching multicast address already exists
	 * then don't add a new one, just add a reference
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (sa_equal(sa, ifma->ifma_addr)) {
			ifma->ifma_refcount++;
			if (retifma)
				*retifma = ifma;
			return 0;
		}
	}

	/*
	 * Give the link layer a chance to accept/reject it, and also
	 * find out which AF_LINK address this maps to, if it isn't one
	 * already.
	 */
	if (ifp->if_resolvemulti) {
		ifnet_serialize_all(ifp);
		error = ifp->if_resolvemulti(ifp, &llsa, sa);
		ifnet_deserialize_all(ifp);
		if (error)
			return error;
	} else {
		llsa = NULL;
	}

	/* Build the new membership record around a private address copy. */
	ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
	dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
	bcopy(sa, dupsa, sa->sa_len);

	ifma->ifma_addr = dupsa;
	ifma->ifma_lladdr = llsa;
	ifma->ifma_ifp = ifp;
	ifma->ifma_refcount = 1;
	ifma->ifma_protospec = 0;
	rt_newmaddrmsg(RTM_NEWMADDR, ifma);

	/*
	 * Some network interfaces can scan the address list at
	 * interrupt time; lock them out.
	 */
	crit_enter();
	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
	crit_exit();
	if (retifma)
		*retifma = ifma;

	/*
	 * If resolution produced a distinct link-layer address, add or
	 * reference a membership entry for it as well.
	 */
	if (llsa != NULL) {
		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			if (sa_equal(ifma->ifma_addr, llsa))
				break;
		}
		if (ifma) {
			ifma->ifma_refcount++;
		} else {
			ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
			dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
			bcopy(llsa, dupsa, llsa->sa_len);
			ifma->ifma_addr = dupsa;
			ifma->ifma_ifp = ifp;
			ifma->ifma_refcount = 1;
			crit_enter();
			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
			crit_exit();
		}
	}
	/*
	 * We are certain we have added something, so call down to the
	 * interface to let them know about it.
	 */
	crit_enter();
	ifnet_serialize_all(ifp);
	if (ifp->if_ioctl)
		ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
	ifnet_deserialize_all(ifp);
	crit_exit();

	return 0;
}
2162 
2163 /*
2164  * Remove a reference to a multicast address on this interface.  Yell
2165  * if the request does not match an existing membership.
2166  */
/*
 * Drop one reference on the membership for 'sa'; the entry itself is
 * only removed (and the driver notified) when the last reference goes.
 * A companion link-layer entry, if one was created by if_addmulti(),
 * is dereferenced in a second pass.
 */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
	struct ifmultiaddr *ifma;

	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
		if (sa_equal(sa, ifma->ifma_addr))
			break;
	if (ifma == NULL)
		return ENOENT;

	/* More holders remain: just drop a reference. */
	if (ifma->ifma_refcount > 1) {
		ifma->ifma_refcount--;
		return 0;
	}

	rt_newmaddrmsg(RTM_DELMADDR, ifma);
	/* Remember the link-layer companion before freeing the record. */
	sa = ifma->ifma_lladdr;
	crit_enter();
	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
	/*
	 * Make sure the interface driver is notified
	 * in the case of a link layer mcast group being left.
	 */
	if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL) {
		ifnet_serialize_all(ifp);
		ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
		ifnet_deserialize_all(ifp);
	}
	crit_exit();
	kfree(ifma->ifma_addr, M_IFMADDR);
	kfree(ifma, M_IFMADDR);
	if (sa == NULL)
		return 0;

	/*
	 * Now look for the link-layer address which corresponds to
	 * this network address.  It had been squirreled away in
	 * ifma->ifma_lladdr for this purpose (so we don't have
	 * to call ifp->if_resolvemulti() again), and we saved that
	 * value in sa above.  If some nasty deleted the
	 * link-layer address out from underneath us, we can deal because
	 * the address we stored was not necessarily the same as the one
	 * which was in the record for the link-layer address.  (So we
	 * don't complain in that case.)
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
		if (sa_equal(sa, ifma->ifma_addr))
			break;
	if (ifma == NULL)
		return 0;

	if (ifma->ifma_refcount > 1) {
		ifma->ifma_refcount--;
		return 0;
	}

	crit_enter();
	ifnet_serialize_all(ifp);
	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
	ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
	ifnet_deserialize_all(ifp);
	crit_exit();
	kfree(ifma->ifma_addr, M_IFMADDR);
	kfree(sa, M_IFMADDR);
	kfree(ifma, M_IFMADDR);

	return 0;
}
2236 
2237 /*
2238  * Delete all multicast group membership for an interface.
2239  * Should be used to quickly flush all multicast filters.
2240  */
void
if_delallmulti(struct ifnet *ifp)
{
	struct ifmultiaddr *ifma;
	struct ifmultiaddr *next;

	/*
	 * MUTABLE iteration: if_delmulti() may unlink 'ifma', so the
	 * successor is sampled before each call.
	 *
	 * NOTE(review): when if_delmulti() drops the final reference on a
	 * network-layer membership it may also unlink and free the
	 * companion link-layer record; if that record happens to be
	 * 'next', this loop would walk freed memory.  Verify that
	 * ordering cannot occur on these lists.
	 */
	TAILQ_FOREACH_MUTABLE(ifma, &ifp->if_multiaddrs, ifma_link, next)
		if_delmulti(ifp, ifma->ifma_addr);
}
2250 
2251 
2252 /*
2253  * Set the link layer address on an interface.
2254  *
2255  * At this time we only support certain types of interfaces,
2256  * and we don't allow the length of the address to change.
2257  */
int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
	struct sockaddr_dl *sdl;
	struct ifreq ifr;

	sdl = IF_LLSOCKADDR(ifp);
	if (sdl == NULL)
		return (EINVAL);
	if (len != sdl->sdl_alen)	/* don't allow length to change */
		return (EINVAL);
	switch (ifp->if_type) {
	case IFT_ETHER:			/* these types use struct arpcom */
	case IFT_XETHER:
	case IFT_L2VLAN:
		/* Update both the arpcom copy and the AF_LINK sockaddr. */
		bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
		bcopy(lladdr, LLADDR(sdl), len);
		break;
	default:
		return (ENODEV);
	}
	/*
	 * If the interface is already up, we need
	 * to re-init it in order to reprogram its
	 * address filter.
	 */
	ifnet_serialize_all(ifp);
	if ((ifp->if_flags & IFF_UP) != 0) {
#ifdef INET
		struct ifaddr_container *ifac;
#endif

		/* Bounce the interface: down then up via SIOCSIFFLAGS. */
		ifp->if_flags &= ~IFF_UP;
		ifr.ifr_flags = ifp->if_flags;
		ifr.ifr_flagshigh = ifp->if_flags >> 16;
		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
			      NULL);
		ifp->if_flags |= IFF_UP;
		ifr.ifr_flags = ifp->if_flags;
		ifr.ifr_flagshigh = ifp->if_flags >> 16;
		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
				 NULL);
#ifdef INET
		/*
		 * Also send gratuitous ARPs to notify other nodes about
		 * the address change.
		 */
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ifa = ifac->ifa;

			if (ifa->ifa_addr != NULL &&
			    ifa->ifa_addr->sa_family == AF_INET)
				arp_gratuitous(ifp, ifa);
		}
#endif
	}
	ifnet_deserialize_all(ifp);
	return (0);
}
2317 
2318 struct ifmultiaddr *
2319 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2320 {
2321 	struct ifmultiaddr *ifma;
2322 
2323 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2324 		if (sa_equal(ifma->ifma_addr, sa))
2325 			break;
2326 
2327 	return ifma;
2328 }
2329 
2330 /*
2331  * This function locates the first real ethernet MAC from a network
2332  * card and loads it into node, returning 0 on success or ENOENT if
2333  * no suitable interfaces were found.  It is used by the uuid code to
2334  * generate a unique 6-byte number.
2335  */
2336 int
2337 if_getanyethermac(uint16_t *node, int minlen)
2338 {
2339 	struct ifnet *ifp;
2340 	struct sockaddr_dl *sdl;
2341 
2342 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
2343 		if (ifp->if_type != IFT_ETHER)
2344 			continue;
2345 		sdl = IF_LLSOCKADDR(ifp);
2346 		if (sdl->sdl_alen < minlen)
2347 			continue;
2348 		bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
2349 		      minlen);
2350 		return(0);
2351 	}
2352 	return (ENOENT);
2353 }
2354 
2355 /*
2356  * The name argument must be a pointer to storage which will last as
2357  * long as the interface does.  For physical devices, the result of
2358  * device_get_name(dev) is a good choice and for pseudo-devices a
2359  * static string works well.
2360  */
2361 void
2362 if_initname(struct ifnet *ifp, const char *name, int unit)
2363 {
2364 	ifp->if_dname = name;
2365 	ifp->if_dunit = unit;
2366 	if (unit != IF_DUNIT_NONE)
2367 		ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2368 	else
2369 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
2370 }
2371 
2372 int
2373 if_printf(struct ifnet *ifp, const char *fmt, ...)
2374 {
2375 	__va_list ap;
2376 	int retval;
2377 
2378 	retval = kprintf("%s: ", ifp->if_xname);
2379 	__va_start(ap, fmt);
2380 	retval += kvprintf(fmt, ap);
2381 	__va_end(ap);
2382 	return (retval);
2383 }
2384 
2385 struct ifnet *
2386 if_alloc(uint8_t type)
2387 {
2388         struct ifnet *ifp;
2389 	size_t size;
2390 
2391 	/*
2392 	 * XXX temporary hack until arpcom is setup in if_l2com
2393 	 */
2394 	if (type == IFT_ETHER)
2395 		size = sizeof(struct arpcom);
2396 	else
2397 		size = sizeof(struct ifnet);
2398 
2399 	ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2400 
2401 	ifp->if_type = type;
2402 
2403 	if (if_com_alloc[type] != NULL) {
2404 		ifp->if_l2com = if_com_alloc[type](type, ifp);
2405 		if (ifp->if_l2com == NULL) {
2406 			kfree(ifp, M_IFNET);
2407 			return (NULL);
2408 		}
2409 	}
2410 	return (ifp);
2411 }
2412 
/* Release an ifnet obtained from if_alloc(). */
void
if_free(struct ifnet *ifp)
{
	kfree(ifp, M_IFNET);
}
2418 
/* Install the classic (non-ALTQ) FIFO handlers on an interface queue. */
void
ifq_set_classic(struct ifaltq *ifq)
{
	ifq->altq_enqueue = ifq_classic_enqueue;
	ifq->altq_dequeue = ifq_classic_dequeue;
	ifq->altq_request = ifq_classic_request;
}
2426 
2427 int
2428 ifq_classic_enqueue(struct ifaltq *ifq, struct mbuf *m,
2429 		    struct altq_pktattr *pa __unused)
2430 {
2431 	logifq(enqueue, ifq);
2432 	if (IF_QFULL(ifq)) {
2433 		m_freem(m);
2434 		return(ENOBUFS);
2435 	} else {
2436 		IF_ENQUEUE(ifq, m);
2437 		return(0);
2438 	}
2439 }
2440 
2441 struct mbuf *
2442 ifq_classic_dequeue(struct ifaltq *ifq, struct mbuf *mpolled, int op)
2443 {
2444 	struct mbuf *m;
2445 
2446 	switch (op) {
2447 	case ALTDQ_POLL:
2448 		IF_POLL(ifq, m);
2449 		break;
2450 	case ALTDQ_REMOVE:
2451 		logifq(dequeue, ifq);
2452 		IF_DEQUEUE(ifq, m);
2453 		break;
2454 	default:
2455 		panic("unsupported ALTQ dequeue op: %d", op);
2456 	}
2457 	KKASSERT(mpolled == NULL || mpolled == m);
2458 	return(m);
2459 }
2460 
2461 int
2462 ifq_classic_request(struct ifaltq *ifq, int req, void *arg)
2463 {
2464 	switch (req) {
2465 	case ALTRQ_PURGE:
2466 		IF_DRAIN(ifq);
2467 		break;
2468 	default:
2469 		panic("unsupported ALTQ request: %d", req);
2470 	}
2471 	return(0);
2472 }
2473 
/*
 * Attempt to run ifnet.if_start directly on this CPU; fall back to
 * scheduling it on the ifnet's owning CPU when the TX serializer is
 * contended or when more work remains afterwards.  Caller must hold the
 * if_start interlock (altq_started).
 */
static void
ifq_try_ifstart(struct ifaltq *ifq, int force_sched)
{
	struct ifnet *ifp = ifq->altq_ifp;
	int running = 0, need_sched;

	/*
	 * Try to do direct ifnet.if_start first, if there is
	 * contention on ifnet's serializer, ifnet.if_start will
	 * be scheduled on ifnet's CPU.
	 */
	if (!ifnet_tryserialize_tx(ifp)) {
		/*
		 * ifnet serializer contention happened,
		 * ifnet.if_start is scheduled on ifnet's
		 * CPU, and we keep going.
		 */
		logifstart(contend_sched, ifp);
		ifq_ifstart_schedule(ifq, 1);
		return;
	}

	if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
		logifstart(run, ifp);
		ifp->if_start(ifp);
		/* Still runnable after if_start: driver made progress. */
		if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
			running = 1;
	}
	need_sched = ifq_ifstart_need_schedule(ifq, running);

	ifnet_deserialize_tx(ifp);

	if (need_sched) {
		/*
		 * More data need to be transmitted, ifnet.if_start is
		 * scheduled on ifnet's CPU, and we keep going.
		 * NOTE: ifnet.if_start interlock is not released.
		 */
		logifstart(sched, ifp);
		ifq_ifstart_schedule(ifq, force_sched);
	}
}
2516 
2517 /*
2518  * IFQ packets staging mechanism:
2519  *
2520  * The packets enqueued into IFQ are staged to a certain amount before the
2521  * ifnet's if_start is called.  In this way, the driver could avoid writing
2522  * to hardware registers upon every packet, instead, hardware registers
2523  * could be written when certain amount of packets are put onto hardware
2524  * TX ring.  The measurement on several modern NICs (emx(4), igb(4), bnx(4),
2525  * bge(4), jme(4)) shows that the hardware registers writing aggregation
 * could save ~20% CPU time when 18-byte UDP datagrams are transmitted at
 * 1.48Mpps.  The performance improvement by hardware registers writing
 * aggregation is also mentioned by Luigi Rizzo's netmap paper
 * (http://info.iet.unipi.it/~luigi/netmap/).
2530  *
 * IFQ packets staging is performed for two entry points into drivers'
2532  * transmission function:
2533  * - Direct ifnet's if_start calling, i.e. ifq_try_ifstart()
2534  * - ifnet's if_start scheduling, i.e. ifq_ifstart_schedule()
2535  *
2536  * IFQ packets staging will be stopped upon any of the following conditions:
 * - If the count of packets enqueued on the current CPU is greater than
 *   or equal to ifq_stage_cntmax. (XXX this should be per-interface)
 * - If the total length of packets enqueued on the current CPU is greater
 *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
 *   cut from the hardware's MTU mainly because a full TCP segment's size
 *   is usually less than hardware's MTU.
2543  * - ifq_ifstart_schedule() is not pending on the current CPU and if_start
2544  *   interlock (if_snd.altq_started) is not released.
2545  * - The if_start_rollup(), which is registered as low priority netisr
2546  *   rollup function, is called; probably because no more work is pending
2547  *   for netisr.
2548  *
2549  * NOTE:
2550  * Currently IFQ packet staging is only performed in netisr threads.
2551  */
/*
 * Enqueue an mbuf on the interface's send queue and either start the
 * driver, schedule it on the ifnet's CPU, or stage the packet per the
 * IFQ staging rules described above.  Returns the enqueue error (the
 * mbuf is consumed either way by ifq_enqueue_locked()).
 */
int
ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
{
	struct ifaltq *ifq = &ifp->if_snd;
	int error, start = 0, len, mcast = 0, avoid_start = 0;
	struct ifaltq_stage_head *head = NULL;
	struct ifaltq_stage *stage = NULL;

	ASSERT_IFNET_NOT_SERIALIZED_TX(ifp);

	/* Sample packet length/mcast before the mbuf is handed off. */
	len = m->m_pkthdr.len;
	if (m->m_flags & M_MCAST)
		mcast = 1;

	/* Staging is only performed in netisr threads (see note above). */
	if (curthread->td_type == TD_TYPE_NETISR) {
		head = &ifq_stage_heads[mycpuid];
		stage = ifq_get_stage(ifq, mycpuid);

		stage->ifqs_cnt++;
		stage->ifqs_len += len;
		if (stage->ifqs_cnt < ifq_stage_cntmax &&
		    stage->ifqs_len < (ifp->if_mtu - max_protohdr))
			avoid_start = 1;
	}

	ALTQ_LOCK(ifq);
	error = ifq_enqueue_locked(ifq, m, pa);
	if (error) {
		/* Enqueue failed and the queue is empty: nothing to start. */
		if (!ifq_data_ready(ifq)) {
			ALTQ_UNLOCK(ifq);
			return error;
		}
		/* Queue still has data; make sure the driver gets kicked. */
		avoid_start = 0;
	}
	if (!ifq_is_started(ifq)) {
		if (avoid_start) {
			/* Stage this packet instead of starting if_start. */
			ALTQ_UNLOCK(ifq);

			KKASSERT(!error);
			if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
				ifq_stage_insert(head, stage);

			ifp->if_obytes += len;
			if (mcast)
				ifp->if_omcasts++;
			return error;
		}

		/*
		 * Hold the interlock of ifnet.if_start
		 */
		ifq_set_started(ifq);
		start = 1;
	}
	ALTQ_UNLOCK(ifq);

	if (!error) {
		ifp->if_obytes += len;
		if (mcast)
			ifp->if_omcasts++;
	}

	if (stage != NULL) {
		if (!start && (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)) {
			/* An if_start scheduling is already pending. */
			KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
			if (!avoid_start) {
				ifq_stage_remove(head, stage);
				ifq_ifstart_schedule(ifq, 1);
			}
			return error;
		}

		/* Retire this CPU's staging state. */
		if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) {
			ifq_stage_remove(head, stage);
		} else {
			stage->ifqs_cnt = 0;
			stage->ifqs_len = 0;
		}
	}

	if (!start) {
		/* Someone else holds the if_start interlock. */
		logifstart(avoid, ifp);
		return error;
	}

	ifq_try_ifstart(ifq, 0);
	return error;
}
2640 
/*
 * Allocate a zeroed ifaddr of at least sizeof(struct ifaddr) bytes plus
 * one per-CPU ifaddr_container (each starting with a refcount of 1).
 * Returns NULL only when the first allocation fails under 'flags'.
 */
void *
ifa_create(int size, int flags)
{
	struct ifaddr *ifa;
	int i;

	KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));

	ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
	if (ifa == NULL)
		return NULL;

	/*
	 * NOTE(review): this second allocation always uses M_WAITOK even
	 * when the caller passed non-blocking 'flags' above -- confirm
	 * all callers may block here.
	 */
	ifa->ifa_containers = kmalloc(ncpus * sizeof(struct ifaddr_container),
				      M_IFADDR, M_WAITOK | M_ZERO);
	ifa->ifa_ncnt = ncpus;
	for (i = 0; i < ncpus; ++i) {
		struct ifaddr_container *ifac = &ifa->ifa_containers[i];

		ifac->ifa_magic = IFA_CONTAINER_MAGIC;
		ifac->ifa = ifa;
		ifac->ifa_refcnt = 1;
	}
#ifdef IFADDR_DEBUG
	kprintf("alloc ifa %p %d\n", ifa, size);
#endif
	return ifa;
}
2668 
/*
 * Retire one per-CPU container whose refcount has dropped to zero; when
 * the last container is retired the ifaddr itself is freed.
 */
void
ifac_free(struct ifaddr_container *ifac, int cpu_id)
{
	struct ifaddr *ifa = ifac->ifa;

	KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
	KKASSERT(ifac->ifa_refcnt == 0);
	KASSERT(ifac->ifa_listmask == 0,
		("ifa is still on %#x lists", ifac->ifa_listmask));

	/* Poison the magic so stale references assert loudly. */
	ifac->ifa_magic = IFA_CONTAINER_DEAD;

#ifdef IFADDR_DEBUG_VERBOSE
	kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
#endif

	KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
		("invalid # of ifac, %d", ifa->ifa_ncnt));
	/* atomic_fetchadd returns the old value: 1 means we were last. */
	if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
#ifdef IFADDR_DEBUG
		kprintf("free ifa %p\n", ifa);
#endif
		kfree(ifa->ifa_containers, M_IFADDR);
		kfree(ifa, M_IFADDR);
	}
}
2695 
2696 static void
2697 ifa_iflink_dispatch(netmsg_t nmsg)
2698 {
2699 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2700 	struct ifaddr *ifa = msg->ifa;
2701 	struct ifnet *ifp = msg->ifp;
2702 	int cpu = mycpuid;
2703 	struct ifaddr_container *ifac;
2704 
2705 	crit_enter();
2706 
2707 	ifac = &ifa->ifa_containers[cpu];
2708 	ASSERT_IFAC_VALID(ifac);
2709 	KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
2710 		("ifaddr is on if_addrheads"));
2711 
2712 	ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
2713 	if (msg->tail)
2714 		TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
2715 	else
2716 		TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);
2717 
2718 	crit_exit();
2719 
2720 	ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2721 }
2722 
2723 void
2724 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2725 {
2726 	struct netmsg_ifaddr msg;
2727 
2728 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2729 		    0, ifa_iflink_dispatch);
2730 	msg.ifa = ifa;
2731 	msg.ifp = ifp;
2732 	msg.tail = tail;
2733 
2734 	ifa_domsg(&msg.base.lmsg, 0);
2735 }
2736 
2737 static void
2738 ifa_ifunlink_dispatch(netmsg_t nmsg)
2739 {
2740 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2741 	struct ifaddr *ifa = msg->ifa;
2742 	struct ifnet *ifp = msg->ifp;
2743 	int cpu = mycpuid;
2744 	struct ifaddr_container *ifac;
2745 
2746 	crit_enter();
2747 
2748 	ifac = &ifa->ifa_containers[cpu];
2749 	ASSERT_IFAC_VALID(ifac);
2750 	KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
2751 		("ifaddr is not on if_addrhead"));
2752 
2753 	TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
2754 	ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;
2755 
2756 	crit_exit();
2757 
2758 	ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
2759 }
2760 
2761 void
2762 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2763 {
2764 	struct netmsg_ifaddr msg;
2765 
2766 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2767 		    0, ifa_ifunlink_dispatch);
2768 	msg.ifa = ifa;
2769 	msg.ifp = ifp;
2770 
2771 	ifa_domsg(&msg.base.lmsg, 0);
2772 }
2773 
2774 static void
2775 ifa_destroy_dispatch(netmsg_t nmsg)
2776 {
2777 	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
2778 
2779 	IFAFREE(msg->ifa);
2780 	ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
2781 }
2782 
2783 void
2784 ifa_destroy(struct ifaddr *ifa)
2785 {
2786 	struct netmsg_ifaddr msg;
2787 
2788 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2789 		    0, ifa_destroy_dispatch);
2790 	msg.ifa = ifa;
2791 
2792 	ifa_domsg(&msg.base.lmsg, 0);
2793 }
2794 
/* Return the message port of the per-CPU ifnet service thread. */
struct lwkt_port *
ifnet_portfn(int cpu)
{
	return &ifnet_threads[cpu].td_msgport;
}
2800 
2801 void
2802 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
2803 {
2804 	KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
2805 
2806 	if (next_cpu < ncpus)
2807 		lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
2808 	else
2809 		lwkt_replymsg(lmsg, 0);
2810 }
2811 
/* Synchronously run a message on the given CPU's ifnet thread. */
int
ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
{
	KKASSERT(cpu < ncpus);
	return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
}
2818 
/* Asynchronously send a message to the given CPU's ifnet thread. */
void
ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
{
	KKASSERT(cpu < ncpus);
	lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
}
2825 
2826 /*
2827  * Generic netmsg service loop.  Some protocols may roll their own but all
2828  * must do the basic command dispatch function call done here.
2829  */
/*
 * Generic netmsg service loop.  Some protocols may roll their own but all
 * must do the basic command dispatch function call done here.
 */
static void
ifnet_service_loop(void *arg __unused)
{
	netmsg_t msg;

	/* Block on this thread's port and dispatch messages forever. */
	while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
		KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
		msg->base.nm_dispatch(msg);
	}
}
2840 
/*
 * Low-priority netisr rollup: flush every staged IFQ on this CPU, either
 * by scheduling if_start on the ifnet's CPU (when a schedule was already
 * requested) or by acquiring the if_start interlock and starting the
 * driver directly.
 */
static void
if_start_rollup(void)
{
	struct ifaltq_stage_head *head = &ifq_stage_heads[mycpuid];
	struct ifaltq_stage *stage;

	while ((stage = TAILQ_FIRST(&head->ifqs_head)) != NULL) {
		struct ifaltq *ifq = stage->ifqs_altq;
		int is_sched = 0;

		/* Sample the flag before ifq_stage_remove() clears state. */
		if (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)
			is_sched = 1;
		ifq_stage_remove(head, stage);

		if (is_sched) {
			ifq_ifstart_schedule(ifq, 1);
		} else {
			int start = 0;

			ALTQ_LOCK(ifq);
			if (!ifq_is_started(ifq)) {
				/*
				 * Hold the interlock of ifnet.if_start
				 */
				ifq_set_started(ifq);
				start = 1;
			}
			ALTQ_UNLOCK(ifq);

			if (start)
				ifq_try_ifstart(ifq, 1);
		}
		KKASSERT((stage->ifqs_flags &
		    (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
	}
}
2877 
/*
 * Boot-time setup: spawn one ifnet service thread per CPU, initialize
 * the per-CPU IFQ staging lists, and register the if_start rollup.
 */
static void
ifnetinit(void *dummy __unused)
{
	int i;

	for (i = 0; i < ncpus; ++i) {
		struct thread *thr = &ifnet_threads[i];

		/* Created unscheduled so the port can be set up first. */
		lwkt_create(ifnet_service_loop, NULL, NULL,
			    thr, TDF_NOSTART|TDF_FORCE_SPINPORT,
			    i, "ifnet %d", i);
		netmsg_service_port_init(&thr->td_msgport);
		lwkt_schedule(thr);
	}

	for (i = 0; i < ncpus; ++i)
		TAILQ_INIT(&ifq_stage_heads[i].ifqs_head);
	netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
}
2897 
2898 struct ifnet *
2899 ifnet_byindex(unsigned short idx)
2900 {
2901 	if (idx > if_index)
2902 		return NULL;
2903 	return ifindex2ifnet[idx];
2904 }
2905 
2906 struct ifaddr *
2907 ifaddr_byindex(unsigned short idx)
2908 {
2909 	struct ifnet *ifp;
2910 
2911 	ifp = ifnet_byindex(idx);
2912 	if (!ifp)
2913 		return NULL;
2914 	return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
2915 }
2916 
2917 void
2918 if_register_com_alloc(u_char type,
2919     if_com_alloc_t *a, if_com_free_t *f)
2920 {
2921 
2922         KASSERT(if_com_alloc[type] == NULL,
2923             ("if_register_com_alloc: %d already registered", type));
2924         KASSERT(if_com_free[type] == NULL,
2925             ("if_register_com_alloc: %d free already registered", type));
2926 
2927         if_com_alloc[type] = a;
2928         if_com_free[type] = f;
2929 }
2930 
2931 void
2932 if_deregister_com_alloc(u_char type)
2933 {
2934 
2935         KASSERT(if_com_alloc[type] != NULL,
2936             ("if_deregister_com_alloc: %d not registered", type));
2937         KASSERT(if_com_free[type] != NULL,
2938             ("if_deregister_com_alloc: %d free not registered", type));
2939         if_com_alloc[type] = NULL;
2940         if_com_free[type] = NULL;
2941 }
2942 
2943 int
2944 if_ring_count2(int cnt, int cnt_max)
2945 {
2946 	int shift = 0;
2947 
2948 	KASSERT(cnt_max >= 1 && powerof2(cnt_max),
2949 	    ("invalid ring count max %d", cnt_max));
2950 
2951 	if (cnt <= 0)
2952 		cnt = cnt_max;
2953 	if (cnt > ncpus2)
2954 		cnt = ncpus2;
2955 	if (cnt > cnt_max)
2956 		cnt = cnt_max;
2957 
2958 	while ((1 << (shift + 1)) <= cnt)
2959 		++shift;
2960 	cnt = 1 << shift;
2961 
2962 	KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
2963 	    ("calculate cnt %d, ncpus2 %d, cnt max %d",
2964 	     cnt, ncpus2, cnt_max));
2965 	return cnt;
2966 }
2967 
/*
 * Set the queue length limit, padded by the worst-case number of
 * packets that may be held in per-CPU staging (ncpus * ifq_stage_cntmax).
 */
void
ifq_set_maxlen(struct ifaltq *ifq, int len)
{
	ifq->ifq_maxlen = len + (ncpus * ifq_stage_cntmax);
}
2973