xref: /dflybsd-src/sys/net/if.c (revision 9ed293e071aa5626e1e68861be45f0002c7b0d8c)
1 /*
2  * Copyright (c) 1980, 1986, 1993
3  *	The Regents of the University of California.  All rights reserved.
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions
7  * are met:
8  * 1. Redistributions of source code must retain the above copyright
9  *    notice, this list of conditions and the following disclaimer.
10  * 2. Redistributions in binary form must reproduce the above copyright
11  *    notice, this list of conditions and the following disclaimer in the
12  *    documentation and/or other materials provided with the distribution.
13  * 3. All advertising materials mentioning features or use of this software
14  *    must display the following acknowledgement:
15  *	This product includes software developed by the University of
16  *	California, Berkeley and its contributors.
17  * 4. Neither the name of the University nor the names of its contributors
18  *    may be used to endorse or promote products derived from this software
19  *    without specific prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
22  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
25  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
26  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
27  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
28  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
29  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
30  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
31  * SUCH DAMAGE.
32  *
33  *	@(#)if.c	8.3 (Berkeley) 1/4/94
34  * $FreeBSD: src/sys/net/if.c,v 1.185 2004/03/13 02:35:03 brooks Exp $
35  */
36 
37 #include "opt_compat.h"
38 #include "opt_inet6.h"
39 #include "opt_inet.h"
40 #include "opt_ifpoll.h"
41 
42 #include <sys/param.h>
43 #include <sys/malloc.h>
44 #include <sys/mbuf.h>
45 #include <sys/systm.h>
46 #include <sys/proc.h>
47 #include <sys/priv.h>
48 #include <sys/protosw.h>
49 #include <sys/socket.h>
50 #include <sys/socketvar.h>
51 #include <sys/socketops.h>
52 #include <sys/protosw.h>
53 #include <sys/kernel.h>
54 #include <sys/ktr.h>
55 #include <sys/mutex.h>
56 #include <sys/sockio.h>
57 #include <sys/syslog.h>
58 #include <sys/sysctl.h>
59 #include <sys/domain.h>
60 #include <sys/thread.h>
61 #include <sys/serialize.h>
62 #include <sys/bus.h>
63 
64 #include <sys/thread2.h>
65 #include <sys/msgport2.h>
66 #include <sys/mutex2.h>
67 
68 #include <net/if.h>
69 #include <net/if_arp.h>
70 #include <net/if_dl.h>
71 #include <net/if_types.h>
72 #include <net/if_var.h>
73 #include <net/ifq_var.h>
74 #include <net/radix.h>
75 #include <net/route.h>
76 #include <net/if_clone.h>
77 #include <net/netisr.h>
78 #include <net/netmsg2.h>
79 
80 #include <machine/atomic.h>
81 #include <machine/stdarg.h>
82 #include <machine/smp.h>
83 
84 #if defined(INET) || defined(INET6)
85 /*XXX*/
86 #include <netinet/in.h>
87 #include <netinet/in_var.h>
88 #include <netinet/if_ether.h>
89 #ifdef INET6
90 #include <netinet6/in6_var.h>
91 #include <netinet6/in6_ifattach.h>
92 #endif
93 #endif
94 
95 #if defined(COMPAT_43)
96 #include <emulation/43bsd/43bsd_socket.h>
97 #endif /* COMPAT_43 */
98 
/*
 * Netmsg argument block carrying an ifaddr/ifnet pair for ifaddr list
 * manipulation performed via the netisr message ports.
 */
struct netmsg_ifaddr {
	struct netmsg_base base;	/* standard netmsg header */
	struct ifaddr	*ifa;		/* address being operated on */
	struct ifnet	*ifp;		/* owning interface */
	int		tail;		/* NOTE(review): presumably tail-insert flag -- confirm at use site */
};
105 
/*
 * Per-cpu list head for ifaltq staging entries (see ifq_stage_heads[]).
 * Cache-line aligned so each CPU's head lives on its own cache line.
 */
struct ifaltq_stage_head {
	TAILQ_HEAD(, ifaltq_stage)	ifqs_head;
} __cachealign;
109 
110 /*
111  * System initialization
112  */
113 static void	if_attachdomain(void *);
114 static void	if_attachdomain1(struct ifnet *);
115 static int	ifconf(u_long, caddr_t, struct ucred *);
116 static void	ifinit(void *);
117 static void	ifnetinit(void *);
118 static void	if_slowtimo(void *);
119 static void	link_rtrequest(int, struct rtentry *, struct rt_addrinfo *);
120 static int	if_rtdel(struct radix_node *, void *);
121 
122 #ifdef INET6
123 /*
124  * XXX: declare here to avoid to include many inet6 related files..
125  * should be more generalized?
126  */
127 extern void	nd6_setmtu(struct ifnet *);
128 #endif
129 
130 SYSCTL_NODE(_net, PF_LINK, link, CTLFLAG_RW, 0, "Link layers");
131 SYSCTL_NODE(_net_link, 0, generic, CTLFLAG_RW, 0, "Generic link-management");
132 
133 static int ifq_stage_cntmax = 4;
134 TUNABLE_INT("net.link.stage_cntmax", &ifq_stage_cntmax);
135 SYSCTL_INT(_net_link, OID_AUTO, stage_cntmax, CTLFLAG_RW,
136     &ifq_stage_cntmax, 0, "ifq staging packet count max");
137 
138 SYSINIT(interfaces, SI_SUB_PROTO_IF, SI_ORDER_FIRST, ifinit, NULL)
139 /* Must be after netisr_init */
140 SYSINIT(ifnet, SI_SUB_PRE_DRIVERS, SI_ORDER_SECOND, ifnetinit, NULL)
141 
142 static  if_com_alloc_t *if_com_alloc[256];
143 static  if_com_free_t *if_com_free[256];
144 
145 MALLOC_DEFINE(M_IFADDR, "ifaddr", "interface address");
146 MALLOC_DEFINE(M_IFMADDR, "ether_multi", "link-level multicast address");
147 MALLOC_DEFINE(M_IFNET, "ifnet", "interface structure");
148 
149 int			ifqmaxlen = IFQ_MAXLEN;
150 struct ifnethead	ifnet = TAILQ_HEAD_INITIALIZER(ifnet);
151 
152 struct callout		if_slowtimo_timer;
153 
154 int			if_index = 0;
155 struct ifnet		**ifindex2ifnet = NULL;
156 static struct thread	ifnet_threads[MAXCPU];
157 
158 static struct ifaltq_stage_head	ifq_stage_heads[MAXCPU];
159 
160 #define IFQ_KTR_STRING		"ifq=%p"
161 #define IFQ_KTR_ARGS	struct ifaltq *ifq
162 #ifndef KTR_IFQ
163 #define KTR_IFQ			KTR_ALL
164 #endif
165 KTR_INFO_MASTER(ifq);
166 KTR_INFO(KTR_IFQ, ifq, enqueue, 0, IFQ_KTR_STRING, IFQ_KTR_ARGS);
167 KTR_INFO(KTR_IFQ, ifq, dequeue, 1, IFQ_KTR_STRING, IFQ_KTR_ARGS);
168 #define logifq(name, arg)	KTR_LOG(ifq_ ## name, arg)
169 
170 #define IF_START_KTR_STRING	"ifp=%p"
171 #define IF_START_KTR_ARGS	struct ifnet *ifp
172 #ifndef KTR_IF_START
173 #define KTR_IF_START		KTR_ALL
174 #endif
175 KTR_INFO_MASTER(if_start);
176 KTR_INFO(KTR_IF_START, if_start, run, 0,
177 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
178 KTR_INFO(KTR_IF_START, if_start, sched, 1,
179 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
180 KTR_INFO(KTR_IF_START, if_start, avoid, 2,
181 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
182 KTR_INFO(KTR_IF_START, if_start, contend_sched, 3,
183 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
184 KTR_INFO(KTR_IF_START, if_start, chase_sched, 4,
185 	 IF_START_KTR_STRING, IF_START_KTR_ARGS);
186 #define logifstart(name, arg)	KTR_LOG(if_start_ ## name, arg)
187 
188 TAILQ_HEAD(, ifg_group) ifg_head = TAILQ_HEAD_INITIALIZER(ifg_head);
189 
190 /*
191  * Network interface utility routines.
192  *
193  * Routines with ifa_ifwith* names take sockaddr *'s as
194  * parameters.
195  */
196 /* ARGSUSED*/
197 void
198 ifinit(void *dummy)
199 {
200 	struct ifnet *ifp;
201 
202 	callout_init(&if_slowtimo_timer);
203 
204 	crit_enter();
205 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
206 		if (ifp->if_snd.ifq_maxlen == 0) {
207 			if_printf(ifp, "XXX: driver didn't set ifq_maxlen\n");
208 			ifq_set_maxlen(&ifp->if_snd, ifqmaxlen);
209 		}
210 	}
211 	crit_exit();
212 
213 	if_slowtimo(0);
214 }
215 
216 static int
217 if_start_cpuid(struct ifnet *ifp)
218 {
219 	return ifp->if_cpuid;
220 }
221 
#ifdef IFPOLL_ENABLE
/*
 * if_start CPU selector for polling(4)-capable devices: prefer the
 * CPU the device is currently polled on; if polling is not active
 * (if_npoll_cpuid < 0) fall back to the ifnet's bound CPU.
 */
static int
if_start_cpuid_npoll(struct ifnet *ifp)
{
	int cpu = ifp->if_npoll_cpuid;

	return (cpu >= 0 ? cpu : ifp->if_cpuid);
}
#endif
234 
235 static void
236 if_start_ipifunc(void *arg)
237 {
238 	struct ifnet *ifp = arg;
239 	struct lwkt_msg *lmsg = &ifp->if_start_nmsg[mycpuid].lmsg;
240 
241 	crit_enter();
242 	if (lmsg->ms_flags & MSGF_DONE)
243 		lwkt_sendmsg(netisr_portfn(mycpuid), lmsg);
244 	crit_exit();
245 }
246 
247 static __inline void
248 ifq_stage_remove(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
249 {
250 	KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
251 	TAILQ_REMOVE(&head->ifqs_head, stage, ifqs_link);
252 	stage->ifqs_flags &= ~(IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED);
253 	stage->ifqs_cnt = 0;
254 	stage->ifqs_len = 0;
255 }
256 
257 static __inline void
258 ifq_stage_insert(struct ifaltq_stage_head *head, struct ifaltq_stage *stage)
259 {
260 	KKASSERT((stage->ifqs_flags &
261 	    (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
262 	stage->ifqs_flags |= IFQ_STAGE_FLAG_QUED;
263 	TAILQ_INSERT_TAIL(&head->ifqs_head, stage, ifqs_link);
264 }
265 
/*
 * Schedule ifnet.if_start on ifnet's CPU
 *
 * When called from a netisr thread with packet staging enabled (and
 * 'force' not set), delivery is deferred: the current CPU's staging
 * entry is (re)queued on this CPU's staging list and marked SCHED,
 * to be flushed later.  Otherwise the if_start netmsg is delivered to
 * the ifnet's CPU, using an IPI when that CPU is not the current one.
 */
static void
if_start_schedule(struct ifnet *ifp, int force)
{
	int cpu;

	if (!force && curthread->td_type == TD_TYPE_NETISR &&
	    ifq_stage_cntmax > 0) {
		struct ifaltq_stage *stage = &ifp->if_snd.altq_stage[mycpuid];

		/* Reset staging counters and mark the entry scheduled. */
		stage->ifqs_cnt = 0;
		stage->ifqs_len = 0;
		if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
			ifq_stage_insert(&ifq_stage_heads[mycpuid], stage);
		stage->ifqs_flags |= IFQ_STAGE_FLAG_SCHED;
		return;
	}

	cpu = ifp->if_start_cpuid(ifp);
	if (cpu != mycpuid)
		lwkt_send_ipiq(globaldata_find(cpu), if_start_ipifunc, ifp);
	else
		if_start_ipifunc(ifp);
}
292 
/*
 * NOTE:
 * This function will release ifnet.if_start interlock,
 * if ifnet.if_start does not need to be scheduled
 *
 * Returns 1 when if_start must be (re)scheduled by the caller (the
 * interlock, i.e. altq_started, stays held), 0 when no reschedule is
 * needed (the interlock has been released).
 */
static __inline int
if_start_need_schedule(struct ifaltq *ifq, int running)
{
	if (!running || ifq_is_empty(ifq)
#ifdef ALTQ
	    || ifq->altq_tbr != NULL
#endif
	) {
		ALTQ_LOCK(ifq);
		/*
		 * ifnet.if_start interlock is released, if:
		 * 1) Hardware can not take any packets, due to
		 *    o  interface is marked down
		 *    o  hardware queue is full (ifq_is_oactive)
		 *    Under the second situation, hardware interrupt
		 *    or polling(4) will call/schedule ifnet.if_start
		 *    when hardware queue is ready
		 * 2) There is not packet in the ifnet.if_snd.
		 *    Further ifq_dispatch or ifq_handoff will call/
		 *    schedule ifnet.if_start
		 * 3) TBR is used and it does not allow further
		 *    dequeueing.
		 *    TBR callout will call ifnet.if_start
		 */
		if (!running || !ifq_data_ready(ifq)) {
			ifq->altq_started = 0;
			ALTQ_UNLOCK(ifq);
			return 0;
		}
		ALTQ_UNLOCK(ifq);
	}
	return 1;
}
331 
/*
 * Netmsg handler that runs ifnet.if_start on the ifnet's CPU.
 *
 * The message is replied to immediately so that another dispatch can
 * be queued while we transmit.  If the ifnet's CPU binding changed
 * since the message was sent, the dispatch is re-forced onto the new
 * CPU.  While the hardware can still take packets and the send queue
 * is non-empty, the dispatch reschedules itself.
 */
static void
if_start_dispatch(netmsg_t msg)
{
	struct lwkt_msg *lmsg = &msg->base.lmsg;
	struct ifnet *ifp = lmsg->u.ms_resultp;
	struct ifaltq *ifq = &ifp->if_snd;
	int running = 0, need_sched;

	crit_enter();
	lwkt_replymsg(lmsg, 0);	/* reply ASAP */
	crit_exit();

	if (mycpuid != ifp->if_start_cpuid(ifp)) {
		/*
		 * We need to chase the ifnet CPU change.
		 */
		logifstart(chase_sched, ifp);
		if_start_schedule(ifp, 1);
		return;
	}

	ifnet_serialize_tx(ifp);
	if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
		logifstart(run, ifp);
		ifp->if_start(ifp);
		/* Re-check: if_start may have filled the hardware queue. */
		if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
			running = 1;
	}
	need_sched = if_start_need_schedule(ifq, running);
	ifnet_deserialize_tx(ifp);

	if (need_sched) {
		/*
		 * More data need to be transmitted, ifnet.if_start is
		 * scheduled on ifnet's CPU, and we keep going.
		 * NOTE: ifnet.if_start interlock is not released.
		 */
		logifstart(sched, ifp);
		if_start_schedule(ifp, 0);
	}
}
373 
/* Device driver ifnet.if_start helper function */
/*
 * Kick transmission on an interface.  altq_started acts as the
 * if_start interlock: if if_start is already running elsewhere, or
 * there is nothing ready to dequeue, this is a no-op.  The caller
 * must hold the TX serializer (asserted below).
 */
void
if_devstart(struct ifnet *ifp)
{
	struct ifaltq *ifq = &ifp->if_snd;
	int running = 0;

	ASSERT_IFNET_SERIALIZED_TX(ifp);

	ALTQ_LOCK(ifq);
	if (ifq->altq_started || !ifq_data_ready(ifq)) {
		logifstart(avoid, ifp);
		ALTQ_UNLOCK(ifq);
		return;
	}
	ifq->altq_started = 1;
	ALTQ_UNLOCK(ifq);

	logifstart(run, ifp);
	ifp->if_start(ifp);

	/* Can the hardware still take packets? */
	if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
		running = 1;

	if (if_start_need_schedule(ifq, running)) {
		/*
		 * More data need to be transmitted, ifnet.if_start is
		 * scheduled on ifnet's CPU, and we keep going.
		 * NOTE: ifnet.if_start interlock is not released.
		 */
		logifstart(sched, ifp);
		if_start_schedule(ifp, 0);
	}
}
408 
/*
 * Default if_serialize method: enter the interface's serializer.
 * Installed by if_attach() when the driver supplies no serialize
 * methods of its own.
 */
static void
if_default_serialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
{
	lwkt_serialize_enter(ifp->if_serializer);
}
414 
/*
 * Default if_deserialize method: exit the interface's serializer.
 */
static void
if_default_deserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
{
	lwkt_serialize_exit(ifp->if_serializer);
}
420 
/*
 * Default if_tryserialize method: attempt to enter the interface's
 * serializer without blocking; returns non-zero on success.
 */
static int
if_default_tryserialize(struct ifnet *ifp, enum ifnet_serialize slz __unused)
{
	return lwkt_serialize_try(ifp->if_serializer);
}
426 
#ifdef INVARIANTS
/*
 * Default if_serialize_assert method: verify that the interface's
 * default serializer is (or is not) held, as requested.
 */
static void
if_default_serialize_assert(struct ifnet *ifp,
			    enum ifnet_serialize slz __unused,
			    boolean_t serialized)
{
	if (serialized)
		ASSERT_SERIALIZED(ifp->if_serializer);
	else
		ASSERT_NOT_SERIALIZED(ifp->if_serializer);
}
#endif
439 
/*
 * Attach an interface to the list of "active" interfaces.
 *
 * The serializer is optional.  If non-NULL access to the interface
 * may be MPSAFE.
 *
 * Sets up the interface's serialize methods, per-cpu if_start
 * messages, per-cpu address list heads, its AF_LINK address/netmask
 * pair, and its send queue, then announces the arrival.  The ioctl
 * mutex is held across the whole setup.
 */
void
if_attach(struct ifnet *ifp, lwkt_serialize_t serializer)
{
	unsigned socksize, ifasize;
	int namelen, masklen;
	struct sockaddr_dl *sdl;
	struct ifaddr *ifa;
	struct ifaltq *ifq;
	int i;

	/* Current allocated size of ifindex2ifnet[]; doubled as needed. */
	static int if_indexlim = 8;

	if (ifp->if_serialize != NULL) {
		KASSERT(ifp->if_deserialize != NULL &&
			ifp->if_tryserialize != NULL &&
			ifp->if_serialize_assert != NULL,
			("serialize functions are partially setup"));

		/*
		 * If the device supplies serialize functions,
		 * then clear if_serializer to catch any invalid
		 * usage of this field.
		 */
		KASSERT(serializer == NULL,
			("both serialize functions and default serializer "
			 "are supplied"));
		ifp->if_serializer = NULL;
	} else {
		KASSERT(ifp->if_deserialize == NULL &&
			ifp->if_tryserialize == NULL &&
			ifp->if_serialize_assert == NULL,
			("serialize functions are partially setup"));
		ifp->if_serialize = if_default_serialize;
		ifp->if_deserialize = if_default_deserialize;
		ifp->if_tryserialize = if_default_tryserialize;
#ifdef INVARIANTS
		ifp->if_serialize_assert = if_default_serialize_assert;
#endif

		/*
		 * The serializer can be passed in from the device,
		 * allowing the same serializer to be used for both
		 * the interrupt interlock and the device queue.
		 * If not specified, the netif structure will use an
		 * embedded serializer.
		 */
		if (serializer == NULL) {
			serializer = &ifp->if_default_serializer;
			lwkt_serialize_init(serializer);
		}
		ifp->if_serializer = serializer;
	}

	ifp->if_start_cpuid = if_start_cpuid;
	ifp->if_cpuid = 0;

#ifdef IFPOLL_ENABLE
	/* Device is not in polling mode by default */
	ifp->if_npoll_cpuid = -1;
	if (ifp->if_npoll != NULL)
		ifp->if_start_cpuid = if_start_cpuid_npoll;
#endif

	/* Per-cpu if_start dispatch messages, each pointing back at ifp. */
	ifp->if_start_nmsg = kmalloc(ncpus * sizeof(*ifp->if_start_nmsg),
				     M_LWKTMSG, M_WAITOK);
	for (i = 0; i < ncpus; ++i) {
		netmsg_init(&ifp->if_start_nmsg[i], NULL, &netisr_adone_rport,
			    0, if_start_dispatch);
		ifp->if_start_nmsg[i].lmsg.u.ms_resultp = ifp;
	}

	/* Hold the ioctl mutex until the interface is fully set up. */
	mtx_init(&ifp->if_ioctl_mtx);
	mtx_lock(&ifp->if_ioctl_mtx);

	TAILQ_INSERT_TAIL(&ifnet, ifp, if_link);
	ifp->if_index = ++if_index;

	/*
	 * XXX -
	 * The old code would work if the interface passed a pre-existing
	 * chain of ifaddrs to this code.  We don't trust our callers to
	 * properly initialize the tailq, however, so we no longer allow
	 * this unlikely case.
	 */
	ifp->if_addrheads = kmalloc(ncpus * sizeof(struct ifaddrhead),
				    M_IFADDR, M_WAITOK | M_ZERO);
	for (i = 0; i < ncpus; ++i)
		TAILQ_INIT(&ifp->if_addrheads[i]);

	TAILQ_INIT(&ifp->if_prefixhead);
	TAILQ_INIT(&ifp->if_multiaddrs);
	TAILQ_INIT(&ifp->if_groups);
	getmicrotime(&ifp->if_lastchange);
	if (ifindex2ifnet == NULL || if_index >= if_indexlim) {
		unsigned int n;
		struct ifnet **q;

		if_indexlim <<= 1;

		/* grow ifindex2ifnet */
		n = if_indexlim * sizeof(*q);
		q = kmalloc(n, M_IFADDR, M_WAITOK | M_ZERO);
		if (ifindex2ifnet) {
			/* n/2 is the old array size since we just doubled. */
			bcopy(ifindex2ifnet, q, n/2);
			kfree(ifindex2ifnet, M_IFADDR);
		}
		ifindex2ifnet = q;
	}

	ifindex2ifnet[if_index] = ifp;

	/*
	 * create a Link Level name for this device
	 *
	 * A single ifaddr allocation carries the AF_LINK address and,
	 * directly after it, the matching netmask (0xff over the name
	 * bytes, written below).
	 */
	namelen = strlen(ifp->if_xname);
	masklen = offsetof(struct sockaddr_dl, sdl_data[0]) + namelen;
	socksize = masklen + ifp->if_addrlen;
#define ROUNDUP(a) (1 + (((a) - 1) | (sizeof(long) - 1)))
	if (socksize < sizeof(*sdl))
		socksize = sizeof(*sdl);
	socksize = ROUNDUP(socksize);
#undef ROUNDUP
	ifasize = sizeof(struct ifaddr) + 2 * socksize;
	ifa = ifa_create(ifasize, M_WAITOK);
	sdl = (struct sockaddr_dl *)(ifa + 1);
	sdl->sdl_len = socksize;
	sdl->sdl_family = AF_LINK;
	bcopy(ifp->if_xname, sdl->sdl_data, namelen);
	sdl->sdl_nlen = namelen;
	sdl->sdl_index = ifp->if_index;
	sdl->sdl_type = ifp->if_type;
	ifp->if_lladdr = ifa;
	ifa->ifa_ifp = ifp;
	ifa->ifa_rtrequest = link_rtrequest;
	ifa->ifa_addr = (struct sockaddr *)sdl;
	sdl = (struct sockaddr_dl *)(socksize + (caddr_t)sdl);
	ifa->ifa_netmask = (struct sockaddr *)sdl;
	sdl->sdl_len = masklen;
	while (namelen != 0)
		sdl->sdl_data[--namelen] = 0xff;
	ifa_iflink(ifa, ifp, 0 /* Insert head */);

	EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);
	devctl_notify("IFNET", ifp->if_xname, "ATTACH", NULL);

	/* Initialize the send queue; classic FIFO discipline by default. */
	ifq = &ifp->if_snd;
	ifq->altq_type = 0;
	ifq->altq_disc = NULL;
	ifq->altq_flags &= ALTQF_CANTCHANGE;
	ifq->altq_tbr = NULL;
	ifq->altq_ifp = ifp;
	ifq->altq_started = 0;
	ifq->altq_prepended = NULL;
	ALTQ_LOCK_INIT(ifq);
	ifq_set_classic(ifq);

	/* Per-cpu packet staging state for this send queue. */
	ifq->altq_stage =
	    kmalloc_cachealign(ncpus * sizeof(struct ifaltq_stage),
	    M_DEVBUF, M_WAITOK | M_ZERO);
	for (i = 0; i < ncpus; ++i)
		ifq->altq_stage[i].ifqs_altq = ifq;

	if (!SLIST_EMPTY(&domains))
		if_attachdomain1(ifp);

	/* Announce the interface. */
	rt_ifannouncemsg(ifp, IFAN_ARRIVAL);

	mtx_unlock(&ifp->if_ioctl_mtx);
}
616 
617 static void
618 if_attachdomain(void *dummy)
619 {
620 	struct ifnet *ifp;
621 
622 	crit_enter();
623 	TAILQ_FOREACH(ifp, &ifnet, if_list)
624 		if_attachdomain1(ifp);
625 	crit_exit();
626 }
627 SYSINIT(domainifattach, SI_SUB_PROTO_IFATTACHDOMAIN, SI_ORDER_FIRST,
628 	if_attachdomain, NULL);
629 
630 static void
631 if_attachdomain1(struct ifnet *ifp)
632 {
633 	struct domain *dp;
634 
635 	crit_enter();
636 
637 	/* address family dependent data region */
638 	bzero(ifp->if_afdata, sizeof(ifp->if_afdata));
639 	SLIST_FOREACH(dp, &domains, dom_next)
640 		if (dp->dom_ifattach)
641 			ifp->if_afdata[dp->dom_family] =
642 				(*dp->dom_ifattach)(ifp);
643 	crit_exit();
644 }
645 
/*
 * Purge all addresses whose type is _not_ AF_LINK
 *
 * INET and INET6 addresses are torn down through their protocol-
 * specific paths (in_control()/in6_purgeaddr()) so protocol state is
 * cleaned up as well; anything else gets the generic unlink/destroy.
 */
void
if_purgeaddrs_nolink(struct ifnet *ifp)
{
	struct ifaddr_container *ifac, *next;

	TAILQ_FOREACH_MUTABLE(ifac, &ifp->if_addrheads[mycpuid],
			      ifa_link, next) {
		struct ifaddr *ifa = ifac->ifa;

		/* Leave link ifaddr as it is */
		if (ifa->ifa_addr->sa_family == AF_LINK)
			continue;
#ifdef INET
		/* XXX: Ugly!! ad hoc just for INET */
		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET) {
			struct ifaliasreq ifr;
#ifdef IFADDR_DEBUG_VERBOSE
			int i;

			kprintf("purge in4 addr %p: ", ifa);
			for (i = 0; i < ncpus; ++i)
				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
			kprintf("\n");
#endif

			/* Delete via in_control() so inet state is purged. */
			bzero(&ifr, sizeof ifr);
			ifr.ifra_addr = *ifa->ifa_addr;
			if (ifa->ifa_dstaddr)
				ifr.ifra_broadaddr = *ifa->ifa_dstaddr;
			if (in_control(NULL, SIOCDIFADDR, (caddr_t)&ifr, ifp,
				       NULL) == 0)
				continue;
			/* in_control() failed; fall back to generic unlink. */
		}
#endif /* INET */
#ifdef INET6
		if (ifa->ifa_addr && ifa->ifa_addr->sa_family == AF_INET6) {
#ifdef IFADDR_DEBUG_VERBOSE
			int i;

			kprintf("purge in6 addr %p: ", ifa);
			for (i = 0; i < ncpus; ++i)
				kprintf("%d ", ifa->ifa_containers[i].ifa_refcnt);
			kprintf("\n");
#endif

			in6_purgeaddr(ifa);
			/* ifp_addrhead is already updated */
			continue;
		}
#endif /* INET6 */
		/* Generic teardown for all other address families. */
		ifa_ifunlink(ifa, ifp);
		ifa_destroy(ifa);
	}
}
703 
704 static void
705 ifq_stage_detach_handler(netmsg_t nmsg)
706 {
707 	struct ifaltq *ifq = nmsg->lmsg.u.ms_resultp;
708 	struct ifaltq_stage *stage = &ifq->altq_stage[mycpuid];
709 
710 	if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED)
711 		ifq_stage_remove(&ifq_stage_heads[mycpuid], stage);
712 	lwkt_replymsg(&nmsg->lmsg, 0);
713 }
714 
715 static void
716 ifq_stage_detach(struct ifaltq *ifq)
717 {
718 	struct netmsg_base base;
719 	int cpu;
720 
721 	netmsg_init(&base, NULL, &curthread->td_msgport, 0,
722 	    ifq_stage_detach_handler);
723 	base.lmsg.u.ms_resultp = ifq;
724 
725 	for (cpu = 0; cpu < ncpus; ++cpu)
726 		lwkt_domsg(netisr_portfn(cpu), &base.lmsg, 0);
727 }
728 
/*
 * Detach an interface, removing it from the
 * list of "active" interfaces.
 *
 * Tears down in order: polling/ALTQ state, addresses (non-link then
 * the AF_LINK ifaddr), per-protocol structures, routes on every CPU's
 * route table, per-domain data, the ifindex mapping, and finally the
 * per-cpu allocations made by if_attach().
 */
void
if_detach(struct ifnet *ifp)
{
	struct radix_node_head	*rnh;
	int i;
	int cpu, origcpu;
	struct domain *dp;

	EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);

	/*
	 * Remove routes and flush queues.
	 */
	crit_enter();
#ifdef IFPOLL_ENABLE
	if (ifp->if_flags & IFF_NPOLLING)
		ifpoll_deregister(ifp);
#endif
	if_down(ifp);

#ifdef ALTQ
	if (ifq_is_enabled(&ifp->if_snd))
		altq_disable(&ifp->if_snd);
	if (ifq_is_attached(&ifp->if_snd))
		altq_detach(&ifp->if_snd);
#endif

	/*
	 * Clean up all addresses.
	 */
	ifp->if_lladdr = NULL;

	if_purgeaddrs_nolink(ifp);
	if (!TAILQ_EMPTY(&ifp->if_addrheads[mycpuid])) {
		struct ifaddr *ifa;

		/* Only the AF_LINK ifaddr may remain at this point. */
		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
		KASSERT(ifa->ifa_addr->sa_family == AF_LINK,
			("non-link ifaddr is left on if_addrheads"));

		ifa_ifunlink(ifa, ifp);
		ifa_destroy(ifa);
		KASSERT(TAILQ_EMPTY(&ifp->if_addrheads[mycpuid]),
			("there are still ifaddrs left on if_addrheads"));
	}

#ifdef INET
	/*
	 * Remove all IPv4 kernel structures related to ifp.
	 */
	in_ifdetach(ifp);
#endif

#ifdef INET6
	/*
	 * Remove all IPv6 kernel structs related to ifp.  This should be done
	 * before removing routing entries below, since IPv6 interface direct
	 * routes are expected to be removed by the IPv6-specific kernel API.
	 * Otherwise, the kernel will detect some inconsistency and bark it.
	 */
	in6_ifdetach(ifp);
#endif

	/*
	 * Delete all remaining routes using this interface
	 * Unfortuneatly the only way to do this is to slog through
	 * the entire routing table looking for routes which point
	 * to this interface...oh well...
	 *
	 * The route tables are replicated per cpu (rt_tables[cpu]),
	 * so migrate onto each cpu in turn and walk its copy.
	 */
	origcpu = mycpuid;
	for (cpu = 0; cpu < ncpus; cpu++) {
		lwkt_migratecpu(cpu);
		for (i = 1; i <= AF_MAX; i++) {
			if ((rnh = rt_tables[cpu][i]) == NULL)
				continue;
			rnh->rnh_walktree(rnh, if_rtdel, ifp);
		}
	}
	lwkt_migratecpu(origcpu);

	/* Announce that the interface is gone. */
	rt_ifannouncemsg(ifp, IFAN_DEPARTURE);
	devctl_notify("IFNET", ifp->if_xname, "DETACH", NULL);

	/* Release per-domain (address family) private data. */
	SLIST_FOREACH(dp, &domains, dom_next)
		if (dp->dom_ifdetach && ifp->if_afdata[dp->dom_family])
			(*dp->dom_ifdetach)(ifp,
				ifp->if_afdata[dp->dom_family]);

	/*
	 * Remove interface from ifindex2ifp[] and maybe decrement if_index.
	 */
	ifindex2ifnet[ifp->if_index] = NULL;
	while (if_index > 0 && ifindex2ifnet[if_index] == NULL)
		if_index--;

	TAILQ_REMOVE(&ifnet, ifp, if_link);
	kfree(ifp->if_addrheads, M_IFADDR);

	/* Wait out in-flight IPIs, then tear down per-cpu staging state. */
	lwkt_synchronize_ipiqs("if_detach");
	ifq_stage_detach(&ifp->if_snd);

	kfree(ifp->if_start_nmsg, M_LWKTMSG);
	kfree(ifp->if_snd.altq_stage, M_DEVBUF);
	crit_exit();
}
839 
840 /*
841  * Create interface group without members
842  */
843 struct ifg_group *
844 if_creategroup(const char *groupname)
845 {
846         struct ifg_group        *ifg = NULL;
847 
848         if ((ifg = (struct ifg_group *)kmalloc(sizeof(struct ifg_group),
849             M_TEMP, M_NOWAIT)) == NULL)
850                 return (NULL);
851 
852         strlcpy(ifg->ifg_group, groupname, sizeof(ifg->ifg_group));
853         ifg->ifg_refcnt = 0;
854         ifg->ifg_carp_demoted = 0;
855         TAILQ_INIT(&ifg->ifg_members);
856 #if NPF > 0
857         pfi_attach_ifgroup(ifg);
858 #endif
859         TAILQ_INSERT_TAIL(&ifg_head, ifg, ifg_next);
860 
861         return (ifg);
862 }
863 
864 /*
865  * Add a group to an interface
866  */
867 int
868 if_addgroup(struct ifnet *ifp, const char *groupname)
869 {
870 	struct ifg_list		*ifgl;
871 	struct ifg_group	*ifg = NULL;
872 	struct ifg_member	*ifgm;
873 
874 	if (groupname[0] && groupname[strlen(groupname) - 1] >= '0' &&
875 	    groupname[strlen(groupname) - 1] <= '9')
876 		return (EINVAL);
877 
878 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
879 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
880 			return (EEXIST);
881 
882 	if ((ifgl = kmalloc(sizeof(*ifgl), M_TEMP, M_NOWAIT)) == NULL)
883 		return (ENOMEM);
884 
885 	if ((ifgm = kmalloc(sizeof(*ifgm), M_TEMP, M_NOWAIT)) == NULL) {
886 		kfree(ifgl, M_TEMP);
887 		return (ENOMEM);
888 	}
889 
890 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
891 		if (!strcmp(ifg->ifg_group, groupname))
892 			break;
893 
894 	if (ifg == NULL && (ifg = if_creategroup(groupname)) == NULL) {
895 		kfree(ifgl, M_TEMP);
896 		kfree(ifgm, M_TEMP);
897 		return (ENOMEM);
898 	}
899 
900 	ifg->ifg_refcnt++;
901 	ifgl->ifgl_group = ifg;
902 	ifgm->ifgm_ifp = ifp;
903 
904 	TAILQ_INSERT_TAIL(&ifg->ifg_members, ifgm, ifgm_next);
905 	TAILQ_INSERT_TAIL(&ifp->if_groups, ifgl, ifgl_next);
906 
907 #if NPF > 0
908 	pfi_group_change(groupname);
909 #endif
910 
911 	return (0);
912 }
913 
914 /*
915  * Remove a group from an interface
916  */
917 int
918 if_delgroup(struct ifnet *ifp, const char *groupname)
919 {
920 	struct ifg_list		*ifgl;
921 	struct ifg_member	*ifgm;
922 
923 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
924 		if (!strcmp(ifgl->ifgl_group->ifg_group, groupname))
925 			break;
926 	if (ifgl == NULL)
927 		return (ENOENT);
928 
929 	TAILQ_REMOVE(&ifp->if_groups, ifgl, ifgl_next);
930 
931 	TAILQ_FOREACH(ifgm, &ifgl->ifgl_group->ifg_members, ifgm_next)
932 		if (ifgm->ifgm_ifp == ifp)
933 			break;
934 
935 	if (ifgm != NULL) {
936 		TAILQ_REMOVE(&ifgl->ifgl_group->ifg_members, ifgm, ifgm_next);
937 		kfree(ifgm, M_TEMP);
938 	}
939 
940 	if (--ifgl->ifgl_group->ifg_refcnt == 0) {
941 		TAILQ_REMOVE(&ifg_head, ifgl->ifgl_group, ifg_next);
942 #if NPF > 0
943 		pfi_detach_ifgroup(ifgl->ifgl_group);
944 #endif
945 		kfree(ifgl->ifgl_group, M_TEMP);
946 	}
947 
948 	kfree(ifgl, M_TEMP);
949 
950 #if NPF > 0
951 	pfi_group_change(groupname);
952 #endif
953 
954 	return (0);
955 }
956 
957 /*
958  * Stores all groups from an interface in memory pointed
959  * to by data
960  */
961 int
962 if_getgroup(caddr_t data, struct ifnet *ifp)
963 {
964 	int			 len, error;
965 	struct ifg_list		*ifgl;
966 	struct ifg_req		 ifgrq, *ifgp;
967 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
968 
969 	if (ifgr->ifgr_len == 0) {
970 		TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next)
971 			ifgr->ifgr_len += sizeof(struct ifg_req);
972 		return (0);
973 	}
974 
975 	len = ifgr->ifgr_len;
976 	ifgp = ifgr->ifgr_groups;
977 	TAILQ_FOREACH(ifgl, &ifp->if_groups, ifgl_next) {
978 		if (len < sizeof(ifgrq))
979 			return (EINVAL);
980 		bzero(&ifgrq, sizeof ifgrq);
981 		strlcpy(ifgrq.ifgrq_group, ifgl->ifgl_group->ifg_group,
982 		    sizeof(ifgrq.ifgrq_group));
983 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
984 		    sizeof(struct ifg_req))))
985 			return (error);
986 		len -= sizeof(ifgrq);
987 		ifgp++;
988 	}
989 
990 	return (0);
991 }
992 
993 /*
994  * Stores all members of a group in memory pointed to by data
995  */
996 int
997 if_getgroupmembers(caddr_t data)
998 {
999 	struct ifgroupreq	*ifgr = (struct ifgroupreq *)data;
1000 	struct ifg_group	*ifg;
1001 	struct ifg_member	*ifgm;
1002 	struct ifg_req		 ifgrq, *ifgp;
1003 	int			 len, error;
1004 
1005 	TAILQ_FOREACH(ifg, &ifg_head, ifg_next)
1006 		if (!strcmp(ifg->ifg_group, ifgr->ifgr_name))
1007 			break;
1008 	if (ifg == NULL)
1009 		return (ENOENT);
1010 
1011 	if (ifgr->ifgr_len == 0) {
1012 		TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next)
1013 			ifgr->ifgr_len += sizeof(ifgrq);
1014 		return (0);
1015 	}
1016 
1017 	len = ifgr->ifgr_len;
1018 	ifgp = ifgr->ifgr_groups;
1019 	TAILQ_FOREACH(ifgm, &ifg->ifg_members, ifgm_next) {
1020 		if (len < sizeof(ifgrq))
1021 			return (EINVAL);
1022 		bzero(&ifgrq, sizeof ifgrq);
1023 		strlcpy(ifgrq.ifgrq_member, ifgm->ifgm_ifp->if_xname,
1024 		    sizeof(ifgrq.ifgrq_member));
1025 		if ((error = copyout((caddr_t)&ifgrq, (caddr_t)ifgp,
1026 		    sizeof(struct ifg_req))))
1027 			return (error);
1028 		len -= sizeof(ifgrq);
1029 		ifgp++;
1030 	}
1031 
1032 	return (0);
1033 }
1034 
1035 /*
1036  * Delete Routes for a Network Interface
1037  *
1038  * Called for each routing entry via the rnh->rnh_walktree() call above
1039  * to delete all route entries referencing a detaching network interface.
1040  *
1041  * Arguments:
1042  *	rn	pointer to node in the routing table
1043  *	arg	argument passed to rnh->rnh_walktree() - detaching interface
1044  *
1045  * Returns:
1046  *	0	successful
1047  *	errno	failed - reason indicated
1048  *
1049  */
1050 static int
1051 if_rtdel(struct radix_node *rn, void *arg)
1052 {
1053 	struct rtentry	*rt = (struct rtentry *)rn;
1054 	struct ifnet	*ifp = arg;
1055 	int		err;
1056 
1057 	if (rt->rt_ifp == ifp) {
1058 
1059 		/*
1060 		 * Protect (sorta) against walktree recursion problems
1061 		 * with cloned routes
1062 		 */
1063 		if (!(rt->rt_flags & RTF_UP))
1064 			return (0);
1065 
1066 		err = rtrequest(RTM_DELETE, rt_key(rt), rt->rt_gateway,
1067 				rt_mask(rt), rt->rt_flags,
1068 				NULL);
1069 		if (err) {
1070 			log(LOG_WARNING, "if_rtdel: error %d\n", err);
1071 		}
1072 	}
1073 
1074 	return (0);
1075 }
1076 
1077 /*
1078  * Locate an interface based on a complete address.
1079  */
1080 struct ifaddr *
1081 ifa_ifwithaddr(struct sockaddr *addr)
1082 {
1083 	struct ifnet *ifp;
1084 
1085 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1086 		struct ifaddr_container *ifac;
1087 
1088 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1089 			struct ifaddr *ifa = ifac->ifa;
1090 
1091 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1092 				continue;
1093 			if (sa_equal(addr, ifa->ifa_addr))
1094 				return (ifa);
1095 			if ((ifp->if_flags & IFF_BROADCAST) &&
1096 			    ifa->ifa_broadaddr &&
1097 			    /* IPv6 doesn't have broadcast */
1098 			    ifa->ifa_broadaddr->sa_len != 0 &&
1099 			    sa_equal(ifa->ifa_broadaddr, addr))
1100 				return (ifa);
1101 		}
1102 	}
1103 	return (NULL);
1104 }
1105 /*
1106  * Locate the point to point interface with a given destination address.
1107  */
1108 struct ifaddr *
1109 ifa_ifwithdstaddr(struct sockaddr *addr)
1110 {
1111 	struct ifnet *ifp;
1112 
1113 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1114 		struct ifaddr_container *ifac;
1115 
1116 		if (!(ifp->if_flags & IFF_POINTOPOINT))
1117 			continue;
1118 
1119 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1120 			struct ifaddr *ifa = ifac->ifa;
1121 
1122 			if (ifa->ifa_addr->sa_family != addr->sa_family)
1123 				continue;
1124 			if (ifa->ifa_dstaddr &&
1125 			    sa_equal(addr, ifa->ifa_dstaddr))
1126 				return (ifa);
1127 		}
1128 	}
1129 	return (NULL);
1130 }
1131 
1132 /*
1133  * Find an interface on a specific network.  If many, choice
1134  * is most specific found.
1135  */
struct ifaddr *
ifa_ifwithnet(struct sockaddr *addr)
{
	struct ifnet *ifp;
	struct ifaddr *ifa_maybe = NULL;	/* best (most specific) match so far */
	u_int af = addr->sa_family;
	char *addr_data = addr->sa_data, *cplim;

	/*
	 * AF_LINK addresses can be looked up directly by their index number,
	 * so do that if we can.
	 */
	if (af == AF_LINK) {
		struct sockaddr_dl *sdl = (struct sockaddr_dl *)addr;

		if (sdl->sdl_index && sdl->sdl_index <= if_index)
			return (ifindex2ifnet[sdl->sdl_index]->if_lladdr);
	}

	/*
	 * Scan through each interface, looking for ones that have
	 * addresses in this address family.
	 */
	TAILQ_FOREACH(ifp, &ifnet, if_link) {
		struct ifaddr_container *ifac;

		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ifa = ifac->ifa;
			char *cp, *cp2, *cp3;

			/*
			 * NB: the "next" label sits on this continue so
			 * the byte-compare loop below can bail out to the
			 * following address with a goto.
			 */
			if (ifa->ifa_addr->sa_family != af)
next:				continue;
			if (af == AF_INET && ifp->if_flags & IFF_POINTOPOINT) {
				/*
				 * This is a bit broken as it doesn't
				 * take into account that the remote end may
				 * be a single node in the network we are
				 * looking for.
				 * The trouble is that we don't know the
				 * netmask for the remote end.
				 */
				if (ifa->ifa_dstaddr != NULL &&
				    sa_equal(addr, ifa->ifa_dstaddr))
					return (ifa);
			} else {
				/*
				 * if we have a special address handler,
				 * then use it instead of the generic one.
				 */
				if (ifa->ifa_claim_addr) {
					if ((*ifa->ifa_claim_addr)(ifa, addr)) {
						return (ifa);
					} else {
						continue;
					}
				}

				/*
				 * Scan all the bits in the ifa's address.
				 * If a bit disagrees with what we are
				 * looking for, mask it with the netmask
				 * to see if it really matters.
				 * (A byte at a time)
				 */
				if (ifa->ifa_netmask == 0)
					continue;
				cp = addr_data;
				cp2 = ifa->ifa_addr->sa_data;
				cp3 = ifa->ifa_netmask->sa_data;
				cplim = ifa->ifa_netmask->sa_len +
					(char *)ifa->ifa_netmask;
				while (cp3 < cplim)
					if ((*cp++ ^ *cp2++) & *cp3++)
						goto next; /* next address! */
				/*
				 * If the netmask of what we just found
				 * is more specific than what we had before
				 * (if we had one) then remember the new one
				 * before continuing to search
				 * for an even better one.
				 */
				if (ifa_maybe == NULL ||
				    rn_refines((char *)ifa->ifa_netmask,
					       (char *)ifa_maybe->ifa_netmask))
					ifa_maybe = ifa;
			}
		}
	}
	return (ifa_maybe);
}
1226 
1227 /*
1228  * Find an interface address specific to an interface best matching
1229  * a given address.
1230  */
1231 struct ifaddr *
1232 ifaof_ifpforaddr(struct sockaddr *addr, struct ifnet *ifp)
1233 {
1234 	struct ifaddr_container *ifac;
1235 	char *cp, *cp2, *cp3;
1236 	char *cplim;
1237 	struct ifaddr *ifa_maybe = NULL;
1238 	u_int af = addr->sa_family;
1239 
1240 	if (af >= AF_MAX)
1241 		return (0);
1242 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1243 		struct ifaddr *ifa = ifac->ifa;
1244 
1245 		if (ifa->ifa_addr->sa_family != af)
1246 			continue;
1247 		if (ifa_maybe == NULL)
1248 			ifa_maybe = ifa;
1249 		if (ifa->ifa_netmask == NULL) {
1250 			if (sa_equal(addr, ifa->ifa_addr) ||
1251 			    (ifa->ifa_dstaddr != NULL &&
1252 			     sa_equal(addr, ifa->ifa_dstaddr)))
1253 				return (ifa);
1254 			continue;
1255 		}
1256 		if (ifp->if_flags & IFF_POINTOPOINT) {
1257 			if (sa_equal(addr, ifa->ifa_dstaddr))
1258 				return (ifa);
1259 		} else {
1260 			cp = addr->sa_data;
1261 			cp2 = ifa->ifa_addr->sa_data;
1262 			cp3 = ifa->ifa_netmask->sa_data;
1263 			cplim = ifa->ifa_netmask->sa_len + (char *)ifa->ifa_netmask;
1264 			for (; cp3 < cplim; cp3++)
1265 				if ((*cp++ ^ *cp2++) & *cp3)
1266 					break;
1267 			if (cp3 == cplim)
1268 				return (ifa);
1269 		}
1270 	}
1271 	return (ifa_maybe);
1272 }
1273 
1274 /*
1275  * Default action when installing a route with a Link Level gateway.
1276  * Lookup an appropriate real ifa to point to.
1277  * This should be moved to /sys/net/link.c eventually.
1278  */
1279 static void
1280 link_rtrequest(int cmd, struct rtentry *rt, struct rt_addrinfo *info)
1281 {
1282 	struct ifaddr *ifa;
1283 	struct sockaddr *dst;
1284 	struct ifnet *ifp;
1285 
1286 	if (cmd != RTM_ADD || (ifa = rt->rt_ifa) == NULL ||
1287 	    (ifp = ifa->ifa_ifp) == NULL || (dst = rt_key(rt)) == NULL)
1288 		return;
1289 	ifa = ifaof_ifpforaddr(dst, ifp);
1290 	if (ifa != NULL) {
1291 		IFAFREE(rt->rt_ifa);
1292 		IFAREF(ifa);
1293 		rt->rt_ifa = ifa;
1294 		if (ifa->ifa_rtrequest && ifa->ifa_rtrequest != link_rtrequest)
1295 			ifa->ifa_rtrequest(cmd, rt, info);
1296 	}
1297 }
1298 
1299 /*
1300  * Mark an interface down and notify protocols of
1301  * the transition.
 * NOTE: must be called at splnet or equivalent.
1303  */
1304 void
1305 if_unroute(struct ifnet *ifp, int flag, int fam)
1306 {
1307 	struct ifaddr_container *ifac;
1308 
1309 	ifp->if_flags &= ~flag;
1310 	getmicrotime(&ifp->if_lastchange);
1311 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1312 		struct ifaddr *ifa = ifac->ifa;
1313 
1314 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1315 			kpfctlinput(PRC_IFDOWN, ifa->ifa_addr);
1316 	}
1317 	ifq_purge_all(&ifp->if_snd);
1318 	rt_ifmsg(ifp);
1319 }
1320 
1321 /*
1322  * Mark an interface up and notify protocols of
1323  * the transition.
 * NOTE: must be called at splnet or equivalent.
1325  */
1326 void
1327 if_route(struct ifnet *ifp, int flag, int fam)
1328 {
1329 	struct ifaddr_container *ifac;
1330 
1331 	ifq_purge_all(&ifp->if_snd);
1332 	ifp->if_flags |= flag;
1333 	getmicrotime(&ifp->if_lastchange);
1334 	TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1335 		struct ifaddr *ifa = ifac->ifa;
1336 
1337 		if (fam == PF_UNSPEC || (fam == ifa->ifa_addr->sa_family))
1338 			kpfctlinput(PRC_IFUP, ifa->ifa_addr);
1339 	}
1340 	rt_ifmsg(ifp);
1341 #ifdef INET6
1342 	in6_if_up(ifp);
1343 #endif
1344 }
1345 
1346 /*
1347  * Mark an interface down and notify protocols of the transition.  An
1348  * interface going down is also considered to be a synchronizing event.
1349  * We must ensure that all packet processing related to the interface
1350  * has completed before we return so e.g. the caller can free the ifnet
1351  * structure that the mbufs may be referencing.
1352  *
 * NOTE: must be called at splnet or equivalent.
1354  */
void
if_down(struct ifnet *ifp)
{
	if_unroute(ifp, IFF_UP, AF_UNSPEC);
	/* Synchronize: wait until in-flight protocol work has drained. */
	netmsg_service_sync();
}
1361 
1362 /*
1363  * Mark an interface up and notify protocols of
1364  * the transition.
 * NOTE: must be called at splnet or equivalent.
1366  */
void
if_up(struct ifnet *ifp)
{
	/* IFF_UP for all address families; if_route() does the notification. */
	if_route(ifp, IFF_UP, AF_UNSPEC);
}
1372 
1373 /*
1374  * Process a link state change.
1375  * NOTE: must be called at splsoftnet or equivalent.
1376  */
1377 void
1378 if_link_state_change(struct ifnet *ifp)
1379 {
1380 	int link_state = ifp->if_link_state;
1381 
1382 	rt_ifmsg(ifp);
1383 	devctl_notify("IFNET", ifp->if_xname,
1384 	    (link_state == LINK_STATE_UP) ? "LINK_UP" : "LINK_DOWN", NULL);
1385 }
1386 
1387 /*
1388  * Handle interface watchdog timer routines.  Called
1389  * from softclock, we decrement timers (if set) and
1390  * call the appropriate interface routine on expiration.
1391  */
static void
if_slowtimo(void *arg)
{
	struct ifnet *ifp;

	crit_enter();

	TAILQ_FOREACH(ifp, &ifnet, if_link) {
		/*
		 * if_timer == 0 means the watchdog is disarmed; otherwise
		 * decrement and only act once the counter reaches zero.
		 */
		if (ifp->if_timer == 0 || --ifp->if_timer)
			continue;
		if (ifp->if_watchdog) {
			if (ifnet_tryserialize_all(ifp)) {
				(*ifp->if_watchdog)(ifp);
				ifnet_deserialize_all(ifp);
			} else {
				/* try again next timeout */
				++ifp->if_timer;
			}
		}
	}

	crit_exit();

	/* Re-arm; this callout fires IFNET_SLOWHZ times per second. */
	callout_reset(&if_slowtimo_timer, hz / IFNET_SLOWHZ, if_slowtimo, NULL);
}
1417 
1418 /*
1419  * Map interface name to
1420  * interface structure pointer.
1421  */
1422 struct ifnet *
1423 ifunit(const char *name)
1424 {
1425 	struct ifnet *ifp;
1426 
1427 	/*
1428 	 * Search all the interfaces for this name/number
1429 	 */
1430 
1431 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1432 		if (strncmp(ifp->if_xname, name, IFNAMSIZ) == 0)
1433 			break;
1434 	}
1435 	return (ifp);
1436 }
1437 
1438 
1439 /*
1440  * Map interface name in a sockaddr_dl to
1441  * interface structure pointer.
1442  */
1443 struct ifnet *
1444 if_withname(struct sockaddr *sa)
1445 {
1446 	char ifname[IFNAMSIZ+1];
1447 	struct sockaddr_dl *sdl = (struct sockaddr_dl *)sa;
1448 
1449 	if ( (sa->sa_family != AF_LINK) || (sdl->sdl_nlen == 0) ||
1450 	     (sdl->sdl_nlen > IFNAMSIZ) )
1451 		return NULL;
1452 
1453 	/*
1454 	 * ifunit wants a null-terminated name.  It may not be null-terminated
1455 	 * in the sockaddr.  We don't want to change the caller's sockaddr,
1456 	 * and there might not be room to put the trailing null anyway, so we
1457 	 * make a local copy that we know we can null terminate safely.
1458 	 */
1459 
1460 	bcopy(sdl->sdl_data, ifname, sdl->sdl_nlen);
1461 	ifname[sdl->sdl_nlen] = '\0';
1462 	return ifunit(ifname);
1463 }
1464 
1465 
1466 /*
1467  * Interface ioctls.
1468  */
/*
 * ifioctl() - process an interface ioctl issued on socket 'so'.
 *
 * Requests that do not reference a single interface (SIOCGIFCONF,
 * interface cloning) are dispatched immediately.  Everything else
 * looks the interface up by name and runs under that interface's
 * if_ioctl_mtx.  Commands not recognized here are handed down to
 * the socket's protocol, with 4.3BSD compatibility translation
 * when COMPAT_43 is defined.
 */
int
ifioctl(struct socket *so, u_long cmd, caddr_t data, struct ucred *cred)
{
	struct ifnet *ifp;
	struct ifreq *ifr;
	struct ifstat *ifs;
	int error;
	short oif_flags;
	int new_flags;
#ifdef COMPAT_43
	int ocmd;
#endif
	size_t namelen, onamelen;
	char new_name[IFNAMSIZ];
	struct ifaddr *ifa;
	struct sockaddr_dl *sdl;

	/* Configuration dump works on the whole interface list. */
	switch (cmd) {
	case SIOCGIFCONF:
	case OSIOCGIFCONF:
		return (ifconf(cmd, data, cred));
	default:
		break;
	}

	ifr = (struct ifreq *)data;

	/* Interface cloning requests (create/destroy/list cloners). */
	switch (cmd) {
	case SIOCIFCREATE:
	case SIOCIFCREATE2:
		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
			return (error);
		return (if_clone_create(ifr->ifr_name, sizeof(ifr->ifr_name),
		    	cmd == SIOCIFCREATE2 ? ifr->ifr_data : NULL));
	case SIOCIFDESTROY:
		if ((error = priv_check_cred(cred, PRIV_ROOT, 0)) != 0)
			return (error);
		return (if_clone_destroy(ifr->ifr_name));
	case SIOCIFGCLONERS:
		return (if_clone_list((struct if_clonereq *)data));
	default:
		break;
	}

	/*
	 * Nominal ioctl through interface, lookup the ifp and obtain a
	 * lock to serialize the ifconfig ioctl operation.
	 */
	ifp = ifunit(ifr->ifr_name);
	if (ifp == NULL)
		return (ENXIO);
	error = 0;
	mtx_lock(&ifp->if_ioctl_mtx);

	switch (cmd) {
	case SIOCGIFINDEX:
		ifr->ifr_index = ifp->if_index;
		break;

	case SIOCGIFFLAGS:
		ifr->ifr_flags = ifp->if_flags;
		ifr->ifr_flagshigh = ifp->if_flags >> 16;
		break;

	case SIOCGIFCAP:
		ifr->ifr_reqcap = ifp->if_capabilities;
		ifr->ifr_curcap = ifp->if_capenable;
		break;

	case SIOCGIFMETRIC:
		ifr->ifr_metric = ifp->if_metric;
		break;

	case SIOCGIFMTU:
		ifr->ifr_mtu = ifp->if_mtu;
		break;

	case SIOCGIFDATA:
		error = copyout((caddr_t)&ifp->if_data, ifr->ifr_data,
				sizeof(ifp->if_data));
		break;

	case SIOCGIFPHYS:
		ifr->ifr_phys = ifp->if_physical;
		break;

	case SIOCGIFPOLLCPU:
		/* No polling cpu is recorded; report "none". */
		ifr->ifr_pollcpu = -1;
		break;

	case SIOCSIFPOLLCPU:
		break;

	case SIOCSIFFLAGS:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		/* Reassemble the full 32 bit flags word from the two halves. */
		new_flags = (ifr->ifr_flags & 0xffff) |
		    (ifr->ifr_flagshigh << 16);
		if (ifp->if_flags & IFF_SMART) {
			/* Smart drivers twiddle their own routes */
		} else if (ifp->if_flags & IFF_UP &&
		    (new_flags & IFF_UP) == 0) {
			crit_enter();
			if_down(ifp);
			crit_exit();
		} else if (new_flags & IFF_UP &&
		    (ifp->if_flags & IFF_UP) == 0) {
			crit_enter();
			if_up(ifp);
			crit_exit();
		}

#ifdef IFPOLL_ENABLE
		if ((new_flags ^ ifp->if_flags) & IFF_NPOLLING) {
			if (new_flags & IFF_NPOLLING)
				ifpoll_register(ifp);
			else
				ifpoll_deregister(ifp);
		}
#endif

		ifp->if_flags = (ifp->if_flags & IFF_CANTCHANGE) |
			(new_flags &~ IFF_CANTCHANGE);
		if (new_flags & IFF_PPROMISC) {
			/* Permanently promiscuous mode requested */
			ifp->if_flags |= IFF_PROMISC;
		} else if (ifp->if_pcount == 0) {
			ifp->if_flags &= ~IFF_PROMISC;
		}
		if (ifp->if_ioctl) {
			ifnet_serialize_all(ifp);
			ifp->if_ioctl(ifp, cmd, data, cred);
			ifnet_deserialize_all(ifp);
		}
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFCAP:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		/* Refuse capabilities the hardware does not advertise. */
		if (ifr->ifr_reqcap & ~ifp->if_capabilities) {
			error = EINVAL;
			break;
		}
		ifnet_serialize_all(ifp);
		ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		break;

	case SIOCSIFNAME:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		error = copyinstr(ifr->ifr_data, new_name, IFNAMSIZ, NULL);
		if (error)
			break;
		if (new_name[0] == '\0') {
			error = EINVAL;
			break;
		}
		if (ifunit(new_name) != NULL) {
			error = EEXIST;
			break;
		}

		/* Detach/reattach cycle so everyone observes the rename. */
		EVENTHANDLER_INVOKE(ifnet_detach_event, ifp);

		/* Announce the departure of the interface. */
		rt_ifannouncemsg(ifp, IFAN_DEPARTURE);

		strlcpy(ifp->if_xname, new_name, sizeof(ifp->if_xname));
		ifa = TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
		/* XXX IFA_LOCK(ifa); */
		sdl = (struct sockaddr_dl *)ifa->ifa_addr;
		namelen = strlen(new_name);
		onamelen = sdl->sdl_nlen;
		/*
		 * Move the address if needed.  This is safe because we
		 * allocate space for a name of length IFNAMSIZ when we
		 * create this in if_attach().
		 */
		if (namelen != onamelen) {
			bcopy(sdl->sdl_data + onamelen,
			    sdl->sdl_data + namelen, sdl->sdl_alen);
		}
		bcopy(new_name, sdl->sdl_data, namelen);
		sdl->sdl_nlen = namelen;
		sdl = (struct sockaddr_dl *)ifa->ifa_netmask;
		bzero(sdl->sdl_data, onamelen);
		while (namelen != 0)
			sdl->sdl_data[--namelen] = 0xff;
		/* XXX IFA_UNLOCK(ifa) */

		EVENTHANDLER_INVOKE(ifnet_attach_event, ifp);

		/* Announce the return of the interface. */
		rt_ifannouncemsg(ifp, IFAN_ARRIVAL);
		break;

	case SIOCSIFMETRIC:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		ifp->if_metric = ifr->ifr_metric;
		getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFPHYS:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		if (ifp->if_ioctl == NULL) {
		        error = EOPNOTSUPP;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFMTU:
	{
		u_long oldmtu = ifp->if_mtu;

		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		if (ifp->if_ioctl == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		if (ifr->ifr_mtu < IF_MINMTU || ifr->ifr_mtu > IF_MAXMTU) {
			error = EINVAL;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		if (error == 0) {
			getmicrotime(&ifp->if_lastchange);
			rt_ifmsg(ifp);
		}
		/*
		 * If the link MTU changed, do network layer specific procedure.
		 */
		if (ifp->if_mtu != oldmtu) {
#ifdef INET6
			nd6_setmtu(ifp);
#endif
		}
		break;
	}

	case SIOCADDMULTI:
	case SIOCDELMULTI:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;

		/* Don't allow group membership on non-multicast interfaces. */
		if ((ifp->if_flags & IFF_MULTICAST) == 0) {
			error = EOPNOTSUPP;
			break;
		}

		/* Don't let users screw up protocols' entries. */
		if (ifr->ifr_addr.sa_family != AF_LINK) {
			error = EINVAL;
			break;
		}

		if (cmd == SIOCADDMULTI) {
			struct ifmultiaddr *ifma;
			error = if_addmulti(ifp, &ifr->ifr_addr, &ifma);
		} else {
			error = if_delmulti(ifp, &ifr->ifr_addr);
		}
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCSIFPHYADDR:
	case SIOCDIFPHYADDR:
#ifdef INET6
	case SIOCSIFPHYADDR_IN6:
#endif
	case SIOCSLIFPHYADDR:
        case SIOCSIFMEDIA:
	case SIOCSIFGENERIC:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		if (ifp->if_ioctl == 0) {
			error = EOPNOTSUPP;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		if (error == 0)
			getmicrotime(&ifp->if_lastchange);
		break;

	case SIOCGIFSTATUS:
		ifs = (struct ifstat *)data;
		ifs->ascii[0] = '\0';
		/* fall through */
	case SIOCGIFPSRCADDR:
	case SIOCGIFPDSTADDR:
	case SIOCGLIFPHYADDR:
	case SIOCGIFMEDIA:
	case SIOCGIFGENERIC:
		if (ifp->if_ioctl == NULL) {
			error = EOPNOTSUPP;
			break;
		}
		ifnet_serialize_all(ifp);
		error = ifp->if_ioctl(ifp, cmd, data, cred);
		ifnet_deserialize_all(ifp);
		break;

	case SIOCSIFLLADDR:
		error = priv_check_cred(cred, PRIV_ROOT, 0);
		if (error)
			break;
		error = if_setlladdr(ifp, ifr->ifr_addr.sa_data,
				     ifr->ifr_addr.sa_len);
		EVENTHANDLER_INVOKE(iflladdr_event, ifp);
		break;

	default:
		/*
		 * Not an interface-generic request: hand it down to the
		 * socket's protocol.
		 */
		oif_flags = ifp->if_flags;
		if (so->so_proto == 0) {
			error = EOPNOTSUPP;
			break;
		}
#ifndef COMPAT_43
		error = so_pru_control_direct(so, cmd, data, ifp);
#else
		ocmd = cmd;

		/* Translate 4.3BSD-era requests to their modern form. */
		switch (cmd) {
		case SIOCSIFDSTADDR:
		case SIOCSIFADDR:
		case SIOCSIFBRDADDR:
		case SIOCSIFNETMASK:
#if BYTE_ORDER != BIG_ENDIAN
			if (ifr->ifr_addr.sa_family == 0 &&
			    ifr->ifr_addr.sa_len < 16) {
				ifr->ifr_addr.sa_family = ifr->ifr_addr.sa_len;
				ifr->ifr_addr.sa_len = 16;
			}
#else
			if (ifr->ifr_addr.sa_len == 0)
				ifr->ifr_addr.sa_len = 16;
#endif
			break;
		case OSIOCGIFADDR:
			cmd = SIOCGIFADDR;
			break;
		case OSIOCGIFDSTADDR:
			cmd = SIOCGIFDSTADDR;
			break;
		case OSIOCGIFBRDADDR:
			cmd = SIOCGIFBRDADDR;
			break;
		case OSIOCGIFNETMASK:
			cmd = SIOCGIFNETMASK;
			break;
		default:
			break;
		}

		error = so_pru_control_direct(so, cmd, data, ifp);

		/* Convert results back into the old sockaddr layout. */
		switch (ocmd) {
		case OSIOCGIFADDR:
		case OSIOCGIFDSTADDR:
		case OSIOCGIFBRDADDR:
		case OSIOCGIFNETMASK:
			*(u_short *)&ifr->ifr_addr = ifr->ifr_addr.sa_family;
			break;
		}
#endif /* COMPAT_43 */

		if ((oif_flags ^ ifp->if_flags) & IFF_UP) {
#ifdef INET6
			DELAY(100);/* XXX: temporary workaround for fxp issue*/
			if (ifp->if_flags & IFF_UP) {
				crit_enter();
				in6_if_up(ifp);
				crit_exit();
			}
#endif
		}
		break;
	}

	mtx_unlock(&ifp->if_ioctl_mtx);
	return (error);
}
1874 
1875 /*
1876  * Set/clear promiscuous mode on interface ifp based on the truth value
1877  * of pswitch.  The calls are reference counted so that only the first
1878  * "on" request actually has an effect, as does the final "off" request.
1879  * Results are undefined if the "off" and "on" requests are not matched.
1880  */
1881 int
1882 ifpromisc(struct ifnet *ifp, int pswitch)
1883 {
1884 	struct ifreq ifr;
1885 	int error;
1886 	int oldflags;
1887 
1888 	oldflags = ifp->if_flags;
1889 	if (ifp->if_flags & IFF_PPROMISC) {
1890 		/* Do nothing if device is in permanently promiscuous mode */
1891 		ifp->if_pcount += pswitch ? 1 : -1;
1892 		return (0);
1893 	}
1894 	if (pswitch) {
1895 		/*
1896 		 * If the device is not configured up, we cannot put it in
1897 		 * promiscuous mode.
1898 		 */
1899 		if ((ifp->if_flags & IFF_UP) == 0)
1900 			return (ENETDOWN);
1901 		if (ifp->if_pcount++ != 0)
1902 			return (0);
1903 		ifp->if_flags |= IFF_PROMISC;
1904 		log(LOG_INFO, "%s: promiscuous mode enabled\n",
1905 		    ifp->if_xname);
1906 	} else {
1907 		if (--ifp->if_pcount > 0)
1908 			return (0);
1909 		ifp->if_flags &= ~IFF_PROMISC;
1910 		log(LOG_INFO, "%s: promiscuous mode disabled\n",
1911 		    ifp->if_xname);
1912 	}
1913 	ifr.ifr_flags = ifp->if_flags;
1914 	ifr.ifr_flagshigh = ifp->if_flags >> 16;
1915 	ifnet_serialize_all(ifp);
1916 	error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr, NULL);
1917 	ifnet_deserialize_all(ifp);
1918 	if (error == 0)
1919 		rt_ifmsg(ifp);
1920 	else
1921 		ifp->if_flags = oldflags;
1922 	return error;
1923 }
1924 
1925 /*
1926  * Return interface configuration
1927  * of system.  List may be used
1928  * in later ioctl's (above) to get
1929  * other information.
1930  */
1931 static int
1932 ifconf(u_long cmd, caddr_t data, struct ucred *cred)
1933 {
1934 	struct ifconf *ifc = (struct ifconf *)data;
1935 	struct ifnet *ifp;
1936 	struct sockaddr *sa;
1937 	struct ifreq ifr, *ifrp;
1938 	int space = ifc->ifc_len, error = 0;
1939 
1940 	ifrp = ifc->ifc_req;
1941 	TAILQ_FOREACH(ifp, &ifnet, if_link) {
1942 		struct ifaddr_container *ifac;
1943 		int addrs;
1944 
1945 		if (space <= sizeof ifr)
1946 			break;
1947 
1948 		/*
1949 		 * Zero the stack declared structure first to prevent
1950 		 * memory disclosure.
1951 		 */
1952 		bzero(&ifr, sizeof(ifr));
1953 		if (strlcpy(ifr.ifr_name, ifp->if_xname, sizeof(ifr.ifr_name))
1954 		    >= sizeof(ifr.ifr_name)) {
1955 			error = ENAMETOOLONG;
1956 			break;
1957 		}
1958 
1959 		addrs = 0;
1960 		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
1961 			struct ifaddr *ifa = ifac->ifa;
1962 
1963 			if (space <= sizeof ifr)
1964 				break;
1965 			sa = ifa->ifa_addr;
1966 			if (cred->cr_prison &&
1967 			    prison_if(cred, sa))
1968 				continue;
1969 			addrs++;
1970 #ifdef COMPAT_43
1971 			if (cmd == OSIOCGIFCONF) {
1972 				struct osockaddr *osa =
1973 					 (struct osockaddr *)&ifr.ifr_addr;
1974 				ifr.ifr_addr = *sa;
1975 				osa->sa_family = sa->sa_family;
1976 				error = copyout(&ifr, ifrp, sizeof ifr);
1977 				ifrp++;
1978 			} else
1979 #endif
1980 			if (sa->sa_len <= sizeof(*sa)) {
1981 				ifr.ifr_addr = *sa;
1982 				error = copyout(&ifr, ifrp, sizeof ifr);
1983 				ifrp++;
1984 			} else {
1985 				if (space < (sizeof ifr) + sa->sa_len -
1986 					    sizeof(*sa))
1987 					break;
1988 				space -= sa->sa_len - sizeof(*sa);
1989 				error = copyout(&ifr, ifrp,
1990 						sizeof ifr.ifr_name);
1991 				if (error == 0)
1992 					error = copyout(sa, &ifrp->ifr_addr,
1993 							sa->sa_len);
1994 				ifrp = (struct ifreq *)
1995 					(sa->sa_len + (caddr_t)&ifrp->ifr_addr);
1996 			}
1997 			if (error)
1998 				break;
1999 			space -= sizeof ifr;
2000 		}
2001 		if (error)
2002 			break;
2003 		if (!addrs) {
2004 			bzero(&ifr.ifr_addr, sizeof ifr.ifr_addr);
2005 			error = copyout(&ifr, ifrp, sizeof ifr);
2006 			if (error)
2007 				break;
2008 			space -= sizeof ifr;
2009 			ifrp++;
2010 		}
2011 	}
2012 	ifc->ifc_len -= space;
2013 	return (error);
2014 }
2015 
2016 /*
2017  * Just like if_promisc(), but for all-multicast-reception mode.
2018  */
2019 int
2020 if_allmulti(struct ifnet *ifp, int onswitch)
2021 {
2022 	int error = 0;
2023 	struct ifreq ifr;
2024 
2025 	crit_enter();
2026 
2027 	if (onswitch) {
2028 		if (ifp->if_amcount++ == 0) {
2029 			ifp->if_flags |= IFF_ALLMULTI;
2030 			ifr.ifr_flags = ifp->if_flags;
2031 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2032 			ifnet_serialize_all(ifp);
2033 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2034 					      NULL);
2035 			ifnet_deserialize_all(ifp);
2036 		}
2037 	} else {
2038 		if (ifp->if_amcount > 1) {
2039 			ifp->if_amcount--;
2040 		} else {
2041 			ifp->if_amcount = 0;
2042 			ifp->if_flags &= ~IFF_ALLMULTI;
2043 			ifr.ifr_flags = ifp->if_flags;
2044 			ifr.ifr_flagshigh = ifp->if_flags >> 16;
2045 			ifnet_serialize_all(ifp);
2046 			error = ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
2047 					      NULL);
2048 			ifnet_deserialize_all(ifp);
2049 		}
2050 	}
2051 
2052 	crit_exit();
2053 
2054 	if (error == 0)
2055 		rt_ifmsg(ifp);
2056 	return error;
2057 }
2058 
2059 /*
2060  * Add a multicast listenership to the interface in question.
2061  * The link layer provides a routine which converts
2062  */
int
if_addmulti(
	struct ifnet *ifp,	/* interface to manipulate */
	struct sockaddr *sa,	/* address to add */
	struct ifmultiaddr **retifma)
{
	struct sockaddr *llsa, *dupsa;
	int error;
	struct ifmultiaddr *ifma;

	/*
	 * If the matching multicast address already exists
	 * then don't add a new one, just add a reference
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
		if (sa_equal(sa, ifma->ifma_addr)) {
			ifma->ifma_refcount++;
			if (retifma)
				*retifma = ifma;
			return 0;
		}
	}

	/*
	 * Give the link layer a chance to accept/reject it, and also
	 * find out which AF_LINK address this maps to, if it isn't one
	 * already.
	 */
	if (ifp->if_resolvemulti) {
		ifnet_serialize_all(ifp);
		error = ifp->if_resolvemulti(ifp, &llsa, sa);
		ifnet_deserialize_all(ifp);
		if (error)
			return error;
	} else {
		llsa = NULL;
	}

	/* Record the membership with a private copy of the sockaddr. */
	ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
	dupsa = kmalloc(sa->sa_len, M_IFMADDR, M_WAITOK);
	bcopy(sa, dupsa, sa->sa_len);

	ifma->ifma_addr = dupsa;
	ifma->ifma_lladdr = llsa;
	ifma->ifma_ifp = ifp;
	ifma->ifma_refcount = 1;
	ifma->ifma_protospec = 0;
	rt_newmaddrmsg(RTM_NEWMADDR, ifma);

	/*
	 * Some network interfaces can scan the address list at
	 * interrupt time; lock them out.
	 */
	crit_enter();
	TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
	crit_exit();
	if (retifma)
		*retifma = ifma;

	/*
	 * Also track the resolved link-layer address, taking an extra
	 * reference if it is already on the list.
	 */
	if (llsa != NULL) {
		TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link) {
			if (sa_equal(ifma->ifma_addr, llsa))
				break;
		}
		if (ifma) {
			ifma->ifma_refcount++;
		} else {
			ifma = kmalloc(sizeof *ifma, M_IFMADDR, M_WAITOK);
			dupsa = kmalloc(llsa->sa_len, M_IFMADDR, M_WAITOK);
			bcopy(llsa, dupsa, llsa->sa_len);
			ifma->ifma_addr = dupsa;
			ifma->ifma_ifp = ifp;
			ifma->ifma_refcount = 1;
			crit_enter();
			TAILQ_INSERT_HEAD(&ifp->if_multiaddrs, ifma, ifma_link);
			crit_exit();
		}
	}
	/*
	 * We are certain we have added something, so call down to the
	 * interface to let them know about it.
	 */
	crit_enter();
	ifnet_serialize_all(ifp);
	if (ifp->if_ioctl)
		ifp->if_ioctl(ifp, SIOCADDMULTI, 0, NULL);
	ifnet_deserialize_all(ifp);
	crit_exit();

	return 0;
}
2154 
2155 /*
2156  * Remove a reference to a multicast address on this interface.  Yell
2157  * if the request does not match an existing membership.
2158  */
int
if_delmulti(struct ifnet *ifp, struct sockaddr *sa)
{
	struct ifmultiaddr *ifma;

	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
		if (sa_equal(sa, ifma->ifma_addr))
			break;
	if (ifma == NULL)
		return ENOENT;

	/* Still referenced: just drop the count. */
	if (ifma->ifma_refcount > 1) {
		ifma->ifma_refcount--;
		return 0;
	}

	rt_newmaddrmsg(RTM_DELMADDR, ifma);
	sa = ifma->ifma_lladdr;		/* reuse 'sa' for phase two below */
	crit_enter();
	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
	/*
	 * Make sure the interface driver is notified
	 * in the case of a link layer mcast group being left.
	 */
	if (ifma->ifma_addr->sa_family == AF_LINK && sa == NULL) {
		ifnet_serialize_all(ifp);
		ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
		ifnet_deserialize_all(ifp);
	}
	crit_exit();
	kfree(ifma->ifma_addr, M_IFMADDR);
	kfree(ifma, M_IFMADDR);
	if (sa == NULL)
		return 0;

	/*
	 * Now look for the link-layer address which corresponds to
	 * this network address.  It had been squirreled away in
	 * ifma->ifma_lladdr for this purpose (so we don't have
	 * to call ifp->if_resolvemulti() again), and we saved that
	 * value in sa above.  If somebody nasty deleted the
	 * link-layer address out from underneath us, we can deal because
	 * the address we stored is not the same as the one which was
	 * in the record for the link-layer address.  (So we don't complain
	 * in that case.)
	 */
	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
		if (sa_equal(sa, ifma->ifma_addr))
			break;
	if (ifma == NULL)
		return 0;

	if (ifma->ifma_refcount > 1) {
		ifma->ifma_refcount--;
		return 0;
	}

	/* Last reference: unlink, notify the driver, free everything. */
	crit_enter();
	ifnet_serialize_all(ifp);
	TAILQ_REMOVE(&ifp->if_multiaddrs, ifma, ifma_link);
	ifp->if_ioctl(ifp, SIOCDELMULTI, 0, NULL);
	ifnet_deserialize_all(ifp);
	crit_exit();
	kfree(ifma->ifma_addr, M_IFMADDR);
	kfree(sa, M_IFMADDR);
	kfree(ifma, M_IFMADDR);

	return 0;
}
2228 
2229 /*
2230  * Delete all multicast group membership for an interface.
2231  * Should be used to quickly flush all multicast filters.
2232  */
2233 void
2234 if_delallmulti(struct ifnet *ifp)
2235 {
2236 	struct ifmultiaddr *ifma;
2237 	struct ifmultiaddr *next;
2238 
2239 	TAILQ_FOREACH_MUTABLE(ifma, &ifp->if_multiaddrs, ifma_link, next)
2240 		if_delmulti(ifp, ifma->ifma_addr);
2241 }
2242 
2243 
2244 /*
2245  * Set the link layer address on an interface.
2246  *
2247  * At this time we only support certain types of interfaces,
2248  * and we don't allow the length of the address to change.
2249  */
int
if_setlladdr(struct ifnet *ifp, const u_char *lladdr, int len)
{
	struct sockaddr_dl *sdl;
	struct ifreq ifr;

	sdl = IF_LLSOCKADDR(ifp);
	if (sdl == NULL)
		return (EINVAL);
	if (len != sdl->sdl_alen)	/* don't allow length to change */
		return (EINVAL);
	switch (ifp->if_type) {
	case IFT_ETHER:			/* these types use struct arpcom */
	case IFT_XETHER:
	case IFT_L2VLAN:
		/* Update both the arpcom copy and the link-level sockaddr. */
		bcopy(lladdr, ((struct arpcom *)ifp->if_softc)->ac_enaddr, len);
		bcopy(lladdr, LLADDR(sdl), len);
		break;
	default:
		return (ENODEV);
	}
	/*
	 * If the interface is already up, we need
	 * to re-init it in order to reprogram its
	 * address filter.
	 */
	ifnet_serialize_all(ifp);
	if ((ifp->if_flags & IFF_UP) != 0) {
#ifdef INET
		struct ifaddr_container *ifac;
#endif

		/*
		 * Cycle the interface down and back up via SIOCSIFFLAGS
		 * so the driver reprograms its hardware address filter.
		 */
		ifp->if_flags &= ~IFF_UP;
		ifr.ifr_flags = ifp->if_flags;
		ifr.ifr_flagshigh = ifp->if_flags >> 16;
		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
			      NULL);
		ifp->if_flags |= IFF_UP;
		ifr.ifr_flags = ifp->if_flags;
		ifr.ifr_flagshigh = ifp->if_flags >> 16;
		ifp->if_ioctl(ifp, SIOCSIFFLAGS, (caddr_t)&ifr,
				 NULL);
#ifdef INET
		/*
		 * Also send gratuitous ARPs to notify other nodes about
		 * the address change.
		 */
		TAILQ_FOREACH(ifac, &ifp->if_addrheads[mycpuid], ifa_link) {
			struct ifaddr *ifa = ifac->ifa;

			if (ifa->ifa_addr != NULL &&
			    ifa->ifa_addr->sa_family == AF_INET)
				arp_gratuitous(ifp, ifa);
		}
#endif
	}
	ifnet_deserialize_all(ifp);
	return (0);
}
2309 
2310 struct ifmultiaddr *
2311 ifmaof_ifpforaddr(struct sockaddr *sa, struct ifnet *ifp)
2312 {
2313 	struct ifmultiaddr *ifma;
2314 
2315 	TAILQ_FOREACH(ifma, &ifp->if_multiaddrs, ifma_link)
2316 		if (sa_equal(ifma->ifma_addr, sa))
2317 			break;
2318 
2319 	return ifma;
2320 }
2321 
2322 /*
2323  * This function locates the first real ethernet MAC from a network
2324  * card and loads it into node, returning 0 on success or ENOENT if
2325  * no suitable interfaces were found.  It is used by the uuid code to
2326  * generate a unique 6-byte number.
2327  */
int
if_getanyethermac(uint16_t *node, int minlen)
{
	struct ifnet *ifp;
	struct sockaddr_dl *sdl;

	TAILQ_FOREACH(ifp, &ifnet, if_link) {
		if (ifp->if_type != IFT_ETHER)
			continue;
		/*
		 * NOTE(review): unlike if_setlladdr(), sdl is not checked
		 * for NULL here; assumes every IFT_ETHER interface has a
		 * link-level sockaddr — confirm.
		 */
		sdl = IF_LLSOCKADDR(ifp);
		if (sdl->sdl_alen < minlen)
			continue;
		/* Copy only the first minlen bytes of the MAC into node. */
		bcopy(((struct arpcom *)ifp->if_softc)->ac_enaddr, node,
		      minlen);
		return(0);
	}
	return (ENOENT);
}
2346 
2347 /*
2348  * The name argument must be a pointer to storage which will last as
2349  * long as the interface does.  For physical devices, the result of
2350  * device_get_name(dev) is a good choice and for pseudo-devices a
2351  * static string works well.
2352  */
2353 void
2354 if_initname(struct ifnet *ifp, const char *name, int unit)
2355 {
2356 	ifp->if_dname = name;
2357 	ifp->if_dunit = unit;
2358 	if (unit != IF_DUNIT_NONE)
2359 		ksnprintf(ifp->if_xname, IFNAMSIZ, "%s%d", name, unit);
2360 	else
2361 		strlcpy(ifp->if_xname, name, IFNAMSIZ);
2362 }
2363 
/*
 * kprintf() wrapper which prefixes the output with the interface
 * name ("<xname>: ").  Returns the total number of characters
 * emitted, including the prefix.
 */
int
if_printf(struct ifnet *ifp, const char *fmt, ...)
{
	__va_list ap;
	int retval;

	retval = kprintf("%s: ", ifp->if_xname);
	__va_start(ap, fmt);
	retval += kvprintf(fmt, ap);
	__va_end(ap);
	return (retval);
}
2376 
2377 struct ifnet *
2378 if_alloc(uint8_t type)
2379 {
2380         struct ifnet *ifp;
2381 	size_t size;
2382 
2383 	/*
2384 	 * XXX temporary hack until arpcom is setup in if_l2com
2385 	 */
2386 	if (type == IFT_ETHER)
2387 		size = sizeof(struct arpcom);
2388 	else
2389 		size = sizeof(struct ifnet);
2390 
2391 	ifp = kmalloc(size, M_IFNET, M_WAITOK|M_ZERO);
2392 
2393 	ifp->if_type = type;
2394 
2395 	if (if_com_alloc[type] != NULL) {
2396 		ifp->if_l2com = if_com_alloc[type](type, ifp);
2397 		if (ifp->if_l2com == NULL) {
2398 			kfree(ifp, M_IFNET);
2399 			return (NULL);
2400 		}
2401 	}
2402 	return (ifp);
2403 }
2404 
/*
 * Release an ifnet allocated by if_alloc().
 *
 * NOTE(review): if_alloc() may attach ifp->if_l2com through
 * if_com_alloc[], but nothing here calls if_com_free[] — confirm
 * whether l2com is released elsewhere or leaked on this path.
 */
void
if_free(struct ifnet *ifp)
{
	kfree(ifp, M_IFNET);
}
2410 
/*
 * Install the classic (plain FIFO) enqueue/dequeue/request handlers
 * on an ifaltq.
 */
void
ifq_set_classic(struct ifaltq *ifq)
{
	ifq->altq_enqueue = ifq_classic_enqueue;
	ifq->altq_dequeue = ifq_classic_dequeue;
	ifq->altq_request = ifq_classic_request;
}
2418 
2419 int
2420 ifq_classic_enqueue(struct ifaltq *ifq, struct mbuf *m,
2421 		    struct altq_pktattr *pa __unused)
2422 {
2423 	logifq(enqueue, ifq);
2424 	if (IF_QFULL(ifq)) {
2425 		m_freem(m);
2426 		return(ENOBUFS);
2427 	} else {
2428 		IF_ENQUEUE(ifq, m);
2429 		return(0);
2430 	}
2431 }
2432 
2433 struct mbuf *
2434 ifq_classic_dequeue(struct ifaltq *ifq, struct mbuf *mpolled, int op)
2435 {
2436 	struct mbuf *m;
2437 
2438 	switch (op) {
2439 	case ALTDQ_POLL:
2440 		IF_POLL(ifq, m);
2441 		break;
2442 	case ALTDQ_REMOVE:
2443 		logifq(dequeue, ifq);
2444 		IF_DEQUEUE(ifq, m);
2445 		break;
2446 	default:
2447 		panic("unsupported ALTQ dequeue op: %d", op);
2448 	}
2449 	KKASSERT(mpolled == NULL || mpolled == m);
2450 	return(m);
2451 }
2452 
2453 int
2454 ifq_classic_request(struct ifaltq *ifq, int req, void *arg)
2455 {
2456 	switch (req) {
2457 	case ALTRQ_PURGE:
2458 		IF_DRAIN(ifq);
2459 		break;
2460 	default:
2461 		panic("unsupported ALTQ request: %d", req);
2462 	}
2463 	return(0);
2464 }
2465 
/*
 * Try to drain the send queue by calling ifnet.if_start directly on
 * this CPU; fall back to scheduling if_start on the ifnet's CPU when
 * the TX serializer is contended or more work remains afterwards.
 * Caller must hold the if_start interlock (ifq->altq_started).
 */
static void
ifq_try_ifstart(struct ifaltq *ifq, int force_sched)
{
	struct ifnet *ifp = ifq->altq_ifp;
	int running = 0, need_sched;

	/*
	 * Try to do direct ifnet.if_start first, if there is
	 * contention on ifnet's serializer, ifnet.if_start will
	 * be scheduled on ifnet's CPU.
	 */
	if (!ifnet_tryserialize_tx(ifp)) {
		/*
		 * ifnet serializer contention happened,
		 * ifnet.if_start is scheduled on ifnet's
		 * CPU, and we keep going.
		 */
		logifstart(contend_sched, ifp);
		if_start_schedule(ifp, 1);
		return;
	}

	/* Only drive if_start while the interface is running and not oactive. */
	if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq)) {
		logifstart(run, ifp);
		ifp->if_start(ifp);
		if ((ifp->if_flags & IFF_RUNNING) && !ifq_is_oactive(ifq))
			running = 1;
	}
	need_sched = if_start_need_schedule(ifq, running);

	ifnet_deserialize_tx(ifp);

	if (need_sched) {
		/*
		 * More data need to be transmitted, ifnet.if_start is
		 * scheduled on ifnet's CPU, and we keep going.
		 * NOTE: ifnet.if_start interlock is not released.
		 */
		logifstart(sched, ifp);
		if_start_schedule(ifp, force_sched);
	}
}
2508 
2509 /*
2510  * IFQ packets staging mechanism:
2511  *
2512  * The packets enqueued into IFQ are staged to a certain amount before the
2513  * ifnet's if_start is called.  In this way, the driver could avoid writing
2514  * to hardware registers upon every packet, instead, hardware registers
2515  * could be written when certain amount of packets are put onto hardware
2516  * TX ring.  The measurement on several modern NICs (emx(4), igb(4), bnx(4),
2517  * bge(4), jme(4)) shows that the hardware registers writing aggregation
 * could save ~20% CPU time when 18-byte UDP datagrams are transmitted at
 * 1.48Mpps.  The performance improvement by hardware registers writing
 * aggregation is also mentioned by Luigi Rizzo's netmap paper
2521  * (http://info.iet.unipi.it/~luigi/netmap/).
2522  *
2523  * IFQ packets staging is performed for two entry points into drivers's
2524  * transmission function:
2525  * - Direct ifnet's if_start calling, i.e. ifq_try_ifstart()
2526  * - ifnet's if_start scheduling, i.e. if_start_schedule()
2527  *
2528  * IFQ packets staging will be stopped upon any of the following conditions:
 * - If the count of packets enqueued on the current CPU is greater than or
 *   equal to ifq_stage_cntmax. (XXX this should be per-interface)
 * - If the total length of packets enqueued on the current CPU is greater
 *   than or equal to the hardware's MTU - max_protohdr.  max_protohdr is
 *   cut from the hardware's MTU mainly because a full TCP segment's size
2534  *   is usually less than hardware's MTU.
2535  * - if_start_schedule() is not pending on the current CPU and if_start
2536  *   interlock (if_snd.altq_started) is not released.
2537  * - The if_start_rollup(), which is registered as low priority netisr
2538  *   rollup function, is called; probably because no more work is pending
2539  *   for netisr.
2540  *
2541  * NOTE:
2542  * Currently IFQ packet staging is only performed in netisr threads.
2543  */
/*
 * Enqueue an mbuf on ifp's send queue and, unless the packet is
 * being staged (see the IFQ packets staging comment above), start
 * transmission.  Returns the enqueue error (0 on success); the mbuf
 * is always consumed.  Must be called without the TX serializer.
 */
int
ifq_dispatch(struct ifnet *ifp, struct mbuf *m, struct altq_pktattr *pa)
{
	struct ifaltq *ifq = &ifp->if_snd;
	int error, start = 0, len, mcast = 0, avoid_start = 0;
	struct ifaltq_stage_head *head = NULL;
	struct ifaltq_stage *stage = NULL;

	ASSERT_IFNET_NOT_SERIALIZED_TX(ifp);

	len = m->m_pkthdr.len;
	if (m->m_flags & M_MCAST)
		mcast = 1;

	/* Staging is only performed in netisr threads. */
	if (curthread->td_type == TD_TYPE_NETISR) {
		head = &ifq_stage_heads[mycpuid];
		stage = &ifq->altq_stage[mycpuid];

		stage->ifqs_cnt++;
		stage->ifqs_len += len;
		if (stage->ifqs_cnt < ifq_stage_cntmax &&
		    stage->ifqs_len < (ifp->if_mtu - max_protohdr))
			avoid_start = 1;
	}

	ALTQ_LOCK(ifq);
	error = ifq_enqueue_locked(ifq, m, pa);
	if (error) {
		if (!ifq_data_ready(ifq)) {
			ALTQ_UNLOCK(ifq);
			return error;
		}
		/* Queue still has data; force a start despite the error. */
		avoid_start = 0;
	}
	if (!ifq->altq_started) {
		if (avoid_start) {
			/* Stage the packet; transmission deferred. */
			ALTQ_UNLOCK(ifq);

			KKASSERT(!error);
			if ((stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) == 0)
				ifq_stage_insert(head, stage);

			ifp->if_obytes += len;
			if (mcast)
				ifp->if_omcasts++;
			return error;
		}

		/*
		 * Hold the interlock of ifnet.if_start
		 */
		ifq->altq_started = 1;
		start = 1;
	}
	ALTQ_UNLOCK(ifq);

	if (!error) {
		ifp->if_obytes += len;
		if (mcast)
			ifp->if_omcasts++;
	}

	if (stage != NULL) {
		if (!start && (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)) {
			KKASSERT(stage->ifqs_flags & IFQ_STAGE_FLAG_QUED);
			if (!avoid_start) {
				ifq_stage_remove(head, stage);
				if_start_schedule(ifp, 1);
			}
			return error;
		}

		if (stage->ifqs_flags & IFQ_STAGE_FLAG_QUED) {
			ifq_stage_remove(head, stage);
		} else {
			/* Not staged: reset the per-cpu staging counters. */
			stage->ifqs_cnt = 0;
			stage->ifqs_len = 0;
		}
	}

	if (!start) {
		logifstart(avoid, ifp);
		return error;
	}

	ifq_try_ifstart(ifq, 0);
	return error;
}
2632 
/*
 * Allocate a zeroed ifaddr of 'size' bytes (must be at least
 * sizeof(struct ifaddr); protocol code may ask for more) together
 * with one reference-counted ifaddr_container per CPU, each holding
 * an initial reference.  Returns NULL only if the first kmalloc()
 * can fail under the given flags.
 */
void *
ifa_create(int size, int flags)
{
	struct ifaddr *ifa;
	int i;

	KASSERT(size >= sizeof(*ifa), ("ifaddr size too small"));

	ifa = kmalloc(size, M_IFADDR, flags | M_ZERO);
	if (ifa == NULL)
		return NULL;

	/* Per-cpu containers; M_WAITOK means this allocation cannot fail. */
	ifa->ifa_containers = kmalloc(ncpus * sizeof(struct ifaddr_container),
				      M_IFADDR, M_WAITOK | M_ZERO);
	ifa->ifa_ncnt = ncpus;
	for (i = 0; i < ncpus; ++i) {
		struct ifaddr_container *ifac = &ifa->ifa_containers[i];

		ifac->ifa_magic = IFA_CONTAINER_MAGIC;
		ifac->ifa = ifa;
		ifac->ifa_refcnt = 1;
	}
#ifdef IFADDR_DEBUG
	kprintf("alloc ifa %p %d\n", ifa, size);
#endif
	return ifa;
}
2660 
/*
 * Retire one CPU's ifaddr_container.  The caller must have dropped
 * ifac->ifa_refcnt to zero and removed the container from all lists.
 * When the last container is retired (ifa_ncnt reaches zero) the
 * container array and the ifaddr itself are freed.
 */
void
ifac_free(struct ifaddr_container *ifac, int cpu_id)
{
	struct ifaddr *ifa = ifac->ifa;

	KKASSERT(ifac->ifa_magic == IFA_CONTAINER_MAGIC);
	KKASSERT(ifac->ifa_refcnt == 0);
	KASSERT(ifac->ifa_listmask == 0,
		("ifa is still on %#x lists", ifac->ifa_listmask));

	/* Poison the magic so stale uses of this container are caught. */
	ifac->ifa_magic = IFA_CONTAINER_DEAD;

#ifdef IFADDR_DEBUG_VERBOSE
	kprintf("try free ifa %p cpu_id %d\n", ifac->ifa, cpu_id);
#endif

	KASSERT(ifa->ifa_ncnt > 0 && ifa->ifa_ncnt <= ncpus,
		("invalid # of ifac, %d", ifa->ifa_ncnt));
	/* atomic_fetchadd_int returns the pre-decrement value. */
	if (atomic_fetchadd_int(&ifa->ifa_ncnt, -1) == 1) {
#ifdef IFADDR_DEBUG
		kprintf("free ifa %p\n", ifa);
#endif
		kfree(ifa->ifa_containers, M_IFADDR);
		kfree(ifa, M_IFADDR);
	}
}
2687 
/*
 * Per-cpu half of ifa_iflink(): insert this cpu's ifaddr_container
 * onto ifp's per-cpu address list (at head or tail as requested),
 * then forward the message to the next cpu.
 */
static void
ifa_iflink_dispatch(netmsg_t nmsg)
{
	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
	struct ifaddr *ifa = msg->ifa;
	struct ifnet *ifp = msg->ifp;
	int cpu = mycpuid;
	struct ifaddr_container *ifac;

	crit_enter();

	ifac = &ifa->ifa_containers[cpu];
	ASSERT_IFAC_VALID(ifac);
	KASSERT((ifac->ifa_listmask & IFA_LIST_IFADDRHEAD) == 0,
		("ifaddr is on if_addrheads"));

	/* Mark membership before linking so the two always agree. */
	ifac->ifa_listmask |= IFA_LIST_IFADDRHEAD;
	if (msg->tail)
		TAILQ_INSERT_TAIL(&ifp->if_addrheads[cpu], ifac, ifa_link);
	else
		TAILQ_INSERT_HEAD(&ifp->if_addrheads[cpu], ifac, ifa_link);

	crit_exit();

	ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
}
2714 
2715 void
2716 ifa_iflink(struct ifaddr *ifa, struct ifnet *ifp, int tail)
2717 {
2718 	struct netmsg_ifaddr msg;
2719 
2720 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2721 		    0, ifa_iflink_dispatch);
2722 	msg.ifa = ifa;
2723 	msg.ifp = ifp;
2724 	msg.tail = tail;
2725 
2726 	ifa_domsg(&msg.base.lmsg, 0);
2727 }
2728 
/*
 * Per-cpu half of ifa_ifunlink(): remove this cpu's
 * ifaddr_container from ifp's per-cpu address list, then forward
 * the message to the next cpu.
 */
static void
ifa_ifunlink_dispatch(netmsg_t nmsg)
{
	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;
	struct ifaddr *ifa = msg->ifa;
	struct ifnet *ifp = msg->ifp;
	int cpu = mycpuid;
	struct ifaddr_container *ifac;

	crit_enter();

	ifac = &ifa->ifa_containers[cpu];
	ASSERT_IFAC_VALID(ifac);
	KASSERT(ifac->ifa_listmask & IFA_LIST_IFADDRHEAD,
		("ifaddr is not on if_addrhead"));

	TAILQ_REMOVE(&ifp->if_addrheads[cpu], ifac, ifa_link);
	ifac->ifa_listmask &= ~IFA_LIST_IFADDRHEAD;

	crit_exit();

	ifa_forwardmsg(&nmsg->lmsg, cpu + 1);
}
2752 
2753 void
2754 ifa_ifunlink(struct ifaddr *ifa, struct ifnet *ifp)
2755 {
2756 	struct netmsg_ifaddr msg;
2757 
2758 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2759 		    0, ifa_ifunlink_dispatch);
2760 	msg.ifa = ifa;
2761 	msg.ifp = ifp;
2762 
2763 	ifa_domsg(&msg.base.lmsg, 0);
2764 }
2765 
/*
 * Per-cpu half of ifa_destroy(): run IFAFREE() on the current cpu,
 * then forward the message to the next cpu.
 */
static void
ifa_destroy_dispatch(netmsg_t nmsg)
{
	struct netmsg_ifaddr *msg = (struct netmsg_ifaddr *)nmsg;

	IFAFREE(msg->ifa);
	ifa_forwardmsg(&nmsg->lmsg, mycpuid + 1);
}
2774 
2775 void
2776 ifa_destroy(struct ifaddr *ifa)
2777 {
2778 	struct netmsg_ifaddr msg;
2779 
2780 	netmsg_init(&msg.base, NULL, &curthread->td_msgport,
2781 		    0, ifa_destroy_dispatch);
2782 	msg.ifa = ifa;
2783 
2784 	ifa_domsg(&msg.base.lmsg, 0);
2785 }
2786 
/*
 * Return the message port of the per-cpu ifnet thread for 'cpu'.
 */
struct lwkt_port *
ifnet_portfn(int cpu)
{
	return &ifnet_threads[cpu].td_msgport;
}
2792 
2793 void
2794 ifnet_forwardmsg(struct lwkt_msg *lmsg, int next_cpu)
2795 {
2796 	KKASSERT(next_cpu > mycpuid && next_cpu <= ncpus);
2797 
2798 	if (next_cpu < ncpus)
2799 		lwkt_forwardmsg(ifnet_portfn(next_cpu), lmsg);
2800 	else
2801 		lwkt_replymsg(lmsg, 0);
2802 }
2803 
/*
 * Synchronously run lmsg on the ifnet thread of the given cpu and
 * return its reply code.
 */
int
ifnet_domsg(struct lwkt_msg *lmsg, int cpu)
{
	KKASSERT(cpu < ncpus);
	return lwkt_domsg(ifnet_portfn(cpu), lmsg, 0);
}
2810 
/*
 * Asynchronously send lmsg to the ifnet thread of the given cpu.
 */
void
ifnet_sendmsg(struct lwkt_msg *lmsg, int cpu)
{
	KKASSERT(cpu < ncpus);
	lwkt_sendmsg(ifnet_portfn(cpu), lmsg);
}
2817 
2818 /*
2819  * Generic netmsg service loop.  Some protocols may roll their own but all
2820  * must do the basic command dispatch function call done here.
2821  */
static void
ifnet_service_loop(void *arg __unused)
{
	netmsg_t msg;

	/* Block on this thread's message port and dispatch each message. */
	while ((msg = lwkt_waitport(&curthread->td_msgport, 0))) {
		KASSERT(msg->base.nm_dispatch, ("ifnet_service: badmsg"));
		msg->base.nm_dispatch(msg);
	}
}
2832 
/*
 * Low-priority netisr rollup: flush every packet staged on this CPU
 * (see the IFQ packets staging comment above).  Staged queues whose
 * if_start was already scheduled are re-scheduled on the ifnet's CPU;
 * the rest are started directly if the interlock can be acquired.
 */
static void
if_start_rollup(void)
{
	struct ifaltq_stage_head *head = &ifq_stage_heads[mycpuid];
	struct ifaltq_stage *stage;

	while ((stage = TAILQ_FIRST(&head->ifqs_head)) != NULL) {
		struct ifaltq *ifq = stage->ifqs_altq;
		int is_sched = 0;

		if (stage->ifqs_flags & IFQ_STAGE_FLAG_SCHED)
			is_sched = 1;
		ifq_stage_remove(head, stage);

		if (is_sched) {
			if_start_schedule(ifq->altq_ifp, 1);
		} else {
			int start = 0;

			ALTQ_LOCK(ifq);
			if (!ifq->altq_started) {
				/*
				 * Hold the interlock of ifnet.if_start
				 */
				ifq->altq_started = 1;
				start = 1;
			}
			ALTQ_UNLOCK(ifq);

			if (start)
				ifq_try_ifstart(ifq, 1);
		}
		KKASSERT((stage->ifqs_flags &
		    (IFQ_STAGE_FLAG_QUED | IFQ_STAGE_FLAG_SCHED)) == 0);
	}
}
2869 
/*
 * Boot-time initialization: spawn one ifnet service thread per cpu,
 * initialize the per-cpu staging queues and register the if_start
 * rollup with the netisr framework.
 */
static void
ifnetinit(void *dummy __unused)
{
	int i;

	for (i = 0; i < ncpus; ++i) {
		struct thread *thr = &ifnet_threads[i];

		lwkt_create(ifnet_service_loop, NULL, NULL,
			    thr, TDF_NOSTART|TDF_FORCE_SPINPORT,
			    i, "ifnet %d", i);
		netmsg_service_port_init(&thr->td_msgport);
		lwkt_schedule(thr);
	}

	for (i = 0; i < ncpus; ++i)
		TAILQ_INIT(&ifq_stage_heads[i].ifqs_head);
	netisr_register_rollup(if_start_rollup, NETISR_ROLLUP_PRIO_IFSTART);
}
2889 
2890 struct ifnet *
2891 ifnet_byindex(unsigned short idx)
2892 {
2893 	if (idx > if_index)
2894 		return NULL;
2895 	return ifindex2ifnet[idx];
2896 }
2897 
2898 struct ifaddr *
2899 ifaddr_byindex(unsigned short idx)
2900 {
2901 	struct ifnet *ifp;
2902 
2903 	ifp = ifnet_byindex(idx);
2904 	if (!ifp)
2905 		return NULL;
2906 	return TAILQ_FIRST(&ifp->if_addrheads[mycpuid])->ifa;
2907 }
2908 
2909 void
2910 if_register_com_alloc(u_char type,
2911     if_com_alloc_t *a, if_com_free_t *f)
2912 {
2913 
2914         KASSERT(if_com_alloc[type] == NULL,
2915             ("if_register_com_alloc: %d already registered", type));
2916         KASSERT(if_com_free[type] == NULL,
2917             ("if_register_com_alloc: %d free already registered", type));
2918 
2919         if_com_alloc[type] = a;
2920         if_com_free[type] = f;
2921 }
2922 
2923 void
2924 if_deregister_com_alloc(u_char type)
2925 {
2926 
2927         KASSERT(if_com_alloc[type] != NULL,
2928             ("if_deregister_com_alloc: %d not registered", type));
2929         KASSERT(if_com_free[type] != NULL,
2930             ("if_deregister_com_alloc: %d free not registered", type));
2931         if_com_alloc[type] = NULL;
2932         if_com_free[type] = NULL;
2933 }
2934 
2935 int
2936 if_ring_count2(int cnt, int cnt_max)
2937 {
2938 	int shift = 0;
2939 
2940 	KASSERT(cnt_max >= 1 && powerof2(cnt_max),
2941 	    ("invalid ring count max %d", cnt_max));
2942 
2943 	if (cnt <= 0)
2944 		cnt = cnt_max;
2945 	if (cnt > ncpus2)
2946 		cnt = ncpus2;
2947 	if (cnt > cnt_max)
2948 		cnt = cnt_max;
2949 
2950 	while ((1 << (shift + 1)) <= cnt)
2951 		++shift;
2952 	cnt = 1 << shift;
2953 
2954 	KASSERT(cnt >= 1 && cnt <= ncpus2 && cnt <= cnt_max,
2955 	    ("calculate cnt %d, ncpus2 %d, cnt max %d",
2956 	     cnt, ncpus2, cnt_max));
2957 	return cnt;
2958 }
2959 
/*
 * Set the queue's maximum length, padded so that packets staged on
 * all CPUs (up to ifq_stage_cntmax each) still fit when flushed.
 */
void
ifq_set_maxlen(struct ifaltq *ifq, int len)
{
	ifq->ifq_maxlen = len + (ncpus * ifq_stage_cntmax);
}
2965