xref: /dflybsd-src/sys/net/netmap/netmap.c (revision 50a82a170aaf19668f95311f53046f00865b9734)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module supports memory mapped access to network devices,
29  * see netmap(4).
30  *
31  * The module uses a large memory pool allocated by the kernel
32  * and accessible as mmapped memory by multiple userspace threads/processes.
33  * The memory pool contains packet buffers and "netmap rings",
34  * i.e. user-accessible copies of the interface's queues.
35  *
36  * Access to the network card works like this:
37  * 1. a process/thread issues one or more open() on /dev/netmap, to create
38  *    a select()able file descriptor on which events are reported.
39  * 2. on each descriptor, the process issues an ioctl() to identify
40  *    the interface that should report events to the file descriptor.
41  * 3. on each descriptor, the process issues an mmap() request to
42  *    map the shared memory region within the process' address space.
43  *    The list of interesting queues is indicated by a location in
44  *    the shared memory region.
45  * 4. using the functions in the netmap(4) userspace API, a process
46  *    can look up the occupation state of a queue, access memory buffers,
47  *    and retrieve received packets or enqueue packets to transmit.
48  * 5. using some ioctl()s the process can synchronize the userspace view
49  *    of the queue with the actual status in the kernel. This includes both
50  *    receiving the notification of new packets, and transmitting new
51  *    packets on the output interface.
52  * 6. select() or poll() can be used to wait for events on individual
53  *    transmit or receive queues (or all queues for a given interface).
54  *
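 * As an illustration only, the six steps above map to a userspace
 * sequence roughly like the following (error handling omitted, the
 * interface name is just an example, and the NETMAP_* helper macros
 * are the ones provided by <net/netmap_user.h>; see netmap(4) for the
 * authoritative API):
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	struct pollfd pfd;
 *	void *mem;
 *	int fd;
 *
 *	fd = open("/dev/netmap", O_RDWR);		// step 1
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);			// step 2
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);				// step 3
 *	nifp = NETMAP_IF(mem, req.nr_offset);		// step 4
 *	txring = NETMAP_TXRING(nifp, 0);
 *	pfd.fd = fd;
 *	pfd.events = POLLOUT;
 *	poll(&pfd, 1, -1);				// step 6
 *	while (txring->avail > 0) {			// step 4, enqueue
 *		struct netmap_slot *slot = &txring->slot[txring->cur];
 *		char *buf = NETMAP_BUF(txring, slot->buf_idx);
 *		// ... fill buf and set slot->len ...
 *		txring->cur = NETMAP_RING_NEXT(txring, txring->cur);
 *		txring->avail--;
 *	}
 *	ioctl(fd, NIOCTXSYNC, NULL);			// step 5
 *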
55 
56 		SYNCHRONIZATION (USER)
57 
58 The netmap rings and data structures may be shared among multiple
59 user threads or even independent processes.
60 Any synchronization among those threads/processes is delegated
61 to the threads themselves. Only one thread at a time can be in
62 a system call on the same netmap ring. The OS does not enforce
63 this and only guarantees against system crashes in case of
64 invalid usage.
65 
66 		LOCKING (INTERNAL)
67 
68 Within the kernel, access to the netmap rings is protected as follows:
69 
70 - a spinlock on each ring, to handle producer/consumer races on
71   RX rings attached to the host stack (against multiple host
72   threads writing from the host stack to the same ring),
73   and on 'destination' rings attached to a VALE switch
74   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
75   protecting multiple active senders for the same destination.
76 
77 - an atomic variable to guarantee that there is at most one
78   instance of *_*xsync() on the ring at any time.
79   For rings connected to user file
80   descriptors, an atomic_test_and_set() protects this, and the
81   lock on the ring is not actually used.
82   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
83   is also used to prevent multiple executions (the driver might indeed
84   already guarantee this).
85   For NIC TX rings connected to a VALE switch, the lock arbitrates
86   access to the queue (both when allocating buffers and when pushing
87   them out).
88 
89 - *xsync() should be protected against initializations of the card.
90   On FreeBSD most devices have the reset routine protected by
91   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
92  the RING protection on rx_reset(); this should be added.
93 
94   On linux there is an external lock on the tx path, which probably
95   also arbitrates access to the reset routine. XXX to be revised
96 
97 - a per-interface core_lock protecting access from the host stack
98   while interfaces may be detached from netmap mode.
99   XXX there should be no need for this lock if we detach the interfaces
100   only while they are down.
101 
102 
103 --- VALE SWITCH ---
104 
105 NMG_LOCK() serializes all modifications to switches and ports.
106 A switch cannot be deleted until all ports are gone.
107 
108 For each switch, an SX lock (RWlock on linux) protects
109 deletion of ports. When adding, configuring or deleting a port, the
110 lock is acquired in exclusive mode (after holding NMG_LOCK).
111 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
112 The lock is held throughout the entire forwarding cycle,
113 during which the thread may incur a page fault.
114 Hence it is important that sleepable shared locks are used.
115 
116 On the rx ring, the per-port lock is grabbed initially to reserve
117 a number of slots in the ring, then the lock is released,
118 packets are copied from source to destination, and then
119 the lock is acquired again and the receive ring is updated.
120 (A similar thing is done on the tx ring for NIC and host stack
121 ports attached to the switch)
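
In pseudocode, the rx-side sequence described above is roughly the
following (the helper names are purely illustrative, they are not
functions defined in this file):

	lock(dst_ring);				// reserve phase
	first = reserve_slots(dst_ring, n);	// claim n slots
	unlock(dst_ring);
	copy_packets(src, dst_ring, first, n);	// no lock held, may fault
	lock(dst_ring);				// publish phase
	advance_ring(dst_ring, n);		// make the new slots visible
	unlock(dst_ring);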
122 
123  */
124 
125 /*
126  * OS-specific code that is used only within this file.
127  * Other OS-specific code that must be accessed by drivers
128  * is present in netmap_kern.h
129  */
130 
131 #include <sys/types.h>
132 #include <sys/errno.h>
133 #include <sys/param.h>	/* defines used in kernel.h */
134 #include <sys/kernel.h>	/* types used in module initialization */
135 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
136 #include <sys/devfs.h>
137 #include <sys/sockio.h>
138 #include <sys/socketvar.h>	/* struct socket */
139 #include <sys/malloc.h>
140 #include <sys/poll.h>
141 #include <sys/lock.h>
142 #include <sys/socket.h> /* sockaddrs */
143 #include <sys/event.h>
144 #include <sys/sysctl.h>
145 #include <net/if.h>
146 #include <net/if_var.h>
147 #include <net/bpf.h>		/* BIOCIMMEDIATE */
148 #include <sys/bus.h>	/* bus_dmamap_* */
149 #include <sys/endian.h>
150 #include <sys/refcount.h>
151 
152 /* reduce conditional code */
153 #define init_waitqueue_head(x)	// only needed in linux
154 
155 extern struct dev_ops netmap_cdevsw;
156 
157 /*
158  * common headers
159  */
160 #include <net/netmap.h>
161 #include <net/netmap/netmap_kern.h>
162 #include <net/netmap/netmap_mem2.h>
163 
164 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
165 
166 /*
167  * The following variables are used by the drivers and replicate
168  * fields in the global memory pool. They only refer to buffers
169  * used by physical interfaces.
170  */
171 u_int netmap_total_buffers;
172 u_int netmap_buf_size;
173 char *netmap_buffer_base;	/* also address of an invalid buffer */
174 
175 /* user-controlled variables */
176 int netmap_verbose;
177 
178 static int netmap_no_timestamp; /* don't timestamp on rxsync */
179 
180 SYSCTL_NODE(_net, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
181 SYSCTL_INT(_net_netmap, OID_AUTO, verbose,
182     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
183 SYSCTL_INT(_net_netmap, OID_AUTO, no_timestamp,
184     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
185 int netmap_mitigate = 1;
186 SYSCTL_INT(_net_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
187 int netmap_no_pendintr = 1;
188 SYSCTL_INT(_net_netmap, OID_AUTO, no_pendintr,
189     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
190 int netmap_txsync_retry = 2;
191 SYSCTL_INT(_net_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
192     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
193 
194 int netmap_flags = 0;	/* debug flags */
195 int netmap_fwd = 0;	/* force transparent mode */
196 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
197 
198 /*
199  * netmap_admode selects the netmap mode to use.
200  * Invalid values are reset to NETMAP_ADMODE_BEST
201  */
202 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
203 	NETMAP_ADMODE_NATIVE,	/* either native or none */
204 	NETMAP_ADMODE_GENERIC,	/* force generic */
205 	NETMAP_ADMODE_LAST };
206 #define NETMAP_ADMODE_NATIVE        1  /* Force native netmap adapter. */
207 #define NETMAP_ADMODE_GENERIC       2  /* Force generic netmap adapter. */
208 #define NETMAP_ADMODE_BEST          0  /* Priority to native netmap adapter. */
209 static int netmap_admode = NETMAP_ADMODE_BEST;
210 
211 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
212 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
213 
214 SYSCTL_INT(_net_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
215 SYSCTL_INT(_net_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
216 SYSCTL_INT(_net_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
217 SYSCTL_INT(_net_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
218 SYSCTL_INT(_net_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
219 SYSCTL_INT(_net_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
220 
221 NMG_LOCK_T	netmap_global_lock;
222 
223 
224 static void
225 nm_kr_get(struct netmap_kring *kr)
226 {
227 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
228 		tsleep(kr, 0, "NM_KR_GET", 4);
229 }
230 
231 
232 void
233 netmap_disable_ring(struct netmap_kring *kr)
234 {
235 	kr->nkr_stopped = 1;
236 	nm_kr_get(kr);
237 	lockmgr(&kr->q_lock, LK_EXCLUSIVE);
238 	lockmgr(&kr->q_lock, LK_RELEASE);
239 	nm_kr_put(kr);
240 }
241 
242 
243 static void
244 netmap_set_all_rings(struct ifnet *ifp, int stopped)
245 {
246 	struct netmap_adapter *na;
247 	int i;
248 
249 	if (!(ifp->if_capenable & IFCAP_NETMAP))
250 		return;
251 
252 	na = NA(ifp);
253 
254 	for (i = 0; i <= na->num_tx_rings; i++) {
255 		if (stopped)
256 			netmap_disable_ring(na->tx_rings + i);
257 		else
258 			na->tx_rings[i].nkr_stopped = 0;
259 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
260 			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
261 	}
262 
263 	for (i = 0; i <= na->num_rx_rings; i++) {
264 		if (stopped)
265 			netmap_disable_ring(na->rx_rings + i);
266 		else
267 			na->rx_rings[i].nkr_stopped = 0;
268 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
269 			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
270 	}
271 }
272 
273 
274 void
275 netmap_disable_all_rings(struct ifnet *ifp)
276 {
277 	netmap_set_all_rings(ifp, 1 /* stopped */);
278 }
279 
280 
281 void
282 netmap_enable_all_rings(struct ifnet *ifp)
283 {
284 	netmap_set_all_rings(ifp, 0 /* enabled */);
285 }
286 
287 
288 /*
289  * generic bound_checking function
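 *
 * A typical (hypothetical) use is clamping a requested value into a
 * sane range before using it, e.g.:
 *
 *	u_int nslots = req_slots;	// req_slots: caller-supplied value
 *	nm_bound_var(&nslots, 1024, 64, 16384, "requested ring size");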
290  */
291 u_int
292 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
293 {
294 	u_int oldv = *v;
295 	const char *op = NULL;
296 
297 	if (dflt < lo)
298 		dflt = lo;
299 	if (dflt > hi)
300 		dflt = hi;
301 	if (oldv < lo) {
302 		*v = dflt;
303 		op = "Bump";
304 	} else if (oldv > hi) {
305 		*v = hi;
306 		op = "Clamp";
307 	}
308 	if (op && msg)
309 		kprintf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
310 	return *v;
311 }
312 
313 
314 /*
315  * packet-dump function, user-supplied or static buffer.
316  * The destination buffer must be at least 30+4*len
317  */
318 const char *
319 nm_dump_buf(char *p, int len, int lim, char *dst)
320 {
321 	static char _dst[8192];
322 	int i, j, i0;
323 	static char hex[] ="0123456789abcdef";
324 	char *o;	/* output position */
325 
326 #define P_HI(x)	hex[((x) & 0xf0)>>4]
327 #define P_LO(x)	hex[((x) & 0xf)]
328 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
329 	if (!dst)
330 		dst = _dst;
331 	if (lim <= 0 || lim > len)
332 		lim = len;
333 	o = dst;
334 	ksprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
335 	o += strlen(o);
336 	/* hexdump routine */
337 	for (i = 0; i < lim; ) {
338 		ksprintf(o, "%5d: ", i);
339 		o += strlen(o);
340 		memset(o, ' ', 48);
341 		i0 = i;
342 		for (j=0; j < 16 && i < lim; i++, j++) {
343 			o[j*3] = P_HI(p[i]);
344 			o[j*3+1] = P_LO(p[i]);
345 		}
346 		i = i0;
347 		for (j=0; j < 16 && i < lim; i++, j++)
348 			o[j + 48] = P_C(p[i]);
349 		o[j+48] = '\n';
350 		o += j+49;
351 	}
352 	*o = '\0';
353 #undef P_HI
354 #undef P_LO
355 #undef P_C
356 	return dst;
357 }
358 
359 
360 
361 /*
362  * Fetch configuration from the device, to cope with dynamic
363  * reconfigurations after loading the module.
364  */
365 int
366 netmap_update_config(struct netmap_adapter *na)
367 {
368 	struct ifnet *ifp = na->ifp;
369 	u_int txr, txd, rxr, rxd;
370 
371 	txr = txd = rxr = rxd = 0;
372 	if (na->nm_config) {
373 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
374 	} else {
375 		/* take whatever we had at init time */
376 		txr = na->num_tx_rings;
377 		txd = na->num_tx_desc;
378 		rxr = na->num_rx_rings;
379 		rxd = na->num_rx_desc;
380 	}
381 
382 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
383 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
384 		return 0; /* nothing changed */
385 	if (netmap_verbose || na->active_fds > 0) {
386 		D("stored config %s: txring %d x %d, rxring %d x %d",
387 			NM_IFPNAME(ifp),
388 			na->num_tx_rings, na->num_tx_desc,
389 			na->num_rx_rings, na->num_rx_desc);
390 		D("new config %s: txring %d x %d, rxring %d x %d",
391 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
392 	}
393 	if (na->active_fds == 0) {
394 		D("configuration changed (but fine)");
395 		na->num_tx_rings = txr;
396 		na->num_tx_desc = txd;
397 		na->num_rx_rings = rxr;
398 		na->num_rx_desc = rxd;
399 		return 0;
400 	}
401 	D("configuration changed while active, this is bad...");
402 	return 1;
403 }
404 
405 
406 int
407 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
408 {
409 	u_int i, len, ndesc;
410 	struct netmap_kring *kring;
411 
412 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
413 
414 	na->tx_rings = kmalloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
415 	if (na->tx_rings == NULL) {
416 		D("Cannot allocate krings");
417 		return ENOMEM;
418 	}
419 	na->rx_rings = na->tx_rings + ntx;
420 
421 	ndesc = na->num_tx_desc;
422 	for (i = 0; i < ntx; i++) { /* Transmit rings */
423 		kring = &na->tx_rings[i];
424 		bzero(kring, sizeof(*kring));
425 		kring->na = na;
426 		kring->nkr_num_slots = ndesc;
427 		/*
428 		 * IMPORTANT:
429 		 * Always keep one slot empty, so we can detect new
430 		 * transmissions comparing cur and nr_hwcur (they are
431 		 * the same only if there are no new transmissions).
432 		 */
433 		kring->nr_hwavail = ndesc - 1;
434 		lockinit(&kring->q_lock, "nm_txq_lock", 0, LK_CANRECURSE);
435 		init_waitqueue_head(&kring->si);
436 	}
437 
438 	ndesc = na->num_rx_desc;
439 	for (i = 0; i < nrx; i++) { /* Receive rings */
440 		kring = &na->rx_rings[i];
441 		bzero(kring, sizeof(*kring));
442 		kring->na = na;
443 		kring->nkr_num_slots = ndesc;
444 		lockinit(&kring->q_lock, "nm_rxq_lock", 0, LK_CANRECURSE);
445 		init_waitqueue_head(&kring->si);
446 	}
447 	init_waitqueue_head(&na->tx_si);
448 	init_waitqueue_head(&na->rx_si);
449 
450 	na->tailroom = na->rx_rings + nrx;
451 
452 	return 0;
453 
454 }
455 
456 
457 void
458 netmap_krings_delete(struct netmap_adapter *na)
459 {
460 	int i;
461 
462 	for (i = 0; i < na->num_tx_rings + 1; i++) {
463 		lockuninit(&na->tx_rings[i].q_lock);
464 	}
465 	for (i = 0; i < na->num_rx_rings + 1; i++) {
466 		lockuninit(&na->rx_rings[i].q_lock);
467 	}
468 	kfree(na->tx_rings, M_DEVBUF);
469 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
470 }
471 
472 
473 static struct netmap_if*
474 netmap_if_new(const char *ifname, struct netmap_adapter *na)
475 {
476 	struct netmap_if *nifp;
477 
478 	if (netmap_update_config(na)) {
479 		/* configuration mismatch, report and fail */
480 		return NULL;
481 	}
482 
483 	if (na->active_fds)
484 		goto final;
485 
486 	if (na->nm_krings_create(na))
487 		goto cleanup;
488 
489 	if (netmap_mem_rings_create(na))
490 		goto cleanup;
491 
492 final:
493 
494 	nifp = netmap_mem_if_new(ifname, na);
495 	if (nifp == NULL)
496 		goto cleanup;
497 
498 	return (nifp);
499 
500 cleanup:
501 
502 	if (na->active_fds == 0) {
503 		netmap_mem_rings_delete(na);
504 		na->nm_krings_delete(na);
505 	}
506 
507 	return NULL;
508 }
509 
510 
511 /* grab a reference to the memory allocator, if we don't have one already.  The
512  * reference is taken from the netmap_adapter registered with the priv.
513  *
514  */
515 static int
516 netmap_get_memory_locked(struct netmap_priv_d* p)
517 {
518 	struct netmap_mem_d *nmd;
519 	int error = 0;
520 
521 	if (p->np_na == NULL) {
522 		if (!netmap_mmap_unreg)
523 			return ENODEV;
524 		/* for compatibility with older versions of the API
525  		 * we use the global allocator when no interface has been
526  		 * registered
527  		 */
528 		nmd = &nm_mem;
529 	} else {
530 		nmd = p->np_na->nm_mem;
531 	}
532 	if (p->np_mref == NULL) {
533 		error = netmap_mem_finalize(nmd);
534 		if (!error)
535 			p->np_mref = nmd;
536 	} else if (p->np_mref != nmd) {
537 		/* a virtual port has been registered, but previous
538  		 * syscalls already used the global allocator.
539  		 * We cannot continue
540  		 */
541 		error = ENODEV;
542 	}
543 	return error;
544 }
545 
546 
547 int
548 netmap_get_memory(struct netmap_priv_d* p)
549 {
550 	int error;
551 	NMG_LOCK();
552 	error = netmap_get_memory_locked(p);
553 	NMG_UNLOCK();
554 	return error;
555 }
556 
557 
558 static int
559 netmap_have_memory_locked(struct netmap_priv_d* p)
560 {
561 	return p->np_mref != NULL;
562 }
563 
564 
565 static void
566 netmap_drop_memory_locked(struct netmap_priv_d* p)
567 {
568 	if (p->np_mref) {
569 		netmap_mem_deref(p->np_mref);
570 		p->np_mref = NULL;
571 	}
572 }
573 
574 
575 /*
576  * File descriptor's private data destructor.
577  *
578  * Call nm_register(ifp,0) to stop netmap mode on the interface and
579  * revert to normal operation. We expect that np_na->ifp has not gone away.
580  * The second argument is the nifp to work on. In some cases it is
581  * not attached yet to the netmap_priv_d so we need to pass it as
582  * a separate argument.
583  */
584 /* call with NMG_LOCK held */
585 static void
586 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
587 {
588 	struct netmap_adapter *na = priv->np_na;
589 	struct ifnet *ifp = na->ifp;
590 
591 	NMG_LOCK_ASSERT();
592 	na->active_fds--;
593 	if (na->active_fds <= 0) {	/* last instance */
594 
595 		if (netmap_verbose)
596 			D("deleting last instance for %s", NM_IFPNAME(ifp));
597 		/*
598 		 * (TO CHECK) This function is only called
599 		 * when the last reference to this file descriptor goes
600 		 * away. This means we cannot have any pending poll()
601 		 * or interrupt routine operating on the structure.
602 		 * XXX The file may be closed in a thread while
603 		 * another thread is using it.
604 		 * Linux keeps the file opened until the last reference
605 		 * by any outstanding ioctl/poll or mmap is gone.
606 		 * FreeBSD does not track mmap()s (but we do) and
607 		 * wakes up any sleeping poll(). Need to check what
608 		 * happens if the close() occurs while a concurrent
609 		 * syscall is running.
610 		 */
611 		if (ifp)
612 			na->nm_register(na, 0); /* off, clear IFCAP_NETMAP */
613 		/* Wake up any sleeping threads. netmap_poll will
614 		 * then return POLLERR
615 		 * XXX The wake up now must happen during *_down(), when
616 		 * we order all activities to stop. -gl
617 		 */
618 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
619 		/* knlist_destroy(&na->tx_si.si_note); */
620 		/* knlist_destroy(&na->rx_si.si_note); */
621 
622 		/* delete rings and buffers */
623 		netmap_mem_rings_delete(na);
624 		na->nm_krings_delete(na);
625 	}
626 	/* delete the nifp */
627 	netmap_mem_if_delete(na, nifp);
628 }
629 
630 
631 /*
632  * returns 1 if this is the last instance and we can free priv
633  */
634 int
635 netmap_dtor_locked(struct netmap_priv_d *priv)
636 {
637 	struct netmap_adapter *na = priv->np_na;
638 
639 	/*
640 	 * np_refcount is the number of active mmaps on
641 	 * this file descriptor
642 	 */
643 	if (--priv->np_refcount > 0) {
644 		return 0;
645 	}
646 	if (!na) {
647 	    return 1; //XXX is it correct?
648 	}
649 	netmap_do_unregif(priv, priv->np_nifp);
650 	priv->np_nifp = NULL;
651 	netmap_drop_memory_locked(priv);
652 	if (priv->np_na) {
653 		netmap_adapter_put(na);
654 		priv->np_na = NULL;
655 	}
656 	return 1;
657 }
658 
659 
660 void
661 netmap_dtor(void *data)
662 {
663 	struct netmap_priv_d *priv = data;
664 	int last_instance;
665 
666 	NMG_LOCK();
667 	last_instance = netmap_dtor_locked(priv);
668 	NMG_UNLOCK();
669 	if (last_instance) {
670 		bzero(priv, sizeof(*priv));	/* for safety */
671 		kfree(priv, M_DEVBUF);
672 	}
673 }
674 
675 
676 
677 
678 /*
679  * Handlers for synchronization of the queues from/to the host.
680  * Netmap has two operating modes:
681  * - in the default mode, the rings connected to the host stack are
682  *   just another ring pair managed by userspace;
683  * - in transparent mode (XXX to be defined) incoming packets
684  *   (from the host or the NIC) are marked as NS_FORWARD upon
685  *   arrival, and the user application has a chance to reset the
686  *   flag for packets that should be dropped.
687  *   On the RXSYNC or poll(), packets in RX rings between
688  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
689  *   to the other side.
690  * The transfer NIC --> host is relatively easy, just encapsulate
691  * into mbufs and we are done. The host --> NIC side is slightly
692  * harder because there might not be room in the tx ring so it
693  * might take a while before releasing the buffer.
694  */
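
/*
 * From the application side, the transparent mode described above
 * would be driven roughly as follows on a ring with NR_FORWARD set
 * (illustrative sketch only; should_drop() is a hypothetical
 * application predicate, and the details are still marked XXX above):
 *
 *	u_int i = ring->cur;
 *	u_int n = ring->avail;
 *
 *	while (n-- > 0) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *
 *		if (should_drop(slot))		// clear the flag to drop
 *			slot->flags &= ~NS_FORWARD;
 *		i = NETMAP_RING_NEXT(ring, i);
 *	}
 *	// slots still carrying NS_FORWARD are moved to the other side
 *	// on the next rxsync/poll, as described above.
 */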
695 
696 
697 /*
698  * pass a chain of buffers to the host stack as coming from 'dst'
699  */
700 static void
701 netmap_send_up(struct ifnet *dst, struct mbq *q)
702 {
703 	struct mbuf *m;
704 
705 	/* send packets up, outside the lock */
706 	while ((m = mbq_dequeue(q)) != NULL) {
707 		if (netmap_verbose & NM_VERB_HOST)
708 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
709 		NM_SEND_UP(dst, m);
710 	}
711 	mbq_destroy(q);
712 }
713 
714 
715 /*
716  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
717  * Run from hwcur to cur - reserved
718  */
719 static void
720 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
721 {
722 	/* Take packets from hwcur to cur-reserved and pass them up.
723 	 * In case of no buffers we give up. At the end of the loop,
724 	 * the queue is drained in all cases.
725 	 * XXX handle reserved
726 	 */
727 	u_int lim = kring->nkr_num_slots - 1;
728 	struct mbuf *m;
729 	u_int k = kring->ring->cur, n = kring->ring->reserved;
730 	struct netmap_adapter *na = kring->na;
731 
732 	/* compute the final position, ring->cur - ring->reserved */
733 	if (n > 0) {
734 		if (k < n)
735 			k += kring->nkr_num_slots;
736 		k += n;
737 	}
738 	for (n = kring->nr_hwcur; n != k;) {
739 		struct netmap_slot *slot = &kring->ring->slot[n];
740 
741 		n = nm_next(n, lim);
742 		if ((slot->flags & NS_FORWARD) == 0 && !force)
743 			continue;
744 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
745 			D("bad pkt at %d len %d", n, slot->len);
746 			continue;
747 		}
748 		slot->flags &= ~NS_FORWARD; // XXX needed ?
749 		/* XXX adapt to the case of a multisegment packet */
750 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
751 
752 		if (m == NULL)
753 			break;
754 		mbq_enqueue(q, m);
755 	}
756 }
757 
758 
759 /*
760  * The host ring has packets from nr_hwcur to (cur - reserved)
761  * to be sent down to the NIC.
762  * We need to use the queue lock on the source (host RX ring)
763  * to protect against netmap_transmit.
764  * If the user is well behaved we do not need to acquire locks
765  * on the destination(s),
766  * so we only need to make sure that there are no panics because
767  * of user errors.
768  * XXX verify
769  *
770  * We scan the tx rings, which have just been
771  * flushed so nr_hwcur == cur. Pushing packets down means
772  * increment cur and decrement avail.
773  * XXX to be verified
774  */
775 static void
776 netmap_sw_to_nic(struct netmap_adapter *na)
777 {
778 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
779 	struct netmap_kring *k1 = &na->tx_rings[0];
780 	u_int i, howmany, src_lim, dst_lim;
781 
782 	/* XXX we should also check that the carrier is on */
783 	if (kring->nkr_stopped)
784 		return;
785 
786 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
787 
788 	if (kring->nkr_stopped)
789 		goto out;
790 
791 	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
792 
793 	src_lim = kring->nkr_num_slots - 1;
794 	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
795 		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
796 		dst_lim = k1->nkr_num_slots - 1;
797 		while (howmany > 0 && k1->ring->avail > 0) {
798 			struct netmap_slot *src, *dst, tmp;
799 			src = &kring->ring->slot[kring->nr_hwcur];
800 			dst = &k1->ring->slot[k1->ring->cur];
801 			tmp = *src;
802 			src->buf_idx = dst->buf_idx;
803 			src->flags = NS_BUF_CHANGED;
804 
805 			dst->buf_idx = tmp.buf_idx;
806 			dst->len = tmp.len;
807 			dst->flags = NS_BUF_CHANGED;
808 			ND("out len %d buf %d from %d to %d",
809 				dst->len, dst->buf_idx,
810 				kring->nr_hwcur, k1->ring->cur);
811 
812 			kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
813 			howmany--;
814 			kring->nr_hwavail--;
815 			k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
816 			k1->ring->avail--;
817 		}
818 		kring->ring->cur = kring->nr_hwcur; // XXX
819 		k1++; // XXX why?
820 	}
821 out:
822 	lockmgr(&kring->q_lock, LK_RELEASE);
823 }
824 
825 
826 /*
827  * netmap_txsync_to_host() passes packets up. We are called from a
828  * system call in user process context, and the only contention
829  * can be among multiple user threads erroneously calling
830  * this routine concurrently.
831  */
832 void
833 netmap_txsync_to_host(struct netmap_adapter *na)
834 {
835 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
836 	struct netmap_ring *ring = kring->ring;
837 	u_int k, lim = kring->nkr_num_slots - 1;
838 	struct mbq q;
839 	int error;
840 
841 	error = nm_kr_tryget(kring);
842 	if (error) {
843 		if (error == NM_KR_BUSY)
844 			D("ring %p busy (user error)", kring);
845 		return;
846 	}
847 	k = ring->cur;
848 	if (k > lim) {
849 		D("invalid ring index in stack TX kring %p", kring);
850 		netmap_ring_reinit(kring);
851 		nm_kr_put(kring);
852 		return;
853 	}
854 
855 	/* Take packets from hwcur to cur and pass them up.
856 	 * In case of no buffers we give up. At the end of the loop,
857 	 * the queue is drained in all cases.
858 	 */
859 	mbq_init(&q);
860 	netmap_grab_packets(kring, &q, 1);
861 	kring->nr_hwcur = k;
862 	kring->nr_hwavail = ring->avail = lim;
863 
864 	nm_kr_put(kring);
865 	netmap_send_up(na->ifp, &q);
866 }
867 
868 
869 /*
870  * rxsync backend for packets coming from the host stack.
871  * They have been put in the queue by netmap_transmit() so we
872  * need to protect access to the kring using a lock.
873  *
874  * This routine also does the selrecord if called from the poll handler
875  * (we know because td != NULL).
876  *
877  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
878  *     as an additional hidden argument.
879  */
880 static void
881 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
882 {
883 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
884 	struct netmap_ring *ring = kring->ring;
885 	u_int j, n, lim = kring->nkr_num_slots;
886 	u_int k = ring->cur, resvd = ring->reserved;
887 
888 	(void)pwait;	/* disable unused warnings */
889 
890 	if (kring->nkr_stopped) /* check a first time without lock */
891 		return;
892 
893 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
894 
895 	if (kring->nkr_stopped)  /* check again with lock held */
896 		goto unlock_out;
897 
898 	if (k >= lim) {
899 		netmap_ring_reinit(kring);
900 		goto unlock_out;
901 	}
902 	/* new packets are already set in nr_hwavail */
903 	/* skip past packets that userspace has released */
904 	j = kring->nr_hwcur;
905 	if (resvd > 0) {
906 		if (resvd + ring->avail >= lim + 1) {
907 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
908 			ring->reserved = resvd = 0; // XXX panic...
909 		}
910 		k = (k >= resvd) ? k - resvd : k + lim - resvd;
911 	}
912 	if (j != k) {
913 		n = k >= j ? k - j : k + lim - j;
914 		kring->nr_hwavail -= n;
915 		kring->nr_hwcur = k;
916 	}
917 	k = ring->avail = kring->nr_hwavail - resvd;
918 	if (k == 0 && td)
919 		KNOTE(&kring->si.ki_note, 0);
920 	if (k && (netmap_verbose & NM_VERB_HOST))
921 		D("%d pkts from stack", k);
922 unlock_out:
923 
924 	lockmgr(&kring->q_lock, LK_RELEASE);
925 }
926 
927 
928 /* Get a netmap adapter for the port.
929  *
930  * If it is possible to satisfy the request, return 0
931  * with *na containing the netmap adapter found.
932  * Otherwise return an error code, with *na containing NULL.
933  *
934  * When the port is attached to a bridge, we always return
935  * EBUSY.
936  * Otherwise, if the port is already bound to a file descriptor,
937  * then we unconditionally return the existing adapter into *na.
938  * In all the other cases, we return (into *na) either native,
939  * generic or NULL, according to the following table:
940  *
941  *					native_support
942  * active_fds   dev.netmap.admode         YES     NO
943  * -------------------------------------------------------
944  *    >0              *                 NA(ifp) NA(ifp)
945  *
946  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
947  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
948  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
949  *
950  */
951 
952 int
953 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
954 {
955 	/* generic support */
956 	int i = netmap_admode;	/* Take a snapshot. */
957 	int error = 0;
958 	struct netmap_adapter *prev_na;
959 	struct netmap_generic_adapter *gna;
960 
961 	*na = NULL; /* default */
962 
963 	/* reset in case of invalid value */
964 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
965 		i = netmap_admode = NETMAP_ADMODE_BEST;
966 
967 	if (NETMAP_CAPABLE(ifp)) {
968 		/* If an adapter already exists, but is
969 		 * attached to a vale port, we report that the
970 		 * port is busy.
971 		 */
972 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
973 			return EBUSY;
974 
975 		/* If an adapter already exists, return it if
976 		 * there are active file descriptors or if
977 		 * netmap is not forced to use generic
978 		 * adapters.
979 		 */
980 		if (NA(ifp)->active_fds > 0 ||
981 				i != NETMAP_ADMODE_GENERIC) {
982 			*na = NA(ifp);
983 			return 0;
984 		}
985 	}
986 
987 	/* If there isn't native support and netmap is not allowed
988 	 * to use generic adapters, we cannot satisfy the request.
989 	 */
990 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
991 		return EINVAL;
992 
993 	/* Otherwise, create a generic adapter and return it,
994 	 * saving the previously used netmap adapter, if any.
995 	 *
996 	 * Note that here 'prev_na', if not NULL, MUST be a
997 	 * native adapter, and CANNOT be a generic one. This is
998 	 * true because generic adapters are created on demand, and
999 	 * destroyed when not used anymore. Therefore, if the adapter
1000 	 * currently attached to an interface 'ifp' is generic, it
1001 	 * must be that
1002 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1003 	 * Consequently, if NA(ifp) is generic, we will enter one of
1004 	 * the branches above. This ensures that we never override
1005 	 * a generic adapter with another generic adapter.
1006 	 */
1007 	prev_na = NA(ifp);
1008 	error = generic_netmap_attach(ifp);
1009 	if (error)
1010 		return error;
1011 
1012 	*na = NA(ifp);
1013 	gna = (struct netmap_generic_adapter*)NA(ifp);
1014 	gna->prev = prev_na; /* save old na */
1015 	if (prev_na != NULL) {
1016 		ifunit(ifp->if_xname);	/* XXX huh? */
1017 		// XXX add a refcount ?
1018 		netmap_adapter_get(prev_na);
1019 	}
1020 	D("Created generic NA %p (prev %p)", gna, gna->prev);
1021 
1022 	return 0;
1023 }
1024 
1025 
1026 /*
1027  * MUST BE CALLED UNDER NMG_LOCK()
1028  *
1029  * get a refcounted reference to an interface.
1030  * This is always called in the execution of an ioctl().
1031  *
1032  * Return ENXIO if the interface does not exist, EINVAL if netmap
1033  * is not supported by the interface.
1034  * If successful, hold a reference.
1035  *
1036  * When the NIC is attached to a bridge, reference is managed
1037  * at na->na_bdg_refcount using ADD/DROP_BDG_REF() as well as
1038  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1039  * is detached from the bridge, then ifp's refcount is dropped (this
1040  * is equivalent to destroying the ifp in the case of virtual ports).
1041  *
1042  * This function uses if_rele() when we want to prevent the NIC from
1043  * being detached from the bridge in error handling.  But once refcount
1044  * is acquired by this function, it must be released using nm_if_rele().
1045  */
1046 int
1047 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1048 {
1049 	struct ifnet *ifp;
1050 	int error = 0;
1051 	struct netmap_adapter *ret;
1052 
1053 	*na = NULL;     /* default return value */
1054 
1055 	/* first try to see if this is a bridge port. */
1056 	NMG_LOCK_ASSERT();
1057 
1058 	error = netmap_get_bdg_na(nmr, na, create);
1059 	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1060 		return error;
1061 
1062 	ifp = ifunit(nmr->nr_name);
1063 	if (ifp == NULL) {
1064 	        return ENXIO;
1065 	}
1066 
1067 	error = netmap_get_hw_na(ifp, &ret);
1068 	if (error)
1069 		goto out;
1070 
1071 	if (ret != NULL) {
1072 		/* Users cannot use the NIC attached to a bridge directly */
1073 		if (NETMAP_OWNED_BY_KERN(ret)) {
1074 			error = EINVAL;
1075 			goto out;
1076 		}
1077 		error = 0;
1078 		*na = ret;
1079 		netmap_adapter_get(ret);
1080 	}
1081 out:
1082 #if 0
1083 	if_rele(ifp);
1084 #endif
1085 
1086 	return error;
1087 }
1088 
1089 
1090 /*
1091  * Error routine called when txsync/rxsync detects an error.
1092  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1093  * Return 1 on reinit.
1094  *
1095  * This routine is only called by the upper half of the kernel.
1096  * It only reads hwcur (which is changed only by the upper half, too)
1097  * and hwavail (which may be changed by the lower half, but only on
1098  * a tx ring and only to increase it, so any error will be recovered
1099  * on the next call). For the above, we don't strictly need to call
1100  * it under lock.
1101  */
1102 int
1103 netmap_ring_reinit(struct netmap_kring *kring)
1104 {
1105 	struct netmap_ring *ring = kring->ring;
1106 	u_int i, lim = kring->nkr_num_slots - 1;
1107 	int errors = 0;
1108 
1109 	// XXX KASSERT nm_kr_tryget
1110 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1111 	if (ring->cur > lim)
1112 		errors++;
1113 	for (i = 0; i <= lim; i++) {
1114 		u_int idx = ring->slot[i].buf_idx;
1115 		u_int len = ring->slot[i].len;
1116 		if (idx < 2 || idx >= netmap_total_buffers) {
1117 			if (!errors++)
1118 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1119 			ring->slot[i].buf_idx = 0;
1120 			ring->slot[i].len = 0;
1121 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1122 			ring->slot[i].len = 0;
1123 			if (!errors++)
1124 				D("bad len %d at slot %d idx %d",
1125 					len, i, idx);
1126 		}
1127 	}
1128 	if (errors) {
1129 		int pos = kring - kring->na->tx_rings;
1130 		int n = kring->na->num_tx_rings + 1;
1131 
1132 		RD(10, "total %d errors", errors);
1133 		errors++;
1134 		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1135 			NM_IFPNAME(kring->na->ifp),
1136 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1137 			ring->cur, kring->nr_hwcur,
1138 			ring->avail, kring->nr_hwavail);
1139 		ring->cur = kring->nr_hwcur;
1140 		ring->avail = kring->nr_hwavail;
1141 	}
1142 	return (errors ? 1 : 0);
1143 }
1144 
1145 
1146 /*
1147  * Set the ring ID. For devices with a single queue, a request
1148  * for all rings is the same as a single ring.
1149  */
1150 static int
1151 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1152 {
1153 	struct netmap_adapter *na = priv->np_na;
1154 	struct ifnet *ifp = na->ifp;
1155 	u_int i = ringid & NETMAP_RING_MASK;
1156 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1157 	u_int lim = na->num_rx_rings;
1158 
1159 	if (na->num_tx_rings > lim)
1160 		lim = na->num_tx_rings;
1161 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1162 		D("invalid ring id %d", i);
1163 		return (EINVAL);
1164 	}
1165 	priv->np_ringid = ringid;
1166 	if (ringid & NETMAP_SW_RING) {
1167 		priv->np_qfirst = NETMAP_SW_RING;
1168 		priv->np_qlast = 0;
1169 	} else if (ringid & NETMAP_HW_RING) {
1170 		priv->np_qfirst = i;
1171 		priv->np_qlast = i + 1;
1172 	} else {
1173 		priv->np_qfirst = 0;
1174 		priv->np_qlast = NETMAP_HW_RING;
1175 	}
1176 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1177     if (netmap_verbose) {
1178 	if (ringid & NETMAP_SW_RING)
1179 		D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1180 	else if (ringid & NETMAP_HW_RING)
1181 		D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1182 			priv->np_qfirst);
1183 	else
1184 		D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1185     }
1186 	return 0;
1187 }
1188 
1189 
1190 /*
1191  * possibly move the interface to netmap-mode.
1192  * On success it returns a pointer to netmap_if, otherwise NULL.
1193  * This must be called with NMG_LOCK held.
1194  */
1195 struct netmap_if *
1196 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1197 	uint16_t ringid, int *err)
1198 {
1199 	struct ifnet *ifp = na->ifp;
1200 	struct netmap_if *nifp = NULL;
1201 	int error, need_mem = 0;
1202 
1203 	NMG_LOCK_ASSERT();
1204 	/* ring configuration may have changed, fetch from the card */
1205 	netmap_update_config(na);
1206 	priv->np_na = na;     /* store the reference */
1207 	error = netmap_set_ringid(priv, ringid);
1208 	if (error)
1209 		goto out;
1210 	/* ensure allocators are ready */
1211 	need_mem = !netmap_have_memory_locked(priv);
1212 	if (need_mem) {
1213 		error = netmap_get_memory_locked(priv);
1214 		ND("get_memory returned %d", error);
1215 		if (error)
1216 			goto out;
1217 	}
1218 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1219 	if (nifp == NULL) { /* allocation failed */
1220 		/* we should drop the allocator, but only
1221 		 * if we were the ones who grabbed it
1222 		 */
1223 		error = ENOMEM;
1224 		goto out;
1225 	}
1226 	na->active_fds++;
1227 	if (ifp->if_capenable & IFCAP_NETMAP) {
1228 		/* was already set */
1229 	} else {
1230 		/* Otherwise set the card in netmap mode
1231 		 * and make it use the shared buffers.
1232 		 *
1233 		 * do not core lock because the race is harmless here,
1234 		 * there cannot be any traffic to netmap_transmit()
1235 		 */
1236 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1237 		ND("%p->na_lut == %p", na, na->na_lut);
1238 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1239 		error = na->nm_register(na, 1); /* mode on */
1240 		if (error) {
1241 			netmap_do_unregif(priv, nifp);
1242 			nifp = NULL;
1243 		}
1244 	}
1245 out:
1246 	*err = error;
1247 	if (error) {
1248 		priv->np_na = NULL;
1249 		if (need_mem)
1250 			netmap_drop_memory_locked(priv);
1251 	}
1252 	if (nifp != NULL) {
1253 		/*
1254 		 * advertise that the interface is ready by setting np_nifp.
1255 		 * The barrier is needed because readers (poll and *SYNC)
1256 		 * check for priv->np_nifp != NULL without locking
1257 		 */
1258 		wmb(); /* make sure previous writes are visible to all CPUs */
1259 		priv->np_nifp = nifp;
1260 	}
1261 	return nifp;
1262 }
1263 
1264 
1265 
1266 /*
1267  * ioctl(2) support for the "netmap" device.
1268  *
1269  * Following a list of accepted commands:
1270  * - NIOCGINFO
1271  * - SIOCGIFADDR	just for convenience
1272  * - NIOCREGIF
1273  * - NIOCUNREGIF
1274  * - NIOCTXSYNC
1275  * - NIOCRXSYNC
1276  *
1277  * Return 0 on success, errno otherwise.
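 *
 * For example (illustrative only, error handling omitted, 'fd' being
 * a descriptor obtained from open("/dev/netmap", O_RDWR)), a userspace
 * query of the memory size and ring geometry of an interface is:
 *
 *	struct nmreq req;
 *
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strncpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCGINFO, &req);
 *	// req.nr_memsize, req.nr_tx_rings, req.nr_rx_rings,
 *	// req.nr_tx_slots and req.nr_rx_slots are now filled in.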
1278  */
1279 int
1280 netmap_ioctl(struct dev_ioctl_args *ap)
1281 {
1282 	struct netmap_priv_d *priv = NULL;
1283 	struct ifnet *ifp = NULL;
1284 	struct nmreq *nmr = (struct nmreq *) ap->a_data;
1285 	struct netmap_adapter *na = NULL;
1286 	int error;
1287 	u_int i, lim;
1288 	struct netmap_if *nifp;
1289 	struct netmap_kring *krings;
1290 	u_long cmd = ap->a_cmd;
1291 
1292 	priv = ap->a_head.a_dev->si_drv1;
1293 
1294 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
1295 	switch (cmd) {
1296 	case NIOCGINFO:		/* return capabilities etc */
1297 		if (nmr->nr_version != NETMAP_API) {
1298 			D("API mismatch got %d have %d",
1299 				nmr->nr_version, NETMAP_API);
1300 			nmr->nr_version = NETMAP_API;
1301 			error = EINVAL;
1302 			break;
1303 		}
1304 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1305 			error = netmap_bdg_ctl(nmr, NULL);
1306 			break;
1307 		}
1308 
1309 		NMG_LOCK();
1310 		do {
1311 			/* memsize is always valid */
1312 			struct netmap_mem_d *nmd = &nm_mem;
1313 			u_int memflags;
1314 
1315 			if (nmr->nr_name[0] != '\0') {
1316 				/* get a refcount */
1317 				error = netmap_get_na(nmr, &na, 1 /* create */);
1318 				if (error)
1319 					break;
1320 				nmd = na->nm_mem; /* get memory allocator */
1321 			}
1322 
1323 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1324 			if (error)
1325 				break;
1326 			if (na == NULL) /* only memory info */
1327 				break;
1328 			nmr->nr_offset = 0;
1329 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1330 			netmap_update_config(na);
1331 			nmr->nr_rx_rings = na->num_rx_rings;
1332 			nmr->nr_tx_rings = na->num_tx_rings;
1333 			nmr->nr_rx_slots = na->num_rx_desc;
1334 			nmr->nr_tx_slots = na->num_tx_desc;
1335 			if (memflags & NETMAP_MEM_PRIVATE)
1336 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1337 			netmap_adapter_put(na);
1338 		} while (0);
1339 		NMG_UNLOCK();
1340 		break;
1341 
1342 	case NIOCREGIF:
1343 		if (nmr->nr_version != NETMAP_API) {
1344 			nmr->nr_version = NETMAP_API;
1345 			error = EINVAL;
1346 			break;
1347 		}
1348 		/* possibly attach/detach NIC and VALE switch */
1349 		i = nmr->nr_cmd;
1350 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
1351 			error = netmap_bdg_ctl(nmr, NULL);
1352 			break;
1353 		} else if (i != 0) {
1354 			D("nr_cmd must be 0 not %d", i);
1355 			error = EINVAL;
1356 			break;
1357 		}
1358 
1359 		/* protect access to priv from concurrent NIOCREGIF */
1360 		NMG_LOCK();
1361 		do {
1362 			u_int memflags;
1363 
1364 			if (priv->np_na != NULL) {	/* thread already registered */
1365 				error = netmap_set_ringid(priv, nmr->nr_ringid);
1366 				break;
1367 			}
1368 			/* find the interface and a reference */
1369 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1370 			if (error)
1371 				break;
1372 			ifp = na->ifp;
1373 			if (NETMAP_OWNED_BY_KERN(na)) {
1374 				netmap_adapter_put(na);
1375 				error = EBUSY;
1376 				break;
1377 			}
1378 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1379 			if (!nifp) {    /* reg. failed, release priv and ref */
1380 				netmap_adapter_put(na);
1381 				priv->np_nifp = NULL;
1382 				break;
1383 			}
1384 
1385 			/* return the offset of the netmap_if object */
1386 			nmr->nr_rx_rings = na->num_rx_rings;
1387 			nmr->nr_tx_rings = na->num_tx_rings;
1388 			nmr->nr_rx_slots = na->num_rx_desc;
1389 			nmr->nr_tx_slots = na->num_tx_desc;
1390 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1391 			if (error) {
1392 				netmap_adapter_put(na);
1393 				break;
1394 			}
1395 			if (memflags & NETMAP_MEM_PRIVATE) {
1396 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1397 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1398 			}
1399 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1400 		} while (0);
1401 		NMG_UNLOCK();
1402 		break;
1403 
1404 	case NIOCUNREGIF:
1405 		// XXX we have no data here ?
1406 		D("deprecated, data is %p", nmr);
1407 		error = EINVAL;
1408 		break;
1409 
1410 	case NIOCTXSYNC:
1411 	case NIOCRXSYNC:
1412 		nifp = priv->np_nifp;
1413 
1414 		if (nifp == NULL) {
1415 			error = ENXIO;
1416 			break;
1417 		}
1418 		rmb(); /* make sure following reads are not from cache */
1419 
1420 		na = priv->np_na;      /* we have a reference */
1421 
1422 		if (na == NULL) {
1423 			D("Internal error: nifp != NULL && na == NULL");
1424 			error = ENXIO;
1425 			break;
1426 		}
1427 
1428 		ifp = na->ifp;
1429 		if (ifp == NULL) {
1430 			RD(1, "the ifp is gone");
1431 			error = ENXIO;
1432 			break;
1433 		}
1434 
1435 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1436 			if (cmd == NIOCTXSYNC)
1437 				netmap_txsync_to_host(na);
1438 			else
1439 				netmap_rxsync_from_host(na, NULL, NULL);
1440 			break;
1441 		}
1442 		/* find the last ring to scan */
1443 		lim = priv->np_qlast;
1444 		if (lim == NETMAP_HW_RING)
1445 			lim = (cmd == NIOCTXSYNC) ?
1446 			    na->num_tx_rings : na->num_rx_rings;
1447 
1448 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1449 		for (i = priv->np_qfirst; i < lim; i++) {
1450 			struct netmap_kring *kring = krings + i;
1451 			if (nm_kr_tryget(kring)) {
1452 				error = EBUSY;
1453 				goto out;
1454 			}
1455 			if (cmd == NIOCTXSYNC) {
1456 				if (netmap_verbose & NM_VERB_TXSYNC)
1457 					D("pre txsync ring %d cur %d hwcur %d",
1458 					    i, kring->ring->cur,
1459 					    kring->nr_hwcur);
1460 				na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1461 				if (netmap_verbose & NM_VERB_TXSYNC)
1462 					D("post txsync ring %d cur %d hwcur %d",
1463 					    i, kring->ring->cur,
1464 					    kring->nr_hwcur);
1465 			} else {
1466 				na->nm_rxsync(na, i, NAF_FORCE_READ);
1467 				microtime(&na->rx_rings[i].ring->ts);
1468 			}
1469 			nm_kr_put(kring);
1470 		}
1471 
1472 		break;
1473 	case BIOCIMMEDIATE:
1474 	case BIOCGHDRCMPLT:
1475 	case BIOCSHDRCMPLT:
1476 	case BIOCSSEESENT:
1477 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1478 		break;
1479 
1480 	default:	/* allow device-specific ioctls */
1481 	    {
1482 		struct socket so;
1483 
1484 		bzero(&so, sizeof(so));
1485 		NMG_LOCK();
1486 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1487 		if (error) {
1488 			netmap_adapter_put(na);
1489 			NMG_UNLOCK();
1490 			break;
1491 		}
1492 		ifp = na->ifp;
1493 		// so->so_proto not null.
1494 		error = ifioctl(&so, cmd, ap->a_data, ap->a_cred);
1495 		netmap_adapter_put(na);
1496 		NMG_UNLOCK();
1497 		break;
1498 	    }
1499 	}
1500 out:
1501 
1502 	return (error);
1503 }
1504 
1505 static int
1506 netmap_kqfilter_event(struct knote *kn, long hint)
1507 {
1508 	return (0);
1509 }
1510 
1511 static void
1512 netmap_kqfilter_detach(struct knote *kn)
1513 {
1514 }
1515 
1516 static struct filterops netmap_kqfilter_ops = {
1517 	FILTEROP_ISFD, NULL, netmap_kqfilter_detach, netmap_kqfilter_event,
1518 };
1519 
1520 int
1521 netmap_kqfilter(struct dev_kqfilter_args *ap)
1522 {
1523 	struct knote *kn = ap->a_kn;
1524 
1525 	ap->a_result = 0;
1526 
1527 	switch (kn->kn_filter) {
1528 	case EVFILT_READ:
1529 	case EVFILT_WRITE:
1530 		kn->kn_fop = &netmap_kqfilter_ops;
1531 		break;
1532 	default:
1533 		ap->a_result = EOPNOTSUPP;
1534 		return (0);
1535 	}
1536 
1537 	return (0);
1538 }
1539 
1540 /*
1541  * select(2) and poll(2) handlers for the "netmap" device.
1542  *
1543  * Can be called for one or more queues.
1544  * Return the event mask corresponding to ready events.
1545  * If there are no ready events, do a selrecord on either individual
1546  * selinfo or on the global one.
1547  * Device-dependent parts (locking and sync of tx/rx rings)
1548  * are done through callbacks.
1549  *
1550  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
1551  * The first one is remapped to pwait as selrecord() uses the name as a
1552  * hidden argument.
1553  */
1554 static int
1555 netmap_poll(struct cdev *dev, int events, struct thread *td)
1556 {
1557 	struct netmap_priv_d *priv = NULL;
1558 	struct netmap_adapter *na;
1559 	struct ifnet *ifp;
1560 	struct netmap_kring *kring;
1561 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1562 	u_int lim_tx, lim_rx, host_forwarded = 0;
1563 	struct mbq q;
1564 	void *pwait = dev;	/* linux compatibility */
1565 
1566 	/*
1567 	 * In order to avoid nested locks, we need to "double check"
1568 	 * txsync and rxsync if we decide to do a selrecord().
1569 	 * retry_tx (and retry_rx, later) prevent looping forever.
1570 	 */
1571 	int retry_tx = 1;
1572 
1573 	(void)pwait;
1574 	mbq_init(&q);
1575 
1576 	priv = dev->si_drv1;
1577 
1578 	if (priv->np_nifp == NULL) {
1579 		D("No if registered");
1580 		return POLLERR;
1581 	}
1582 	rmb(); /* make sure following reads are not from cache */
1583 
1584 	na = priv->np_na;
1585 	ifp = na->ifp;
1586 	// check for deleted
1587 	if (ifp == NULL) {
1588 		RD(1, "the ifp is gone");
1589 		return POLLERR;
1590 	}
1591 
1592 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1593 		return POLLERR;
1594 
1595 	if (netmap_verbose & 0x8000)
1596 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1597 	want_tx = events & (POLLOUT | POLLWRNORM);
1598 	want_rx = events & (POLLIN | POLLRDNORM);
1599 
1600 	lim_tx = na->num_tx_rings;
1601 	lim_rx = na->num_rx_rings;
1602 
1603 	if (priv->np_qfirst == NETMAP_SW_RING) {
1604 		/* handle the host stack ring */
1605 		if (priv->np_txpoll || want_tx) {
1606 			/* push any packets up, then we are always ready */
1607 			netmap_txsync_to_host(na);
1608 			revents |= want_tx;
1609 		}
1610 		if (want_rx) {
1611 			kring = &na->rx_rings[lim_rx];
1612 			if (kring->ring->avail == 0)
1613 				netmap_rxsync_from_host(na, td, dev);
1614 			if (kring->ring->avail > 0) {
1615 				revents |= want_rx;
1616 			}
1617 		}
1618 		return (revents);
1619 	}
1620 
1621 	/*
1622 	 * If we are in transparent mode, check also the host rx ring
1623 	 * XXX Transparent mode at the moment requires binding all
1624  	 * rings to a single file descriptor.
1625 	 */
1626 	kring = &na->rx_rings[lim_rx];
1627 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1628 			&& want_rx
1629 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
1630 		if (kring->ring->avail == 0)
1631 			netmap_rxsync_from_host(na, td, dev);
1632 		if (kring->ring->avail > 0)
1633 			revents |= want_rx;
1634 	}
1635 
1636 	/*
1637 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1638 	 * the file descriptor is bound to all of them. If so, we sleep on
1639 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1640 	 * (FreeBSD only allows two selinfo's per file descriptor).
1641  * The interrupt routine in the driver wakes one or the other
1642 	 * (or both) depending on which clients are active.
1643 	 *
1644 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1645 	 * txsync() is called if we run out of buffers on POLLOUT, or
1646 	 * there are pending packets to send. The latter can be disabled
1647 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
1648 	 */
1649 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1650 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1651 
1652 	if (priv->np_qlast != NETMAP_HW_RING) {
1653 		lim_tx = lim_rx = priv->np_qlast;
1654 	}
1655 
1656 	/*
1657 	 * We start with a lock free round which is cheap if we have
1658 	 * slots available. If this fails, then lock and call the sync
1659 	 * routines.
1660 	 */
1661 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1662 		kring = &na->rx_rings[i];
1663 		if (kring->ring->avail > 0) {
1664 			revents |= want_rx;
1665 			want_rx = 0;	/* also breaks the loop */
1666 		}
1667 	}
1668 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1669 		kring = &na->tx_rings[i];
1670 		if (kring->ring->avail > 0) {
1671 			revents |= want_tx;
1672 			want_tx = 0;	/* also breaks the loop */
1673 		}
1674 	}
1675 
1676 	/*
1677 	 * If we want to push packets out (priv->np_txpoll) or want_tx is
1678 	 * still set, we do need to run the txsync calls (on all rings,
1679 	 * to avoid that the tx rings stall).
1680 	 * XXX should also check cur != hwcur on the tx rings.
1681 	 * Fortunately, normal tx mode has np_txpoll set.
1682 	 */
1683 	if (priv->np_txpoll || want_tx) {
1684 		/* If we really want to be woken up (want_tx),
1685 		 * do a selrecord, either on the global or on
1686 		 * the private structure.  Then issue the txsync
1687 		 * so there is no race in the selrecord/selwait
1688 		 */
1689 flush_tx:
1690 		for (i = priv->np_qfirst; i < lim_tx; i++) {
1691 			kring = &na->tx_rings[i];
1692 			/*
1693 			 * Skip this ring if want_tx == 0
1694 			 * (we have already done a successful sync on
1695 			 * a previous ring) AND kring->cur == kring->hwcur
1696 			 * (there are no pending transmissions for this ring).
1697 			 */
1698 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1699 				continue;
1700 			/* make sure only one user thread is doing this */
1701 			if (nm_kr_tryget(kring)) {
1702 				ND("ring %p busy is %d",
1703 				    kring, (int)kring->nr_busy);
1704 				revents |= POLLERR;
1705 				goto out;
1706 			}
1707 
1708 			if (netmap_verbose & NM_VERB_TXSYNC)
1709 				D("send %d on %s %d",
1710 					kring->ring->cur, NM_IFPNAME(ifp), i);
1711 			if (na->nm_txsync(na, i, 0))
1712 				revents |= POLLERR;
1713 
1714 			/* Check avail/call selrecord only if called with POLLOUT */
1715 			if (want_tx) {
1716 				if (kring->ring->avail > 0) {
1717 					/* stop at the first ring. We don't risk
1718 					 * starvation.
1719 					 */
1720 					revents |= want_tx;
1721 					want_tx = 0;
1722 				}
1723 			}
1724 			nm_kr_put(kring);
1725 		}
1726 		if (want_tx && retry_tx) {
1727 			KNOTE(check_all_tx ? &na->tx_si.ki_note :
1728 			    &na->tx_rings[priv->np_qfirst].si.ki_note, 0);
1729 			retry_tx = 0;
1730 			goto flush_tx;
1731 		}
1732 	}
1733 
1734 	/*
1735 	 * now if want_rx is still set we need to lock and rxsync.
1736 	 * Do it on all rings because otherwise we starve.
1737 	 */
1738 	if (want_rx) {
1739 		int retry_rx = 1;
1740 do_retry_rx:
1741 		for (i = priv->np_qfirst; i < lim_rx; i++) {
1742 			kring = &na->rx_rings[i];
1743 
1744 			if (nm_kr_tryget(kring)) {
1745 				revents |= POLLERR;
1746 				goto out;
1747 			}
1748 
1749 			/* XXX NR_FORWARD should only be read on
1750 			 * physical or NIC ports
1751 			 */
1752 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
1753 				ND(10, "forwarding some buffers up %d to %d",
1754 				    kring->nr_hwcur, kring->ring->cur);
1755 				netmap_grab_packets(kring, &q, netmap_fwd);
1756 			}
1757 
1758 			if (na->nm_rxsync(na, i, 0))
1759 				revents |= POLLERR;
1760 			if (netmap_no_timestamp == 0 ||
1761 					kring->ring->flags & NR_TIMESTAMP) {
1762 				microtime(&kring->ring->ts);
1763 			}
1764 
1765 			if (kring->ring->avail > 0) {
1766 				revents |= want_rx;
1767 				retry_rx = 0;
1768 			}
1769 			nm_kr_put(kring);
1770 		}
1771 		if (retry_rx) {
1772 			retry_rx = 0;
1773 			KNOTE(check_all_rx ? &na->rx_si.ki_note :
1774 			    &na->rx_rings[priv->np_qfirst].si.ki_note, 0);
1775 			goto do_retry_rx;
1776 		}
1777 	}
1778 
1779 	/* forward packets from the host ring to the NIC rings.
1780 	 * I am accessing nr_hwavail without lock, but netmap_transmit
1781 	 * can only increment it, so the operation is safe.
1782 	 */
1783 	kring = &na->rx_rings[lim_rx];
1784 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1785 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
1786 			 && kring->nr_hwavail > 0 && !host_forwarded) {
1787 		netmap_sw_to_nic(na);
1788 		host_forwarded = 1; /* prevent another pass */
1789 		want_rx = 0;
1790 		goto flush_tx;
1791 	}
1792 
1793 	if (q.head)
1794 		netmap_send_up(na->ifp, &q);
1795 
1796 out:
1797 
1798 	return (revents);
1799 }
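
/*
 * Example (illustrative sketch, not part of this file): the poll handler
 * above is driven from userspace through the usual netmap(4) sequence of
 * open/NIOCREGIF/mmap/poll, roughly as follows (error checking omitted;
 * "em0" is just a placeholder interface name):
 *
 *	struct nmreq req;
 *	struct netmap_if *nifp;
 *	struct netmap_ring *txring;
 *	struct pollfd pfd;
 *	void *mem;
 *	int fd;
 *
 *	fd = open("/dev/netmap", O_RDWR);
 *	bzero(&req, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCREGIF, &req);		// bind fd to all rings of em0
 *	mem = mmap(NULL, req.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);
 *	nifp = NETMAP_IF(mem, req.nr_offset);
 *	txring = NETMAP_TXRING(nifp, 0);
 *
 *	pfd.fd = fd;
 *	pfd.events = POLLOUT;			// wait for free tx slots
 *	poll(&pfd, 1, -1);			// ends up in the handler above
 *	// now txring->avail > 0: fill slots, advance txring->cur, and the
 *	// next poll()/NIOCTXSYNC pushes the packets out.
 */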
1800 
1801 /*------- driver support routines ------*/
1802 
1803 static int netmap_hw_krings_create(struct netmap_adapter *);
1804 
1805 static int
1806 netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
1807 {
1808 	struct netmap_kring *kring;
1809 
1810 	if (tx == NR_TX) {
1811 		kring = na->tx_rings + n_ring;
1812 		KNOTE(&kring->si.ki_note, 0);
1813 		wakeup(&kring->si.ki_note);
1814 		if (flags & NAF_GLOBAL_NOTIFY)
1815 			wakeup(&na->tx_si.ki_note);
1816 	} else {
1817 		kring = na->rx_rings + n_ring;
1818 		KNOTE(&kring->si.ki_note, 0);
1819 		wakeup(&kring->si.ki_note);
1820 		if (flags & NAF_GLOBAL_NOTIFY)
1821 			wakeup(&na->rx_si.ki_note);
1822 	}
1823 	return 0;
1824 }
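
/*
 * For reference, this is the default na->nm_notify callback that the rest
 * of this file invokes when a ring needs to wake up its waiters, e.g.
 * (as done in netmap_transmit() and netmap_common_irq() below):
 *
 *	// wake up sleepers/selectors on rx ring q, and also on the
 *	// global rx selinfo:
 *	na->nm_notify(na, q, NR_RX, NAF_GLOBAL_NOTIFY);
 */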
1825 
1826 
1827 // XXX check handling of failures
1828 int
1829 netmap_attach_common(struct netmap_adapter *na)
1830 {
1831 	struct ifnet *ifp = na->ifp;
1832 
1833 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
1834 		D("%s: invalid rings tx %d rx %d",
1835 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
1836 		return EINVAL;
1837 	}
1838 	WNA(ifp) = na;
1839 	NETMAP_SET_CAPABLE(ifp);
1840 	if (na->nm_krings_create == NULL) {
1841 		na->nm_krings_create = netmap_hw_krings_create;
1842 		na->nm_krings_delete = netmap_krings_delete;
1843 	}
1844 	if (na->nm_notify == NULL)
1845 		na->nm_notify = netmap_notify;
1846 	na->active_fds = 0;
1847 
1848 	if (na->nm_mem == NULL)
1849 		na->nm_mem = &nm_mem;
1850 	return 0;
1851 }
1852 
1853 
1854 void
1855 netmap_detach_common(struct netmap_adapter *na)
1856 {
1857 	if (na->ifp)
1858 		WNA(na->ifp) = NULL; /* XXX do we need this? */
1859 
1860 	if (na->tx_rings) { /* XXX should not happen */
1861 		D("freeing leftover tx_rings");
1862 		na->nm_krings_delete(na);
1863 	}
1864 	if (na->na_flags & NAF_MEM_OWNER)
1865 		netmap_mem_private_delete(na->nm_mem);
1866 	bzero(na, sizeof(*na));
1867 	kfree(na, M_DEVBUF);
1868 }
1869 
1870 
1871 /*
1872  * Initialize a ``netmap_adapter`` object created by a driver at attach time.
1873  * We allocate a block of memory with room for a struct netmap_adapter
1874  * plus two sets of N+2 struct netmap_kring (where N is the number
1875  * of hardware rings):
1876  * krings	0..N-1	are for the hardware queues.
1877  * kring	N	is for the host stack queue
1878  * kring	N+1	is only used for the selinfo for all queues.
1879  * Return 0 on success, ENOMEM otherwise.
1880  *
1881  * The driver must set na->num_tx_rings and na->num_rx_rings before calling;
1882  * cards with different tx/rx setups can give the two counts different
1883  * values.
1884  */
1885 int
1886 netmap_attach(struct netmap_adapter *arg)
1887 {
1888 	struct netmap_hw_adapter *hwna = NULL;
1889 	// XXX when is arg == NULL ?
1890 	struct ifnet *ifp = arg ? arg->ifp : NULL;
1891 
1892 	if (arg == NULL || ifp == NULL)
1893 		goto fail;
1894 	hwna = kmalloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
1895 	if (hwna == NULL)
1896 		goto fail;
1897 	hwna->up = *arg;
1898 	if (netmap_attach_common(&hwna->up)) {
1899 		kfree(hwna, M_DEVBUF);
1900 		goto fail;
1901 	}
1902 	netmap_adapter_get(&hwna->up);
1903 
1904 	D("success for %s", NM_IFPNAME(ifp));
1905 	return 0;
1906 
1907 fail:
1908 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
1909 	if (ifp != NULL)	/* ifp may be NULL on this error path */
		netmap_detach(ifp);
1910 	return (hwna ? EINVAL : ENOMEM);
1911 }
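
/*
 * Example (illustrative sketch, not part of this file): a hypothetical
 * driver "foo" would typically call netmap_attach() from its attach
 * routine, after filling a temporary netmap_adapter (the structure is
 * copied by netmap_attach(), so a stack variable is fine).  The callbacks
 * foo_netmap_txsync/rxsync/register are assumed to be provided by the
 * driver:
 *
 *	static int
 *	foo_netmap_attach(struct foo_softc *sc)
 *	{
 *		struct netmap_adapter na;
 *
 *		bzero(&na, sizeof(na));
 *		na.ifp = sc->ifp;
 *		na.num_tx_desc = sc->num_tx_desc;
 *		na.num_rx_desc = sc->num_rx_desc;
 *		na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *		na.nm_txsync = foo_netmap_txsync;
 *		na.nm_rxsync = foo_netmap_rxsync;
 *		na.nm_register = foo_netmap_register;
 *		return (netmap_attach(&na));
 *	}
 */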
1912 
1913 
1914 void
1915 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
1916 {
1917 	if (!na) {
1918 		return;
1919 	}
1920 
1921 	refcount_acquire(&na->na_refcount);
1922 }
1923 
1924 
1925 /* returns 1 iff the netmap_adapter is destroyed */
1926 int
1927 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
1928 {
1929 	if (!na)
1930 		return 1;
1931 
1932 	if (!refcount_release(&na->na_refcount))
1933 		return 0;
1934 
1935 	if (na->nm_dtor)
1936 		na->nm_dtor(na);
1937 
1938 	netmap_detach_common(na);
1939 
1940 	return 1;
1941 }
1942 
1943 
1944 int
1945 netmap_hw_krings_create(struct netmap_adapter *na)
1946 {
1947 	return netmap_krings_create(na,
1948 		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
1949 }
1950 
1951 
1952 
1953 /*
1954  * Free the allocated memory linked to the given ``netmap_adapter``
1955  * object.
1956  */
1957 void
1958 netmap_detach(struct ifnet *ifp)
1959 {
1960 	struct netmap_adapter *na = NA(ifp);
1961 
1962 	if (!na)
1963 		return;
1964 
1965 	NMG_LOCK();
1966 	netmap_disable_all_rings(ifp);
1967 	netmap_adapter_put(na);
1968 	na->ifp = NULL;
1969 	netmap_enable_all_rings(ifp);
1970 	NMG_UNLOCK();
1971 }
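
/*
 * Example (illustrative sketch, not part of this file): the matching
 * teardown, typically called from the hypothetical driver's detach path
 * before the ifnet is destroyed:
 *
 *	static int
 *	foo_detach(device_t dev)
 *	{
 *		struct foo_softc *sc = device_get_softc(dev);
 *
 *		netmap_detach(sc->ifp);		// drop the netmap adapter
 *		ether_ifdetach(sc->ifp);
 *		// ... remaining driver-specific cleanup ...
 *		return (0);
 *	}
 */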
1972 
1973 
1974 /*
1975  * Intercept packets from the network stack and pass them
1976  * to netmap as incoming packets on the 'software' ring.
1977  * We rely on the OS to make sure that the ifp and na do not go
1978  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
1979  * In nm_register() or whenever there is a reinitialization,
1980  * we make sure to access the core lock and per-ring locks
1981  * so that IFCAP_NETMAP is visible here.
1982  */
1983 int
1984 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
1985 {
1986 	struct netmap_adapter *na = NA(ifp);
1987 	struct netmap_kring *kring;
1988 	u_int i, len = MBUF_LEN(m);
1989 	u_int error = EBUSY, lim;
1990 	struct netmap_slot *slot;
1991 
1992 	// XXX [Linux] we do not need this lock
1993 	// if we follow the down/configure/up protocol -gl
1994 	// mtx_lock(&na->core_lock);
1995 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
1996 		/* interface not in netmap mode anymore */
1997 		error = ENXIO;
1998 		goto done;
1999 	}
2000 
2001 	kring = &na->rx_rings[na->num_rx_rings];
2002 	lim = kring->nkr_num_slots - 1;
2003 	if (netmap_verbose & NM_VERB_HOST)
2004 		D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
2005 			kring->nr_hwcur + kring->nr_hwavail, len);
2006 	// XXX reconsider long packets if we handle fragments
2007 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2008 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2009 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2010 		goto done;
2011 	}
2012 	/* protect against other instances of netmap_transmit,
2013 	 * and userspace invocations of rxsync().
2014 	 */
2015 	// XXX [Linux] there can be no other instances of netmap_transmit
2016 	// on this same ring, but we still need this lock to protect
2017 	// concurrent access from netmap_sw_to_nic() -gl
2018 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
2019 	if (kring->nr_hwavail >= lim) {
2020 		if (netmap_verbose)
2021 			D("stack ring %s full\n", NM_IFPNAME(ifp));
2022 	} else {
2023 		/* compute the insert position */
2024 		i = nm_kr_rxpos(kring);
2025 		slot = &kring->ring->slot[i];
2026 		m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
2027 		slot->len = len;
2028 		slot->flags = kring->nkr_slot_flags;
2029 		kring->nr_hwavail++;
2030 		if (netmap_verbose & NM_VERB_HOST)
2031 			D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
2032 		na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2033 		error = 0;
2034 	}
2035 	lockmgr(&kring->q_lock, LK_RELEASE);
2036 
2037 done:
2038 	// mtx_unlock(&na->core_lock);
2039 
2040 	/* release the mbuf in either case (success or failure). As an
2041 	 * alternative, put the mbuf in a free list and free the list
2042 	 * only when really necessary.
2043 	 */
2044 	m_freem(m);
2045 
2046 	return (error);
2047 }
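
/*
 * Example (illustrative sketch, not part of this file): one way the host
 * stack output path can reach netmap_transmit() is through a transmit
 * hook that diverts mbufs while the interface is in netmap mode.  The
 * actual hook-up is platform- and driver-specific; the sketch below only
 * shows the contract (netmap_transmit() always consumes the mbuf and
 * returns an errno):
 *
 *	static int
 *	foo_transmit(struct ifnet *ifp, struct mbuf *m)
 *	{
 *		if (ifp->if_capenable & IFCAP_NETMAP)
 *			return (netmap_transmit(ifp, m));	// mbuf consumed
 *		return (foo_transmit_regular(ifp, m));		// normal path
 *	}
 */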
2048 
2049 
2050 /*
2051  * netmap_reset() is called by the driver routines when reinitializing
2052  * a ring. The driver is in charge of locking to protect the kring.
2053  * If native netmap mode is not set just return NULL.
2054  */
2055 struct netmap_slot *
2056 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2057 	u_int new_cur)
2058 {
2059 	struct netmap_kring *kring;
2060 	int new_hwofs, lim;
2061 
2062 	if (na == NULL) {
2063 		D("NULL na, should not happen");
2064 		return NULL;	/* no netmap support here */
2065 	}
2066 	if (!(na->ifp->if_capenable & IFCAP_NETMAP) || nma_is_generic(na)) {
2067 		ND("interface not in netmap mode");
2068 		return NULL;	/* nothing to reinitialize */
2069 	}
2070 
2071 	/* XXX note: in the new scheme, we are not guaranteed to be
2072 	 * under lock (e.g. when called on a device reset).
2073 	 * In this case, we should set a flag and not trust the
2074 	 * values too much. In practice: TODO
2075 	 * - set a RESET flag somewhere in the kring
2076 	 * - do the processing in a conservative way
2077 	 * - let the *sync() fixup at the end.
2078 	 */
2079 	if (tx == NR_TX) {
2080 		if (n >= na->num_tx_rings)
2081 			return NULL;
2082 		kring = na->tx_rings + n;
2083 		new_hwofs = kring->nr_hwcur - new_cur;
2084 	} else {
2085 		if (n >= na->num_rx_rings)
2086 			return NULL;
2087 		kring = na->rx_rings + n;
2088 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
2089 	}
2090 	lim = kring->nkr_num_slots - 1;
2091 	if (new_hwofs > lim)
2092 		new_hwofs -= lim + 1;
2093 
2094 	/* Always set the new offset value and realign the ring. */
2095 	D("%s hwofs %d -> %d, hwavail %d -> %d",
2096 		tx == NR_TX ? "TX" : "RX",
2097 		kring->nkr_hwofs, new_hwofs,
2098 		kring->nr_hwavail,
2099 		tx == NR_TX ? lim : kring->nr_hwavail);
2100 	kring->nkr_hwofs = new_hwofs;
2101 	if (tx == NR_TX)
2102 		kring->nr_hwavail = lim;
2103 	kring->nr_hwreserved = 0;
2104 
2105 	/*
2106 	 * Wakeup on the individual and global selwait
2107 	 * We do the wakeup here, but the ring is not yet reconfigured.
2108 	 * However, we are under lock so there are no races.
2109 	 */
2110 	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2111 	return kring->ring->slot;
2112 }
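
/*
 * Example (illustrative sketch, not part of this file): the typical
 * caller of netmap_reset() is a driver ring-initialization routine,
 * which uses the returned slot array to point its descriptors at the
 * netmap buffers.  The sketch assumes the usual driver-side helpers
 * netmap_idx_n2k() and PNMB() from netmap_kern.h, plus a hypothetical
 * "foo" driver softc sc and TX ring txr:
 *
 *	struct netmap_adapter *na = NA(sc->ifp);
 *	struct netmap_slot *slot;
 *	u_int i;
 *
 *	slot = netmap_reset(na, NR_TX, txr->me, 0);
 *	if (slot != NULL) {		// interface is in netmap mode
 *		for (i = 0; i < na->num_tx_desc; i++) {
 *			int si = netmap_idx_n2k(&na->tx_rings[txr->me], i);
 *			uint64_t paddr;
 *			void *addr = PNMB(slot + si, &paddr);
 *			// program paddr (and addr) into descriptor i of txr
 *		}
 *	}
 */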
2113 
2114 
2115 /*
2116  * Default functions to handle rx/tx interrupts from a physical device.
2117  * "work_done" is non-null on the RX path, NULL for the TX path.
2118  * "q" is the queue (ring) index; bits outside NETMAP_RING_MASK are
2119  * masked off.
2120  * We rely on the OS to make sure that there is only one active
2121  * instance per queue, and that there is appropriate locking.
2122  *
2123  * If the card is not in netmap mode, simply return 0,
2124  * so that the caller proceeds with regular processing.
2125  *
2126  * We return 0 also when the card is in netmap mode but the current
2127  * netmap adapter is the generic one, because this function will be
2128  * called by the generic layer.
2129  *
2130  * If the card is connected to a netmap file descriptor,
2131  * do a selwakeup on the individual queue, plus one on the global one
2132  * if needed (multiqueue card _and_ there are multiqueue listeners),
2133  * and return 1.
2134  *
2135  * Finally, if called on rx from an interface connected to a switch,
2136  * call the proper forwarding routine and return 1.
2137  */
2138 int
2139 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2140 {
2141 	struct netmap_adapter *na = NA(ifp);
2142 	struct netmap_kring *kring;
2143 
2144 	q &= NETMAP_RING_MASK;
2145 
2146 	if (netmap_verbose) {
2147 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2148 	}
2149 
2150 	if (work_done) { /* RX path */
2151 		if (q >= na->num_rx_rings)
2152 			return 0;	// not a physical queue
2153 		kring = na->rx_rings + q;
2154 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2155 		na->nm_notify(na, q, NR_RX,
2156 			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2157 		*work_done = 1; /* do not fire napi again */
2158 	} else { /* TX path */
2159 		if (q >= na->num_tx_rings)
2160 			return 0;	// not a physical queue
2161 		kring = na->tx_rings + q;
2162 		na->nm_notify(na, q, NR_TX,
2163 			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2164 	}
2165 	return 1;
2166 }
2167 
2168 /*
2169  * Default handler for rx/tx interrupts from a physical device; see the
2170  * comment above netmap_common_irq() for the full description.
2171  * "work_done" is non-null on the RX path, NULL for the TX path.
2172  *
2173  * Return 0 if the card is not in netmap mode, or if interrupts are not
2174  * to be intercepted (NAF_SKIP_INTR), so that the caller proceeds with
2175  * regular processing; otherwise dispatch to netmap_common_irq() and
2176  * return its result.
2177  */
2187 int
2188 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2189 {
2190 	// XXX could we check NAF_NATIVE_ON ?
2191 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2192 		return 0;
2193 
2194 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2195 		ND("use regular interrupt");
2196 		return 0;
2197 	}
2198 
2199 	return netmap_common_irq(ifp, q, work_done);
2200 }
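
/*
 * Example (illustrative sketch, not part of this file): a driver
 * interrupt handler hands control to netmap as described above,
 * passing &work_done on the RX path and NULL on the TX path
 * (rxr/txr are hypothetical per-queue driver structures):
 *
 *	u_int work_done;
 *
 *	if (netmap_rx_irq(ifp, rxr->me, &work_done))
 *		return;		// RX queue serviced in netmap mode
 *	// ... regular RX processing ...
 *
 *	if (netmap_rx_irq(ifp, txr->me, NULL))
 *		return;		// TX queue serviced in netmap mode
 *	// ... regular TX cleanup ...
 */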
2201 
2202 
2203 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2204 
2205 
2206 /*
2207  * Module loader.
2208  *
2209  * Create the /dev/netmap device and initialize all global
2210  * variables.
2211  *
2212  * Return 0 on success, errno on failure.
2213  */
2214 int
2215 netmap_init(void)
2216 {
2217 	int error;
2218 
2219 	NMG_LOCK_INIT();
2220 
2221 	error = netmap_mem_init();
2222 	if (error != 0) {
2223 		kprintf("netmap: unable to initialize the memory allocator.\n");
2224 		return (error);
2225 	}
2226 	kprintf("netmap: loaded module\n");
2227 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2228 			      "netmap");
2229 
2230 	netmap_init_bridges();
2231 	return (error);
2232 }
2233 
2234 
2235 /*
2236  * Module unloader.
2237  *
2238  * Free all the memory, and destroy the ``/dev/netmap`` device.
2239  */
2240 void
2241 netmap_fini(void)
2242 {
2243 	destroy_dev(netmap_dev);
2244 	netmap_mem_fini();
2245 	NMG_LOCK_DESTROY();
2246 	kprintf("netmap: unloaded module.\n");
2247 }
2248