xref: /dflybsd-src/sys/net/netmap/netmap.c (revision 572ff6f6e8b95055988f178b6ba12ce77bb5b3c2)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module supports memory mapped access to network devices,
29  * see netmap(4).
30  *
31  * The module uses a large memory pool allocated by the kernel
32  * and accessible as mmapped memory by multiple userspace threads/processes.
33  * The memory pool contains packet buffers and "netmap rings",
34  * i.e. user-accessible copies of the interface's queues.
35  *
36  * Access to the network card works like this:
37  * 1. a process/thread issues one or more open() on /dev/netmap, to create
38  *    select()able file descriptors on which events are reported.
39  * 2. on each descriptor, the process issues an ioctl() to identify
40  *    the interface that should report events to the file descriptor.
41  * 3. on each descriptor, the process issues an mmap() request to
42  *    map the shared memory region within the process' address space.
43  *    The list of interesting queues is indicated by a location in
44  *    the shared memory region.
45  * 4. using the functions in the netmap(4) userspace API, a process
46  *    can look up the occupation state of a queue, access memory buffers,
47  *    and retrieve received packets or enqueue packets to transmit.
48  * 5. using some ioctl()s the process can synchronize the userspace view
49  *    of the queue with the actual status in the kernel. This includes both
50  *    receiving the notification of new packets, and transmitting new
51  *    packets on the output interface.
52  * 6. select() or poll() can be used to wait for events on individual
53  *    transmit or receive queues (or all queues for a given interface).
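 *    (A usage sketch illustrating these steps follows this comment.)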
54  *
55 
56 		SYNCHRONIZATION (USER)
57 
58 The netmap rings and data structures may be shared among multiple
59 user threads or even independent processes.
60 Any synchronization among those threads/processes is delegated
61 to the threads themselves. Only one thread at a time can be in
62 a system call on the same netmap ring. The OS does not enforce
63 this and only guarantees against system crashes in case of
64 invalid usage.
65 
66 		LOCKING (INTERNAL)
67 
68 Within the kernel, access to the netmap rings is protected as follows:
69 
70 - a spinlock on each ring, to handle producer/consumer races on
71   RX rings attached to the host stack (against multiple host
72   threads writing from the host stack to the same ring),
73   and on 'destination' rings attached to a VALE switch
74   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports)
75   protecting multiple active senders for the same destination.
76 
77 - an atomic variable to guarantee that there is at most one
78   instance of *_*xsync() on the ring at any time.
79   For rings connected to user file
80   descriptors, an atomic_test_and_set() protects this, and the
81   lock on the ring is not actually used.
82   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
83   is also used to prevent multiple executions (the driver might indeed
84   already guarantee this).
85   For NIC TX rings connected to a VALE switch, the lock arbitrates
86   access to the queue (both when allocating buffers and when pushing
87   them out).
88 
89 - *xsync() should be protected against initializations of the card.
90   On FreeBSD most devices have the reset routine protected by
91   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
92   the RING protection on rx_reset(); this should be added.
93 
94   On linux there is an external lock on the tx path, which probably
95   also arbitrates access to the reset routine. XXX to be revised
96 
97 - a per-interface core_lock protecting access from the host stack
98   while interfaces may be detached from netmap mode.
99   XXX there should be no need for this lock if we detach the interfaces
100   only while they are down.
101 
102 
103 --- VALE SWITCH ---
104 
105 NMG_LOCK() serializes all modifications to switches and ports.
106 A switch cannot be deleted until all ports are gone.
107 
108 For each switch, an SX lock (RWlock on linux) protects
109 deletion of ports. When a port is configured or deleted, the
110 lock is acquired in exclusive mode (after holding NMG_LOCK).
111 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
112 The lock is held throughout the entire forwarding cycle,
113 during which the thread may incur a page fault.
114 Hence it is important that sleepable shared locks are used.
115 
116 On the rx ring, the per-port lock is grabbed initially to reserve
117 a number of slots in the ring, then the lock is released,
118 packets are copied from source to destination, and then
119 the lock is acquired again and the receive ring is updated.
120 (A similar thing is done on the tx ring for NIC and host stack
121 ports attached to the switch)
122 
123  */
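/*
 * Example (illustrative sketch only, not compiled as part of this module):
 * a minimal userspace transmit loop following steps 1-6 above, using the
 * public API from <net/netmap_user.h>.  Error handling is omitted, the
 * interface name "em0" is a placeholder, and build_frame() is a
 * hypothetical helper that fills a buffer and returns the frame length.
 * The ring fields used (cur/avail) match the API version in this file.
 *
 *	#include <fcntl.h>
 *	#include <string.h>
 *	#include <poll.h>
 *	#include <sys/ioctl.h>
 *	#include <sys/mman.h>
 *	#include <net/netmap.h>
 *	#include <net/netmap_user.h>
 *
 *	int fd = open("/dev/netmap", O_RDWR);		// step 1
 *	struct nmreq req;
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	req.nr_ringid = 0;				// all hardware rings
 *	ioctl(fd, NIOCREGIF, &req);			// step 2
 *	char *mem = mmap(NULL, req.nr_memsize,		// step 3
 *	    PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
 *	struct netmap_if *nifp = NETMAP_IF(mem, req.nr_offset);
 *	struct netmap_ring *ring = NETMAP_TXRING(nifp, 0);	// step 4
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLOUT };
 *	for (;;) {
 *		poll(&pfd, 1, -1);			// step 6: wait for room
 *		while (ring->avail > 0) {
 *			struct netmap_slot *slot = &ring->slot[ring->cur];
 *			char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *			slot->len = build_frame(buf);	// hypothetical helper
 *			ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
 *			ring->avail--;
 *		}
 *		ioctl(fd, NIOCTXSYNC, NULL);		// step 5: push to the NIC
 *	}
 */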
124 
125 /*
126  * OS-specific code that is used only within this file.
127  * Other OS-specific code that must be accessed by drivers
128  * is present in netmap_kern.h
129  */
130 
131 /* __FBSDID("$FreeBSD: head/sys/dev/netmap/netmap.c 257176 2013-10-26 17:58:36Z glebius $"); */
132 #include <sys/types.h>
133 #include <sys/errno.h>
134 #include <sys/param.h>	/* defines used in kernel.h */
135 #include <sys/kernel.h>	/* types used in module initialization */
136 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
137 #include <sys/devfs.h>
138 #include <sys/sockio.h>
139 #include <sys/socketvar.h>	/* struct socket */
140 #include <sys/malloc.h>
141 #include <sys/poll.h>
142 #include <sys/lock.h>
143 #include <sys/socket.h> /* sockaddrs */
144 #include <sys/event.h>
145 #include <sys/sysctl.h>
146 #include <net/if.h>
147 #include <net/if_var.h>
148 #include <net/bpf.h>		/* BIOCIMMEDIATE */
149 #include <sys/bus.h>	/* bus_dmamap_* */
150 #include <sys/endian.h>
151 #include <sys/refcount.h>
152 
153 /* reduce conditional code */
154 #define init_waitqueue_head(x)	// only needed in linux
155 
156 extern struct dev_ops netmap_cdevsw;
157 
158 /*
159  * common headers
160  */
161 #include <net/netmap.h>
162 #include "netmap_kern.h"
163 #include "netmap_mem2.h"
164 
165 #define selrecord(x, y) do { } while (0)	/* XXX porting in progress */
166 
167 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
168 
169 /*
170  * The following variables are used by the drivers and replicate
171  * fields in the global memory pool. They only refer to buffers
172  * used by physical interfaces.
173  */
174 u_int netmap_total_buffers;
175 u_int netmap_buf_size;
176 char *netmap_buffer_base;	/* also address of an invalid buffer */
177 
178 /* user-controlled variables */
179 int netmap_verbose;
180 
181 static int netmap_no_timestamp; /* don't timestamp on rxsync */
182 
183 SYSCTL_NODE(_net, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
184 SYSCTL_INT(_net_netmap, OID_AUTO, verbose,
185     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
186 SYSCTL_INT(_net_netmap, OID_AUTO, no_timestamp,
187     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
188 int netmap_mitigate = 1;
189 SYSCTL_INT(_net_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
190 int netmap_no_pendintr = 1;
191 SYSCTL_INT(_net_netmap, OID_AUTO, no_pendintr,
192     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
193 int netmap_txsync_retry = 2;
194 SYSCTL_INT(_net_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
195     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
196 
197 int netmap_flags = 0;	/* debug flags */
198 int netmap_fwd = 0;	/* force transparent mode */
199 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
200 
201 /*
202  * netmap_admode selects the netmap mode to use.
203  * Invalid values are reset to NETMAP_ADMODE_BEST
204  */
205 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
206 	NETMAP_ADMODE_NATIVE,	/* either native or none */
207 	NETMAP_ADMODE_GENERIC,	/* force generic */
208 	NETMAP_ADMODE_LAST };
209 #define NETMAP_ADMODE_NATIVE        1  /* Force native netmap adapter. */
210 #define NETMAP_ADMODE_GENERIC       2  /* Force generic netmap adapter. */
211 #define NETMAP_ADMODE_BEST          0  /* Priority to native netmap adapter. */
212 static int netmap_admode = NETMAP_ADMODE_BEST;
213 
214 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
215 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
216 
217 SYSCTL_INT(_net_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
218 SYSCTL_INT(_net_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
219 SYSCTL_INT(_net_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
220 SYSCTL_INT(_net_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
221 SYSCTL_INT(_net_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
222 SYSCTL_INT(_net_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
223 
224 NMG_LOCK_T	netmap_global_lock;
225 
226 
227 static void
228 nm_kr_get(struct netmap_kring *kr)
229 {
230 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
231 		tsleep(kr, 0, "NM_KR_GET", 4);
232 }
233 
234 
235 void
236 netmap_disable_ring(struct netmap_kring *kr)
237 {
238 	kr->nkr_stopped = 1;
239 	nm_kr_get(kr);
240 	lockmgr(&kr->q_lock, LK_EXCLUSIVE);
241 	lockmgr(&kr->q_lock, LK_RELEASE);
242 	nm_kr_put(kr);
243 }
244 
245 
246 static void
247 netmap_set_all_rings(struct ifnet *ifp, int stopped)
248 {
249 	struct netmap_adapter *na;
250 	int i;
251 
252 	if (!(ifp->if_capenable & IFCAP_NETMAP))
253 		return;
254 
255 	na = NA(ifp);
256 
257 	for (i = 0; i <= na->num_tx_rings; i++) {
258 		if (stopped)
259 			netmap_disable_ring(na->tx_rings + i);
260 		else
261 			na->tx_rings[i].nkr_stopped = 0;
262 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
263 			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
264 	}
265 
266 	for (i = 0; i <= na->num_rx_rings; i++) {
267 		if (stopped)
268 			netmap_disable_ring(na->rx_rings + i);
269 		else
270 			na->rx_rings[i].nkr_stopped = 0;
271 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
272 			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
273 	}
274 }
275 
276 
277 void
278 netmap_disable_all_rings(struct ifnet *ifp)
279 {
280 	netmap_set_all_rings(ifp, 1 /* stopped */);
281 }
282 
283 
284 void
285 netmap_enable_all_rings(struct ifnet *ifp)
286 {
287 	netmap_set_all_rings(ifp, 0 /* enabled */);
288 }
289 
290 
291 /*
292  * generic bound_checking function
293  */
294 u_int
295 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
296 {
297 	u_int oldv = *v;
298 	const char *op = NULL;
299 
300 	if (dflt < lo)
301 		dflt = lo;
302 	if (dflt > hi)
303 		dflt = hi;
304 	if (oldv < lo) {
305 		*v = dflt;
306 		op = "Bump";
307 	} else if (oldv > hi) {
308 		*v = hi;
309 		op = "Clamp";
310 	}
311 	if (op && msg)
312 		kprintf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
313 	return *v;
314 }
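/*
 * Example (sketch): typical use of nm_bound_var() when sanitizing a
 * tunable at attach time.  The variable, default and bounds below are
 * illustrative only, not taken from this file.
 *
 *	u_int ringsize = netmap_generic_ringsize;
 *	nm_bound_var(&ringsize, 1024, 64, 16384, "generic_ringsize");
 *	netmap_generic_ringsize = ringsize;
 */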
315 
316 
317 /*
318  * Packet-dump function; writes into dst, or into a static buffer if
319  * dst is NULL. The destination buffer must be at least 30+4*len bytes.
320  */
321 const char *
322 nm_dump_buf(char *p, int len, int lim, char *dst)
323 {
324 	static char _dst[8192];
325 	int i, j, i0;
326 	static char hex[] ="0123456789abcdef";
327 	char *o;	/* output position */
328 
329 #define P_HI(x)	hex[((x) & 0xf0)>>4]
330 #define P_LO(x)	hex[((x) & 0xf)]
331 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
332 	if (!dst)
333 		dst = _dst;
334 	if (lim <= 0 || lim > len)
335 		lim = len;
336 	o = dst;
337 	ksprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
338 	o += strlen(o);
339 	/* hexdump routine */
340 	for (i = 0; i < lim; ) {
341 		ksprintf(o, "%5d: ", i);
342 		o += strlen(o);
343 		memset(o, ' ', 48);
344 		i0 = i;
345 		for (j=0; j < 16 && i < lim; i++, j++) {
346 			o[j*3] = P_HI(p[i]);
347 			o[j*3+1] = P_LO(p[i]);
348 		}
349 		i = i0;
350 		for (j=0; j < 16 && i < lim; i++, j++)
351 			o[j + 48] = P_C(p[i]);
352 		o[j+48] = '\n';
353 		o += j+49;
354 	}
355 	*o = '\0';
356 #undef P_HI
357 #undef P_LO
358 #undef P_C
359 	return dst;
360 }
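/*
 * Example (sketch): dumping the first 128 bytes of a packet from debug
 * code.  Passing NULL as dst makes nm_dump_buf() use its static buffer;
 * 'buf' and 'len' stand for whatever data the caller holds.
 *
 *	D("%s", nm_dump_buf(buf, len, 128, NULL));
 */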
361 
362 
363 
364 /*
365  * Fetch configuration from the device, to cope with dynamic
366  * reconfigurations after loading the module.
367  */
368 int
369 netmap_update_config(struct netmap_adapter *na)
370 {
371 	struct ifnet *ifp = na->ifp;
372 	u_int txr, txd, rxr, rxd;
373 
374 	txr = txd = rxr = rxd = 0;
375 	if (na->nm_config) {
376 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
377 	} else {
378 		/* take whatever we had at init time */
379 		txr = na->num_tx_rings;
380 		txd = na->num_tx_desc;
381 		rxr = na->num_rx_rings;
382 		rxd = na->num_rx_desc;
383 	}
384 
385 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
386 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
387 		return 0; /* nothing changed */
388 	if (netmap_verbose || na->active_fds > 0) {
389 		D("stored config %s: txring %d x %d, rxring %d x %d",
390 			NM_IFPNAME(ifp),
391 			na->num_tx_rings, na->num_tx_desc,
392 			na->num_rx_rings, na->num_rx_desc);
393 		D("new config %s: txring %d x %d, rxring %d x %d",
394 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
395 	}
396 	if (na->active_fds == 0) {
397 		D("configuration changed (but fine)");
398 		na->num_tx_rings = txr;
399 		na->num_tx_desc = txd;
400 		na->num_rx_rings = rxr;
401 		na->num_rx_desc = rxd;
402 		return 0;
403 	}
404 	D("configuration changed while active, this is bad...");
405 	return 1;
406 }
407 
408 
409 int
410 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
411 {
412 	u_int i, len, ndesc;
413 	struct netmap_kring *kring;
414 
415 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
416 
417 	na->tx_rings = kmalloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
418 	if (na->tx_rings == NULL) {
419 		D("Cannot allocate krings");
420 		return ENOMEM;
421 	}
422 	na->rx_rings = na->tx_rings + ntx;
423 
424 	ndesc = na->num_tx_desc;
425 	for (i = 0; i < ntx; i++) { /* Transmit rings */
426 		kring = &na->tx_rings[i];
427 		bzero(kring, sizeof(*kring));
428 		kring->na = na;
429 		kring->nkr_num_slots = ndesc;
430 		/*
431 		 * IMPORTANT:
432 		 * Always keep one slot empty, so we can detect new
433 		 * transmissions by comparing cur and nr_hwcur (they are
434 		 * the same only if there are no new transmissions).
435 		 */
436 		kring->nr_hwavail = ndesc - 1;
437 		lockinit(&kring->q_lock, "nm_txq_lock", 0, LK_CANRECURSE);
438 		init_waitqueue_head(&kring->si);
439 	}
440 
441 	ndesc = na->num_rx_desc;
442 	for (i = 0; i < nrx; i++) { /* Receive rings */
443 		kring = &na->rx_rings[i];
444 		bzero(kring, sizeof(*kring));
445 		kring->na = na;
446 		kring->nkr_num_slots = ndesc;
447 		lockinit(&kring->q_lock, "nm_rxq_lock", 0, LK_CANRECURSE);
448 		init_waitqueue_head(&kring->si);
449 	}
450 	init_waitqueue_head(&na->tx_si);
451 	init_waitqueue_head(&na->rx_si);
452 
453 	na->tailroom = na->rx_rings + nrx;
454 
455 	return 0;
456 
457 }
458 
459 
460 void
461 netmap_krings_delete(struct netmap_adapter *na)
462 {
463 	int i;
464 
465 	for (i = 0; i < na->num_tx_rings + 1; i++) {
466 		lockuninit(&na->tx_rings[i].q_lock);
467 	}
468 	for (i = 0; i < na->num_rx_rings + 1; i++) {
469 		lockuninit(&na->rx_rings[i].q_lock);
470 	}
471 	kfree(na->tx_rings, M_DEVBUF);
472 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
473 }
474 
475 
476 static struct netmap_if*
477 netmap_if_new(const char *ifname, struct netmap_adapter *na)
478 {
479 	struct netmap_if *nifp;
480 
481 	if (netmap_update_config(na)) {
482 		/* configuration mismatch, report and fail */
483 		return NULL;
484 	}
485 
486 	if (na->active_fds)
487 		goto final;
488 
489 	if (na->nm_krings_create(na))
490 		goto cleanup;
491 
492 	if (netmap_mem_rings_create(na))
493 		goto cleanup;
494 
495 final:
496 
497 	nifp = netmap_mem_if_new(ifname, na);
498 	if (nifp == NULL)
499 		goto cleanup;
500 
501 	return (nifp);
502 
503 cleanup:
504 
505 	if (na->active_fds == 0) {
506 		netmap_mem_rings_delete(na);
507 		na->nm_krings_delete(na);
508 	}
509 
510 	return NULL;
511 }
512 
513 
514 /* grab a reference to the memory allocator, if we don't have one already.  The
515  * reference is taken from the netmap_adapter registered with the priv.
516  *
517  */
518 static int
519 netmap_get_memory_locked(struct netmap_priv_d* p)
520 {
521 	struct netmap_mem_d *nmd;
522 	int error = 0;
523 
524 	if (p->np_na == NULL) {
525 		if (!netmap_mmap_unreg)
526 			return ENODEV;
527 		/* for compatibility with older versions of the API
528  		 * we use the global allocator when no interface has been
529  		 * registered
530  		 */
531 		nmd = &nm_mem;
532 	} else {
533 		nmd = p->np_na->nm_mem;
534 	}
535 	if (p->np_mref == NULL) {
536 		error = netmap_mem_finalize(nmd);
537 		if (!error)
538 			p->np_mref = nmd;
539 	} else if (p->np_mref != nmd) {
540 		/* a virtual port has been registered, but previous
541  		 * syscalls already used the global allocator.
542  		 * We cannot continue
543  		 */
544 		error = ENODEV;
545 	}
546 	return error;
547 }
548 
549 
550 int
551 netmap_get_memory(struct netmap_priv_d* p)
552 {
553 	int error;
554 	NMG_LOCK();
555 	error = netmap_get_memory_locked(p);
556 	NMG_UNLOCK();
557 	return error;
558 }
559 
560 
561 static int
562 netmap_have_memory_locked(struct netmap_priv_d* p)
563 {
564 	return p->np_mref != NULL;
565 }
566 
567 
568 static void
569 netmap_drop_memory_locked(struct netmap_priv_d* p)
570 {
571 	if (p->np_mref) {
572 		netmap_mem_deref(p->np_mref);
573 		p->np_mref = NULL;
574 	}
575 }
576 
577 
578 /*
579  * File descriptor's private data destructor.
580  *
581  * Call nm_register(ifp,0) to stop netmap mode on the interface and
582  * revert to normal operation. We expect that np_na->ifp has not gone away.
583  * The second argument is the nifp to work on. In some cases it is
584  * not attached yet to the netmap_priv_d so we need to pass it as
585  * a separate argument.
586  */
587 /* call with NMG_LOCK held */
588 static void
589 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
590 {
591 	struct netmap_adapter *na = priv->np_na;
592 	struct ifnet *ifp = na->ifp;
593 
594 	NMG_LOCK_ASSERT();
595 	na->active_fds--;
596 	if (na->active_fds <= 0) {	/* last instance */
597 
598 		if (netmap_verbose)
599 			D("deleting last instance for %s", NM_IFPNAME(ifp));
600 		/*
601 		 * (TO CHECK) This function is only called
602 		 * when the last reference to this file descriptor goes
603 		 * away. This means we cannot have any pending poll()
604 		 * or interrupt routine operating on the structure.
605 		 * XXX The file may be closed in a thread while
606 		 * another thread is using it.
607 		 * Linux keeps the file opened until the last reference
608 		 * by any outstanding ioctl/poll or mmap is gone.
609 		 * FreeBSD does not track mmap()s (but we do) and
610 		 * wakes up any sleeping poll(). Need to check what
611 		 * happens if the close() occurs while a concurrent
612 		 * syscall is running.
613 		 */
614 		if (ifp)
615 			na->nm_register(na, 0); /* off, clear IFCAP_NETMAP */
616 		/* Wake up any sleeping threads. netmap_poll will
617 		 * then return POLLERR
618 		 * XXX The wake up now must happen during *_down(), when
619 		 * we order all activities to stop. -gl
620 		 */
621 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
622 		/* knlist_destroy(&na->tx_si.si_note); */
623 		/* knlist_destroy(&na->rx_si.si_note); */
624 
625 		/* delete rings and buffers */
626 		netmap_mem_rings_delete(na);
627 		na->nm_krings_delete(na);
628 	}
629 	/* delete the nifp */
630 	netmap_mem_if_delete(na, nifp);
631 }
632 
633 
634 /*
635  * returns 1 if this is the last instance and we can free priv
636  */
637 int
638 netmap_dtor_locked(struct netmap_priv_d *priv)
639 {
640 	struct netmap_adapter *na = priv->np_na;
641 
642 	/*
643 	 * np_refcount is the number of active mmaps on
644 	 * this file descriptor
645 	 */
646 	if (--priv->np_refcount > 0) {
647 		return 0;
648 	}
649 	if (!na) {
650 	    return 1; //XXX is it correct?
651 	}
652 	netmap_do_unregif(priv, priv->np_nifp);
653 	priv->np_nifp = NULL;
654 	netmap_drop_memory_locked(priv);
655 	if (priv->np_na) {
656 		netmap_adapter_put(na);
657 		priv->np_na = NULL;
658 	}
659 	return 1;
660 }
661 
662 
663 void
664 netmap_dtor(void *data)
665 {
666 	struct netmap_priv_d *priv = data;
667 	int last_instance;
668 
669 	NMG_LOCK();
670 	last_instance = netmap_dtor_locked(priv);
671 	NMG_UNLOCK();
672 	if (last_instance) {
673 		bzero(priv, sizeof(*priv));	/* for safety */
674 		kfree(priv, M_DEVBUF);
675 	}
676 }
677 
678 
679 
680 
681 /*
682  * Handlers for synchronization of the queues from/to the host.
683  * Netmap has two operating modes:
684  * - in the default mode, the rings connected to the host stack are
685  *   just another ring pair managed by userspace;
686  * - in transparent mode (XXX to be defined) incoming packets
687  *   (from the host or the NIC) are marked as NS_FORWARD upon
688  *   arrival, and the user application has a chance to reset the
689  *   flag for packets that should be dropped.
690  *   On the RXSYNC or poll(), packets in RX rings between
691  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
692  *   to the other side.
693  * The transfer NIC --> host is relatively easy, just encapsulate
694  * into mbufs and we are done. The host --> NIC side is slightly
695  * harder because there might not be room in the tx ring so it
696  * might take a while before releasing the buffer.
697  */
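/*
 * Example (sketch): how a userspace application can cooperate with the
 * transparent mode described above on a host RX ring, using the fd and
 * ring obtained as in the sketch near the top of this file.  Clearing
 * NS_FORWARD on a slot asks the kernel to drop that packet instead of
 * forwarding it at the next sync; inspect() is a hypothetical filter.
 *
 *	ring->flags |= NR_FORWARD;		// enable forwarding on this ring
 *	while (ring->avail > 0) {
 *		struct netmap_slot *slot = &ring->slot[ring->cur];
 *		char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *		if (!inspect(buf, slot->len))	// hypothetical filter
 *			slot->flags &= ~NS_FORWARD;	// drop this packet
 *		ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
 *		ring->avail--;
 *	}
 *	ioctl(fd, NIOCRXSYNC, NULL);		// surviving packets move on
 */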
698 
699 
700 /*
701  * pass a chain of buffers to the host stack as coming from 'dst'
702  */
703 static void
704 netmap_send_up(struct ifnet *dst, struct mbq *q)
705 {
706 	struct mbuf *m;
707 
708 	/* send packets up, outside the lock */
709 	while ((m = mbq_dequeue(q)) != NULL) {
710 		if (netmap_verbose & NM_VERB_HOST)
711 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
712 		NM_SEND_UP(dst, m);
713 	}
714 	mbq_destroy(q);
715 }
716 
717 
718 /*
719  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
720  * Run from hwcur to cur - reserved
721  */
722 static void
723 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
724 {
725 	/* Take packets from hwcur to cur-reserved and pass them up.
726 	 * In case of no buffers we give up. At the end of the loop,
727 	 * the queue is drained in all cases.
728 	 * XXX handle reserved
729 	 */
730 	u_int lim = kring->nkr_num_slots - 1;
731 	struct mbuf *m;
732 	u_int k = kring->ring->cur, n = kring->ring->reserved;
733 	struct netmap_adapter *na = kring->na;
734 
735 	/* compute the final position, ring->cur - ring->reserved */
736 	if (n > 0) {
737 		if (k < n)
738 			k += kring->nkr_num_slots;
739 		k += n;
740 	}
741 	for (n = kring->nr_hwcur; n != k;) {
742 		struct netmap_slot *slot = &kring->ring->slot[n];
743 
744 		n = nm_next(n, lim);
745 		if ((slot->flags & NS_FORWARD) == 0 && !force)
746 			continue;
747 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
748 			D("bad pkt at %d len %d", n, slot->len);
749 			continue;
750 		}
751 		slot->flags &= ~NS_FORWARD; // XXX needed ?
752 		/* XXX adapt to the case of a multisegment packet */
753 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
754 
755 		if (m == NULL)
756 			break;
757 		mbq_enqueue(q, m);
758 	}
759 }
760 
761 
762 /*
763  * The host ring has packets from nr_hwcur to (cur - reserved)
764  * to be sent down to the NIC.
765  * We need to use the queue lock on the source (host RX ring)
766  * to protect against netmap_transmit.
767  * If the user is well behaved we do not need to acquire locks
768  * on the destination(s),
769  * so we only need to make sure that there are no panics because
770  * of user errors.
771  * XXX verify
772  *
773  * We scan the tx rings, which have just been
774  * flushed so nr_hwcur == cur. Pushing packets down means
775  * incrementing cur and decrementing avail.
776  * XXX to be verified
777  */
778 static void
779 netmap_sw_to_nic(struct netmap_adapter *na)
780 {
781 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
782 	struct netmap_kring *k1 = &na->tx_rings[0];
783 	u_int i, howmany, src_lim, dst_lim;
784 
785 	/* XXX we should also check that the carrier is on */
786 	if (kring->nkr_stopped)
787 		return;
788 
789 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
790 
791 	if (kring->nkr_stopped)
792 		goto out;
793 
794 	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
795 
796 	src_lim = kring->nkr_num_slots - 1;
797 	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
798 		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
799 		dst_lim = k1->nkr_num_slots - 1;
800 		while (howmany > 0 && k1->ring->avail > 0) {
801 			struct netmap_slot *src, *dst, tmp;
802 			src = &kring->ring->slot[kring->nr_hwcur];
803 			dst = &k1->ring->slot[k1->ring->cur];
804 			tmp = *src;
805 			src->buf_idx = dst->buf_idx;
806 			src->flags = NS_BUF_CHANGED;
807 
808 			dst->buf_idx = tmp.buf_idx;
809 			dst->len = tmp.len;
810 			dst->flags = NS_BUF_CHANGED;
811 			ND("out len %d buf %d from %d to %d",
812 				dst->len, dst->buf_idx,
813 				kring->nr_hwcur, k1->ring->cur);
814 
815 			kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
816 			howmany--;
817 			kring->nr_hwavail--;
818 			k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
819 			k1->ring->avail--;
820 		}
821 		kring->ring->cur = kring->nr_hwcur; // XXX
822 		k1++; // XXX why?
823 	}
824 out:
825 	lockmgr(&kring->q_lock, LK_RELEASE);
826 }
827 
828 
829 /*
830  * netmap_txsync_to_host() passes packets up. We are called from a
831  * system call in user process context, and the only contention
832  * can be among multiple user threads erroneously calling
833  * this routine concurrently.
834  */
835 void
836 netmap_txsync_to_host(struct netmap_adapter *na)
837 {
838 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
839 	struct netmap_ring *ring = kring->ring;
840 	u_int k, lim = kring->nkr_num_slots - 1;
841 	struct mbq q;
842 	int error;
843 
844 	error = nm_kr_tryget(kring);
845 	if (error) {
846 		if (error == NM_KR_BUSY)
847 			D("ring %p busy (user error)", kring);
848 		return;
849 	}
850 	k = ring->cur;
851 	if (k > lim) {
852 		D("invalid ring index in stack TX kring %p", kring);
853 		netmap_ring_reinit(kring);
854 		nm_kr_put(kring);
855 		return;
856 	}
857 
858 	/* Take packets from hwcur to cur and pass them up.
859 	 * In case of no buffers we give up. At the end of the loop,
860 	 * the queue is drained in all cases.
861 	 */
862 	mbq_init(&q);
863 	netmap_grab_packets(kring, &q, 1);
864 	kring->nr_hwcur = k;
865 	kring->nr_hwavail = ring->avail = lim;
866 
867 	nm_kr_put(kring);
868 	netmap_send_up(na->ifp, &q);
869 }
870 
871 
872 /*
873  * rxsync backend for packets coming from the host stack.
874  * They have been put in the queue by netmap_transmit() so we
875  * need to protect access to the kring using a lock.
876  *
877  * This routine also does the selrecord if called from the poll handler
878  * (we know because td != NULL).
879  *
880  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
881  *     as an additional hidden argument.
882  */
883 static void
884 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
885 {
886 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
887 	struct netmap_ring *ring = kring->ring;
888 	u_int j, n, lim = kring->nkr_num_slots;
889 	u_int k = ring->cur, resvd = ring->reserved;
890 
891 	(void)pwait;	/* disable unused warnings */
892 
893 	if (kring->nkr_stopped) /* check a first time without lock */
894 		return;
895 
896 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
897 
898 	if (kring->nkr_stopped)  /* check again with lock held */
899 		goto unlock_out;
900 
901 	if (k >= lim) {
902 		netmap_ring_reinit(kring);
903 		goto unlock_out;
904 	}
905 	/* new packets are already set in nr_hwavail */
906 	/* skip past packets that userspace has released */
907 	j = kring->nr_hwcur;
908 	if (resvd > 0) {
909 		if (resvd + ring->avail >= lim + 1) {
910 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
911 			ring->reserved = resvd = 0; // XXX panic...
912 		}
913 		k = (k >= resvd) ? k - resvd : k + lim - resvd;
914 	}
915 	if (j != k) {
916 		n = k >= j ? k - j : k + lim - j;
917 		kring->nr_hwavail -= n;
918 		kring->nr_hwcur = k;
919 	}
920 	k = ring->avail = kring->nr_hwavail - resvd;
921 	if (k == 0 && td)
922 		selrecord(td, &kring->si);
923 	if (k && (netmap_verbose & NM_VERB_HOST))
924 		D("%d pkts from stack", k);
925 unlock_out:
926 
927 	lockmgr(&kring->q_lock, LK_RELEASE);
928 }
929 
930 
931 /* Get a netmap adapter for the port.
932  *
933  * If it is possible to satisfy the request, return 0
934  * with *na containing the netmap adapter found.
935  * Otherwise return an error code, with *na containing NULL.
936  *
937  * When the port is attached to a bridge, we always return
938  * EBUSY.
939  * Otherwise, if the port is already bound to a file descriptor,
940  * then we unconditionally return the existing adapter into *na.
941  * In all the other cases, we return (into *na) either native,
942  * generic or NULL, according to the following table:
943  *
944  *					native_support
945  * active_fds   dev.netmap.admode         YES     NO
946  * -------------------------------------------------------
947  *    >0              *                 NA(ifp) NA(ifp)
948  *
949  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
950  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
951  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
952  *
953  */
954 
955 int
956 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
957 {
958 	/* generic support */
959 	int i = netmap_admode;	/* Take a snapshot. */
960 	int error = 0;
961 	struct netmap_adapter *prev_na;
962 	struct netmap_generic_adapter *gna;
963 
964 	*na = NULL; /* default */
965 
966 	/* reset in case of invalid value */
967 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
968 		i = netmap_admode = NETMAP_ADMODE_BEST;
969 
970 	if (NETMAP_CAPABLE(ifp)) {
971 		/* If an adapter already exists, but is
972 		 * attached to a vale port, we report that the
973 		 * port is busy.
974 		 */
975 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
976 			return EBUSY;
977 
978 		/* If an adapter already exists, return it if
979 		 * there are active file descriptors or if
980 		 * netmap is not forced to use generic
981 		 * adapters.
982 		 */
983 		if (NA(ifp)->active_fds > 0 ||
984 				i != NETMAP_ADMODE_GENERIC) {
985 			*na = NA(ifp);
986 			return 0;
987 		}
988 	}
989 
990 	/* If there isn't native support and netmap is not allowed
991 	 * to use generic adapters, we cannot satisfy the request.
992 	 */
993 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
994 		return EINVAL;
995 
996 	/* Otherwise, create a generic adapter and return it,
997 	 * saving the previously used netmap adapter, if any.
998 	 *
999 	 * Note that here 'prev_na', if not NULL, MUST be a
1000 	 * native adapter, and CANNOT be a generic one. This is
1001 	 * true because generic adapters are created on demand, and
1002 	 * destroyed when not used anymore. Therefore, if the adapter
1003 	 * currently attached to an interface 'ifp' is generic, it
1004 	 * must be that
1005 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1006 	 * Consequently, if NA(ifp) is generic, we will enter one of
1007 	 * the branches above. This ensures that we never override
1008 	 * a generic adapter with another generic adapter.
1009 	 */
1010 	prev_na = NA(ifp);
1011 	error = generic_netmap_attach(ifp);
1012 	if (error)
1013 		return error;
1014 
1015 	*na = NA(ifp);
1016 	gna = (struct netmap_generic_adapter*)NA(ifp);
1017 	gna->prev = prev_na; /* save old na */
1018 	if (prev_na != NULL) {
1019 		ifunit(ifp->if_xname);	/* XXX huh? */
1020 		// XXX add a refcount ?
1021 		netmap_adapter_get(prev_na);
1022 	}
1023 	D("Created generic NA %p (prev %p)", gna, gna->prev);
1024 
1025 	return 0;
1026 }
1027 
1028 
1029 /*
1030  * MUST BE CALLED UNDER NMG_LOCK()
1031  *
1032  * get a refcounted reference to an interface.
1033  * This is always called in the execution of an ioctl().
1034  *
1035  * Return ENXIO if the interface does not exist, EINVAL if netmap
1036  * is not supported by the interface.
1037  * If successful, hold a reference.
1038  *
1039  * When the NIC is attached to a bridge, the reference is managed
1040  * at na->na_bdg_refcount using ADD/DROP_BDG_REF(), as is done for
1041  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1042  * is detached from the bridge and then ifp's refcount is dropped
1043  * (for virtual ports this is equivalent to destroying the ifp).
1044  *
1045  * This function uses if_rele() when we want to prevent the NIC from
1046  * being detached from the bridge in error handling.  But once refcount
1047  * is acquired by this function, it must be released using nm_if_rele().
1048  */
1049 int
1050 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1051 {
1052 	struct ifnet *ifp;
1053 	int error = 0;
1054 	struct netmap_adapter *ret;
1055 
1056 	*na = NULL;     /* default return value */
1057 
1058 	/* first try to see if this is a bridge port. */
1059 	NMG_LOCK_ASSERT();
1060 
1061 	error = netmap_get_bdg_na(nmr, na, create);
1062 	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1063 		return error;
1064 
1065 	ifp = ifunit(nmr->nr_name);
1066 	if (ifp == NULL) {
1067 	        return ENXIO;
1068 	}
1069 
1070 	error = netmap_get_hw_na(ifp, &ret);
1071 	if (error)
1072 		goto out;
1073 
1074 	if (ret != NULL) {
1075 		/* Users cannot use the NIC attached to a bridge directly */
1076 		if (NETMAP_OWNED_BY_KERN(ret)) {
1077 			error = EINVAL;
1078 			goto out;
1079 		}
1080 		error = 0;
1081 		*na = ret;
1082 		netmap_adapter_get(ret);
1083 	}
1084 out:
1085 #if 0
1086 	if_rele(ifp);
1087 #endif
1088 
1089 	return error;
1090 }
1091 
1092 
1093 /*
1094  * Error routine called when txsync/rxsync detects an error.
1095  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1096  * Return 1 on reinit.
1097  *
1098  * This routine is only called by the upper half of the kernel.
1099  * It only reads hwcur (which is changed only by the upper half, too)
1100  * and hwavail (which may be changed by the lower half, but only on
1101  * a tx ring and only to increase it, so any error will be recovered
1102  * on the next call). For the above, we don't strictly need to call
1103  * it under lock.
1104  */
1105 int
1106 netmap_ring_reinit(struct netmap_kring *kring)
1107 {
1108 	struct netmap_ring *ring = kring->ring;
1109 	u_int i, lim = kring->nkr_num_slots - 1;
1110 	int errors = 0;
1111 
1112 	// XXX KASSERT nm_kr_tryget
1113 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1114 	if (ring->cur > lim)
1115 		errors++;
1116 	for (i = 0; i <= lim; i++) {
1117 		u_int idx = ring->slot[i].buf_idx;
1118 		u_int len = ring->slot[i].len;
1119 		if (idx < 2 || idx >= netmap_total_buffers) {
1120 			if (!errors++)
1121 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1122 			ring->slot[i].buf_idx = 0;
1123 			ring->slot[i].len = 0;
1124 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1125 			ring->slot[i].len = 0;
1126 			if (!errors++)
1127 				D("bad len %d at slot %d idx %d",
1128 					len, i, idx);
1129 		}
1130 	}
1131 	if (errors) {
1132 		int pos = kring - kring->na->tx_rings;
1133 		int n = kring->na->num_tx_rings + 1;
1134 
1135 		RD(10, "total %d errors", errors);
1136 		errors++;
1137 		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1138 			NM_IFPNAME(kring->na->ifp),
1139 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1140 			ring->cur, kring->nr_hwcur,
1141 			ring->avail, kring->nr_hwavail);
1142 		ring->cur = kring->nr_hwcur;
1143 		ring->avail = kring->nr_hwavail;
1144 	}
1145 	return (errors ? 1 : 0);
1146 }
1147 
1148 
1149 /*
1150  * Set the ring ID. For devices with a single queue, a request
1151  * for all rings is the same as a single ring.
1152  */
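/*
 * Example (sketch): nr_ringid values as interpreted by the function
 * below, assuming a struct nmreq 'req' being prepared for NIOCREGIF:
 *
 *	req.nr_ringid = 0;			// bind all hardware rings
 *	req.nr_ringid = NETMAP_HW_RING | 2;	// bind hardware ring 2 only
 *	req.nr_ringid = NETMAP_SW_RING;		// bind the host stack ring
 *	req.nr_ringid |= NETMAP_NO_TX_POLL;	// don't push tx on poll()
 */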
1153 static int
1154 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1155 {
1156 	struct netmap_adapter *na = priv->np_na;
1157 	struct ifnet *ifp = na->ifp;
1158 	u_int i = ringid & NETMAP_RING_MASK;
1159 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1160 	u_int lim = na->num_rx_rings;
1161 
1162 	if (na->num_tx_rings > lim)
1163 		lim = na->num_tx_rings;
1164 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1165 		D("invalid ring id %d", i);
1166 		return (EINVAL);
1167 	}
1168 	priv->np_ringid = ringid;
1169 	if (ringid & NETMAP_SW_RING) {
1170 		priv->np_qfirst = NETMAP_SW_RING;
1171 		priv->np_qlast = 0;
1172 	} else if (ringid & NETMAP_HW_RING) {
1173 		priv->np_qfirst = i;
1174 		priv->np_qlast = i + 1;
1175 	} else {
1176 		priv->np_qfirst = 0;
1177 		priv->np_qlast = NETMAP_HW_RING ;
1178 	}
1179 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1180     if (netmap_verbose) {
1181 	if (ringid & NETMAP_SW_RING)
1182 		D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1183 	else if (ringid & NETMAP_HW_RING)
1184 		D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1185 			priv->np_qfirst);
1186 	else
1187 		D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1188     }
1189 	return 0;
1190 }
1191 
1192 
1193 /*
1194  * possibly move the interface to netmap-mode.
1195  * On success it returns a pointer to the netmap_if, otherwise NULL.
1196  * This must be called with NMG_LOCK held.
1197  */
1198 struct netmap_if *
1199 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1200 	uint16_t ringid, int *err)
1201 {
1202 	struct ifnet *ifp = na->ifp;
1203 	struct netmap_if *nifp = NULL;
1204 	int error, need_mem = 0;
1205 
1206 	NMG_LOCK_ASSERT();
1207 	/* ring configuration may have changed, fetch from the card */
1208 	netmap_update_config(na);
1209 	priv->np_na = na;     /* store the reference */
1210 	error = netmap_set_ringid(priv, ringid);
1211 	if (error)
1212 		goto out;
1213 	/* ensure allocators are ready */
1214 	need_mem = !netmap_have_memory_locked(priv);
1215 	if (need_mem) {
1216 		error = netmap_get_memory_locked(priv);
1217 		ND("get_memory returned %d", error);
1218 		if (error)
1219 			goto out;
1220 	}
1221 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1222 	if (nifp == NULL) { /* allocation failed */
1223 		/* we should drop the allocator, but only
1224 		 * if we were the ones who grabbed it
1225 		 */
1226 		error = ENOMEM;
1227 		goto out;
1228 	}
1229 	na->active_fds++;
1230 	if (ifp->if_capenable & IFCAP_NETMAP) {
1231 		/* was already set */
1232 	} else {
1233 		/* Otherwise set the card in netmap mode
1234 		 * and make it use the shared buffers.
1235 		 *
1236 		 * do not core lock because the race is harmless here,
1237 		 * there cannot be any traffic to netmap_transmit()
1238 		 */
1239 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1240 		ND("%p->na_lut == %p", na, na->na_lut);
1241 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1242 		error = na->nm_register(na, 1); /* mode on */
1243 		if (error) {
1244 			netmap_do_unregif(priv, nifp);
1245 			nifp = NULL;
1246 		}
1247 	}
1248 out:
1249 	*err = error;
1250 	if (error) {
1251 		priv->np_na = NULL;
1252 		if (need_mem)
1253 			netmap_drop_memory_locked(priv);
1254 	}
1255 	if (nifp != NULL) {
1256 		/*
1257 		 * advertise that the interface is ready by setting np_nifp.
1258 		 * The barrier is needed because readers (poll and *SYNC)
1259 		 * check for priv->np_nifp != NULL without locking
1260 		 */
1261 		wmb(); /* make sure previous writes are visible to all CPUs */
1262 		priv->np_nifp = nifp;
1263 	}
1264 	return nifp;
1265 }
1266 
1267 
1268 
1269 /*
1270  * ioctl(2) support for the "netmap" device.
1271  *
1272  * The following commands are accepted:
1273  * - NIOCGINFO
1274  * - SIOCGIFADDR	just for convenience
1275  * - NIOCREGIF
1276  * - NIOCUNREGIF
1277  * - NIOCTXSYNC
1278  * - NIOCRXSYNC
1279  *
1280  * Return 0 on success, errno otherwise.
1281  */
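/*
 * Example (sketch): querying ring geometry with NIOCGINFO from userspace
 * before registering.  "em0" is a placeholder interface name, 'fd' is an
 * open descriptor on /dev/netmap, and error handling is omitted.
 *
 *	struct nmreq req;
 *	memset(&req, 0, sizeof(req));
 *	req.nr_version = NETMAP_API;
 *	strlcpy(req.nr_name, "em0", sizeof(req.nr_name));
 *	ioctl(fd, NIOCGINFO, &req);
 *	// req.nr_tx_rings/nr_rx_rings and nr_tx_slots/nr_rx_slots now hold
 *	// the ring geometry, req.nr_memsize the size of the shared region.
 */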
1282 int
1283 netmap_ioctl(struct dev_ioctl_args *ap)
1284 {
1285 	struct netmap_priv_d *priv = NULL;
1286 	struct ifnet *ifp = NULL;
1287 	struct nmreq *nmr = (struct nmreq *) ap->a_data;
1288 	struct netmap_adapter *na = NULL;
1289 	int error;
1290 	u_int i, lim;
1291 	struct netmap_if *nifp;
1292 	struct netmap_kring *krings;
1293 	u_long cmd = ap->a_cmd;
1294 
1295 #if 0
1296 	error = devfs_get_cdevpriv((void **)&priv);
1297 	if (error) {
1298 		/* XXX ENOENT should be impossible, since the priv
1299 		 * is now created in the open */
1300 		return (error == ENOENT ? ENXIO : error);
1301 	}
1302 #endif
1303 
1304 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
1305 	switch (cmd) {
1306 	case NIOCGINFO:		/* return capabilities etc */
1307 		if (nmr->nr_version != NETMAP_API) {
1308 			D("API mismatch got %d have %d",
1309 				nmr->nr_version, NETMAP_API);
1310 			nmr->nr_version = NETMAP_API;
1311 			error = EINVAL;
1312 			break;
1313 		}
1314 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1315 			error = netmap_bdg_ctl(nmr, NULL);
1316 			break;
1317 		}
1318 
1319 		NMG_LOCK();
1320 		do {
1321 			/* memsize is always valid */
1322 			struct netmap_mem_d *nmd = &nm_mem;
1323 			u_int memflags;
1324 
1325 			if (nmr->nr_name[0] != '\0') {
1326 				/* get a refcount */
1327 				error = netmap_get_na(nmr, &na, 1 /* create */);
1328 				if (error)
1329 					break;
1330 				nmd = na->nm_mem; /* get memory allocator */
1331 			}
1332 
1333 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1334 			if (error)
1335 				break;
1336 			if (na == NULL) /* only memory info */
1337 				break;
1338 			nmr->nr_offset = 0;
1339 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1340 			netmap_update_config(na);
1341 			nmr->nr_rx_rings = na->num_rx_rings;
1342 			nmr->nr_tx_rings = na->num_tx_rings;
1343 			nmr->nr_rx_slots = na->num_rx_desc;
1344 			nmr->nr_tx_slots = na->num_tx_desc;
1345 			if (memflags & NETMAP_MEM_PRIVATE)
1346 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1347 			netmap_adapter_put(na);
1348 		} while (0);
1349 		NMG_UNLOCK();
1350 		break;
1351 
1352 	case NIOCREGIF:
1353 		if (nmr->nr_version != NETMAP_API) {
1354 			nmr->nr_version = NETMAP_API;
1355 			error = EINVAL;
1356 			break;
1357 		}
1358 		/* possibly attach/detach NIC and VALE switch */
1359 		i = nmr->nr_cmd;
1360 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
1361 			error = netmap_bdg_ctl(nmr, NULL);
1362 			break;
1363 		} else if (i != 0) {
1364 			D("nr_cmd must be 0 not %d", i);
1365 			error = EINVAL;
1366 			break;
1367 		}
1368 
1369 		/* protect access to priv from concurrent NIOCREGIF */
1370 		NMG_LOCK();
1371 		do {
1372 			u_int memflags;
1373 
1374 			if (priv->np_na != NULL) {	/* thread already registered */
1375 				error = netmap_set_ringid(priv, nmr->nr_ringid);
1376 				break;
1377 			}
1378 			/* find the interface and a reference */
1379 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1380 			if (error)
1381 				break;
1382 			ifp = na->ifp;
1383 			if (NETMAP_OWNED_BY_KERN(na)) {
1384 				netmap_adapter_put(na);
1385 				error = EBUSY;
1386 				break;
1387 			}
1388 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1389 			if (!nifp) {    /* reg. failed, release priv and ref */
1390 				netmap_adapter_put(na);
1391 				priv->np_nifp = NULL;
1392 				break;
1393 			}
1394 
1395 			/* return the offset of the netmap_if object */
1396 			nmr->nr_rx_rings = na->num_rx_rings;
1397 			nmr->nr_tx_rings = na->num_tx_rings;
1398 			nmr->nr_rx_slots = na->num_rx_desc;
1399 			nmr->nr_tx_slots = na->num_tx_desc;
1400 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1401 			if (error) {
1402 				netmap_adapter_put(na);
1403 				break;
1404 			}
1405 			if (memflags & NETMAP_MEM_PRIVATE) {
1406 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1407 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1408 			}
1409 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1410 		} while (0);
1411 		NMG_UNLOCK();
1412 		break;
1413 
1414 	case NIOCUNREGIF:
1415 		// XXX we have no data here ?
1416 		D("deprecated, data is %p", nmr);
1417 		error = EINVAL;
1418 		break;
1419 
1420 	case NIOCTXSYNC:
1421 	case NIOCRXSYNC:
1422 		nifp = priv->np_nifp;
1423 
1424 		if (nifp == NULL) {
1425 			error = ENXIO;
1426 			break;
1427 		}
1428 		rmb(); /* make sure following reads are not from cache */
1429 
1430 		na = priv->np_na;      /* we have a reference */
1431 
1432 		if (na == NULL) {
1433 			D("Internal error: nifp != NULL && na == NULL");
1434 			error = ENXIO;
1435 			break;
1436 		}
1437 
1438 		ifp = na->ifp;
1439 		if (ifp == NULL) {
1440 			RD(1, "the ifp is gone");
1441 			error = ENXIO;
1442 			break;
1443 		}
1444 
1445 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1446 			if (cmd == NIOCTXSYNC)
1447 				netmap_txsync_to_host(na);
1448 			else
1449 				netmap_rxsync_from_host(na, NULL, NULL);
1450 			break;
1451 		}
1452 		/* find the last ring to scan */
1453 		lim = priv->np_qlast;
1454 		if (lim == NETMAP_HW_RING)
1455 			lim = (cmd == NIOCTXSYNC) ?
1456 			    na->num_tx_rings : na->num_rx_rings;
1457 
1458 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1459 		for (i = priv->np_qfirst; i < lim; i++) {
1460 			struct netmap_kring *kring = krings + i;
1461 			if (nm_kr_tryget(kring)) {
1462 				error = EBUSY;
1463 				goto out;
1464 			}
1465 			if (cmd == NIOCTXSYNC) {
1466 				if (netmap_verbose & NM_VERB_TXSYNC)
1467 					D("pre txsync ring %d cur %d hwcur %d",
1468 					    i, kring->ring->cur,
1469 					    kring->nr_hwcur);
1470 				na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1471 				if (netmap_verbose & NM_VERB_TXSYNC)
1472 					D("post txsync ring %d cur %d hwcur %d",
1473 					    i, kring->ring->cur,
1474 					    kring->nr_hwcur);
1475 			} else {
1476 				na->nm_rxsync(na, i, NAF_FORCE_READ);
1477 				microtime(&na->rx_rings[i].ring->ts);
1478 			}
1479 			nm_kr_put(kring);
1480 		}
1481 
1482 		break;
1483 	case BIOCIMMEDIATE:
1484 	case BIOCGHDRCMPLT:
1485 	case BIOCSHDRCMPLT:
1486 	case BIOCSSEESENT:
1487 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1488 		break;
1489 
1490 	default:	/* allow device-specific ioctls */
1491 	    {
1492 		struct socket so;
1493 
1494 		bzero(&so, sizeof(so));
1495 		NMG_LOCK();
1496 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1497 		if (error) {
1498 			netmap_adapter_put(na);
1499 			NMG_UNLOCK();
1500 			break;
1501 		}
1502 		ifp = na->ifp;
1503 		// so->so_proto not null.
1504 		error = ifioctl(&so, cmd, ap->a_data, ap->a_cred);
1505 		netmap_adapter_put(na);
1506 		NMG_UNLOCK();
1507 		break;
1508 	    }
1509 	}
1510 out:
1511 
1512 	return (error);
1513 }
1514 
1515 static int
1516 netmap_kqfilter_event(struct knote *kn, long hint)
1517 {
1518 	return (0);
1519 }
1520 
1521 static void
1522 netmap_kqfilter_detach(struct knote *kn)
1523 {
1524 }
1525 
1526 static struct filterops netmap_kqfilter_ops = {
1527 	FILTEROP_ISFD, NULL, netmap_kqfilter_detach, netmap_kqfilter_event,
1528 };
1529 
1530 int
1531 netmap_kqfilter(struct dev_kqfilter_args *ap)
1532 {
1533 	struct knote *kn = ap->a_kn;
1534 
1535 	ap->a_result = 0;
1536 
1537 	switch (kn->kn_filter) {
1538 	case EVFILT_READ:
1539 	case EVFILT_WRITE:
1540 		kn->kn_fop = &netmap_kqfilter_ops;
1541 		break;
1542 	default:
1543 		ap->a_result = EOPNOTSUPP;
1544 		return (0);
1545 	}
1546 
1547 	return (0);
1548 }
1549 
1550 /*
1551  * select(2) and poll(2) handlers for the "netmap" device.
1552  *
1553  * Can be called for one or more queues.
1554  * Return the event mask corresponding to ready events.
1555  * If there are no ready events, do a selrecord on either individual
1556  * selinfo or on the global one.
1557  * Device-dependent parts (locking and sync of tx/rx rings)
1558  * are done through callbacks.
1559  *
1560  * On linux, arguments are really pwait, the poll table, and 'td' is struct file *
1561  * The first one is remapped to pwait as selrecord() uses the name as a
1562  * hidden argument.
1563  */
1564 static int
1565 netmap_poll(struct cdev *dev, int events, struct thread *td)
1566 {
1567 	struct netmap_priv_d *priv = NULL;
1568 	struct netmap_adapter *na;
1569 	struct ifnet *ifp;
1570 	struct netmap_kring *kring;
1571 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1572 	u_int lim_tx, lim_rx, host_forwarded = 0;
1573 	struct mbq q;
1574 	void *pwait = dev;	/* linux compatibility */
1575 
1576 	/*
1577 	 * In order to avoid nested locks, we need to "double check"
1578 	 * txsync and rxsync if we decide to do a selrecord().
1579 	 * retry_tx (and retry_rx, later) prevent looping forever.
1580 	 */
1581 	int retry_tx = 1;
1582 
1583 	(void)pwait;
1584 	mbq_init(&q);
1585 
1586 #if 0
1587 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1588 		return POLLERR;
1589 #endif
1590 
1591 	if (priv->np_nifp == NULL) {
1592 		D("No if registered");
1593 		return POLLERR;
1594 	}
1595 	rmb(); /* make sure following reads are not from cache */
1596 
1597 	na = priv->np_na;
1598 	ifp = na->ifp;
1599 	// check for deleted
1600 	if (ifp == NULL) {
1601 		RD(1, "the ifp is gone");
1602 		return POLLERR;
1603 	}
1604 
1605 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1606 		return POLLERR;
1607 
1608 	if (netmap_verbose & 0x8000)
1609 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1610 	want_tx = events & (POLLOUT | POLLWRNORM);
1611 	want_rx = events & (POLLIN | POLLRDNORM);
1612 
1613 	lim_tx = na->num_tx_rings;
1614 	lim_rx = na->num_rx_rings;
1615 
1616 	if (priv->np_qfirst == NETMAP_SW_RING) {
1617 		/* handle the host stack ring */
1618 		if (priv->np_txpoll || want_tx) {
1619 			/* push any packets up, then we are always ready */
1620 			netmap_txsync_to_host(na);
1621 			revents |= want_tx;
1622 		}
1623 		if (want_rx) {
1624 			kring = &na->rx_rings[lim_rx];
1625 			if (kring->ring->avail == 0)
1626 				netmap_rxsync_from_host(na, td, dev);
1627 			if (kring->ring->avail > 0) {
1628 				revents |= want_rx;
1629 			}
1630 		}
1631 		return (revents);
1632 	}
1633 
1634 	/*
1635 	 * If we are in transparent mode, check also the host rx ring
1636 	 * XXX Transparent mode at the moment requires binding all
1637  	 * rings to a single file descriptor.
1638 	 */
1639 	kring = &na->rx_rings[lim_rx];
1640 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1641 			&& want_rx
1642 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
1643 		if (kring->ring->avail == 0)
1644 			netmap_rxsync_from_host(na, td, dev);
1645 		if (kring->ring->avail > 0)
1646 			revents |= want_rx;
1647 	}
1648 
1649 	/*
1650 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1651 	 * the file descriptor is bound to all of them. If so, we sleep on
1652 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1653 	 * (FreeBSD only allows two selinfo's per file descriptor).
1654 	 * The interrupt routine in the driver wakes one or the other
1655 	 * (or both) depending on which clients are active.
1656 	 *
1657 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1658 	 * txsync() is called if we run out of buffers on POLLOUT, or
1659 	 * there are pending packets to send. The latter can be disabled
1660 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
1661 	 */
1662 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1663 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1664 
1665 	if (priv->np_qlast != NETMAP_HW_RING) {
1666 		lim_tx = lim_rx = priv->np_qlast;
1667 	}
1668 
1669 	/*
1670 	 * We start with a lock free round which is cheap if we have
1671 	 * slots available. If this fails, then lock and call the sync
1672 	 * routines.
1673 	 */
1674 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1675 		kring = &na->rx_rings[i];
1676 		if (kring->ring->avail > 0) {
1677 			revents |= want_rx;
1678 			want_rx = 0;	/* also breaks the loop */
1679 		}
1680 	}
1681 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1682 		kring = &na->tx_rings[i];
1683 		if (kring->ring->avail > 0) {
1684 			revents |= want_tx;
1685 			want_tx = 0;	/* also breaks the loop */
1686 		}
1687 	}
1688 
1689 	/*
1690 	 * If we need to push packets out (priv->np_txpoll) or want_tx is
1691 	 * still set, we do need to run the txsync calls (on all rings,
1692 	 * to avoid that the tx rings stall).
1693 	 * XXX should also check cur != hwcur on the tx rings.
1694 	 * Fortunately, normal tx mode has np_txpoll set.
1695 	 */
1696 	if (priv->np_txpoll || want_tx) {
1697 		/* If we really want to be woken up (want_tx),
1698 		 * do a selrecord, either on the global or on
1699 		 * the private structure.  Then issue the txsync
1700 		 * so there is no race in the selrecord/selwait
1701 		 */
1702 flush_tx:
1703 		for (i = priv->np_qfirst; i < lim_tx; i++) {
1704 			kring = &na->tx_rings[i];
1705 			/*
1706 			 * Skip this ring if want_tx == 0
1707 			 * (we have already done a successful sync on
1708 			 * a previous ring) AND kring->cur == kring->hwcur
1709 			 * (there are no pending transmissions for this ring).
1710 			 */
1711 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1712 				continue;
1713 			/* make sure only one user thread is doing this */
1714 			if (nm_kr_tryget(kring)) {
1715 				ND("ring %p busy is %d",
1716 				    kring, (int)kring->nr_busy);
1717 				revents |= POLLERR;
1718 				goto out;
1719 			}
1720 
1721 			if (netmap_verbose & NM_VERB_TXSYNC)
1722 				D("send %d on %s %d",
1723 					kring->ring->cur, NM_IFPNAME(ifp), i);
1724 			if (na->nm_txsync(na, i, 0))
1725 				revents |= POLLERR;
1726 
1727 			/* Check avail/call selrecord only if called with POLLOUT */
1728 			if (want_tx) {
1729 				if (kring->ring->avail > 0) {
1730 					/* stop at the first ring. We don't risk
1731 					 * starvation.
1732 					 */
1733 					revents |= want_tx;
1734 					want_tx = 0;
1735 				}
1736 			}
1737 			nm_kr_put(kring);
1738 		}
1739 		if (want_tx && retry_tx) {
1740 			selrecord(td, check_all_tx ?
1741 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
1742 			retry_tx = 0;
1743 			goto flush_tx;
1744 		}
1745 	}
1746 
1747 	/*
1748 	 * now if want_rx is still set we need to lock and rxsync.
1749 	 * Do it on all rings because otherwise we starve.
1750 	 */
1751 	if (want_rx) {
1752 		int retry_rx = 1;
1753 do_retry_rx:
1754 		for (i = priv->np_qfirst; i < lim_rx; i++) {
1755 			kring = &na->rx_rings[i];
1756 
1757 			if (nm_kr_tryget(kring)) {
1758 				revents |= POLLERR;
1759 				goto out;
1760 			}
1761 
1762 			/* XXX NR_FORWARD should only be read on
1763 			 * physical or NIC ports
1764 			 */
1765 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
1766 				ND(10, "forwarding some buffers up %d to %d",
1767 				    kring->nr_hwcur, kring->ring->cur);
1768 				netmap_grab_packets(kring, &q, netmap_fwd);
1769 			}
1770 
1771 			if (na->nm_rxsync(na, i, 0))
1772 				revents |= POLLERR;
1773 			if (netmap_no_timestamp == 0 ||
1774 					kring->ring->flags & NR_TIMESTAMP) {
1775 				microtime(&kring->ring->ts);
1776 			}
1777 
1778 			if (kring->ring->avail > 0) {
1779 				revents |= want_rx;
1780 				retry_rx = 0;
1781 			}
1782 			nm_kr_put(kring);
1783 		}
1784 		if (retry_rx) {
1785 			retry_rx = 0;
1786 			selrecord(td, check_all_rx ?
1787 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
1788 			goto do_retry_rx;
1789 		}
1790 	}
1791 
	/* Forward packets from the host (software) ring to the NIC rings.
	 * nr_hwavail is accessed without a lock, but netmap_transmit()
	 * can only increment it, so the operation is safe.
1795 	 */
1796 	kring = &na->rx_rings[lim_rx];
1797 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1798 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
1799 			 && kring->nr_hwavail > 0 && !host_forwarded) {
1800 		netmap_sw_to_nic(na);
1801 		host_forwarded = 1; /* prevent another pass */
1802 		want_rx = 0;
1803 		goto flush_tx;
1804 	}
1805 
1806 	if (q.head)
1807 		netmap_send_up(na->ifp, &q);
1808 
1809 out:
1810 
1811 	return (revents);
1812 }
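
/*
 * Illustrative sketch only (not part of this file's kernel API):
 * a typical userspace consumer blocks on the netmap file descriptor
 * with poll() and then drains the rx ring.  "fd", "ring" and
 * process_packet() are hypothetical; the ring pointer would come
 * from the netmap(4) userspace macros after NIOCREGIF and mmap().
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *
 *	for (;;) {
 *		if (poll(&pfd, 1, 1000) <= 0)		// wait for rx events
 *			continue;
 *		while (ring->avail > 0) {		// drain available slots
 *			struct netmap_slot *slot = &ring->slot[ring->cur];
 *			char *buf = NETMAP_BUF(ring, slot->buf_idx);
 *			process_packet(buf, slot->len);	// hypothetical consumer
 *			ring->cur = NETMAP_RING_NEXT(ring, ring->cur);
 *			ring->avail--;
 *		}
 *	}
 */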
1813 
1814 /*------- driver support routines ------*/
1815 
1816 static int netmap_hw_krings_create(struct netmap_adapter *);
1817 
1818 static int
1819 netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
1820 {
1821 	struct netmap_kring *kring;
1822 
1823 	if (tx == NR_TX) {
1824 		kring = na->tx_rings + n_ring;
1825 		KNOTE(&kring->si.ki_note, 0);
1826 		wakeup(&kring->si.ki_note);
1827 		if (flags & NAF_GLOBAL_NOTIFY)
1828 			wakeup(&na->tx_si.ki_note);
1829 	} else {
1830 		kring = na->rx_rings + n_ring;
1831 		KNOTE(&kring->si.ki_note, 0);
1832 		wakeup(&kring->si.ki_note);
1833 		if (flags & NAF_GLOBAL_NOTIFY)
1834 			wakeup(&na->rx_si.ki_note);
1835 	}
1836 	return 0;
1837 }
1838 
1839 
1840 // XXX check handling of failures
1841 int
1842 netmap_attach_common(struct netmap_adapter *na)
1843 {
1844 	struct ifnet *ifp = na->ifp;
1845 
1846 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
1847 		D("%s: invalid rings tx %d rx %d",
1848 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
1849 		return EINVAL;
1850 	}
1851 	WNA(ifp) = na;
1852 	NETMAP_SET_CAPABLE(ifp);
1853 	if (na->nm_krings_create == NULL) {
1854 		na->nm_krings_create = netmap_hw_krings_create;
1855 		na->nm_krings_delete = netmap_krings_delete;
1856 	}
1857 	if (na->nm_notify == NULL)
1858 		na->nm_notify = netmap_notify;
1859 	na->active_fds = 0;
1860 
1861 	if (na->nm_mem == NULL)
1862 		na->nm_mem = &nm_mem;
1863 	return 0;
1864 }
1865 
1866 
1867 void
1868 netmap_detach_common(struct netmap_adapter *na)
1869 {
1870 	if (na->ifp)
1871 		WNA(na->ifp) = NULL; /* XXX do we need this? */
1872 
1873 	if (na->tx_rings) { /* XXX should not happen */
1874 		D("freeing leftover tx_rings");
1875 		na->nm_krings_delete(na);
1876 	}
1877 	if (na->na_flags & NAF_MEM_OWNER)
1878 		netmap_mem_private_delete(na->nm_mem);
1879 	bzero(na, sizeof(*na));
1880 	kfree(na, M_DEVBUF);
1881 }
1882 
1883 
1884 /*
 * Initialize a ``netmap_adapter`` object created by a driver on attach.
 * We allocate a block of memory with room for a struct netmap_adapter
 * plus two sets of N+2 struct netmap_kring (where N is the number
 * of hardware rings):
 * krings	0..N-1	are for the hardware queues.
 * kring	N	is for the host stack queue.
 * kring	N+1	is only used for the selinfo for all queues.
 * Return 0 on success, ENOMEM otherwise.
 *
 * By default the receive and transmit adapter ring counts are both
 * initialized to num_queues.  na->num_tx_rings can be set for cards
 * with different tx/rx setups.
1897  */
1898 int
1899 netmap_attach(struct netmap_adapter *arg)
1900 {
1901 	struct netmap_hw_adapter *hwna = NULL;
1902 	// XXX when is arg == NULL ?
1903 	struct ifnet *ifp = arg ? arg->ifp : NULL;
1904 
1905 	if (arg == NULL || ifp == NULL)
1906 		goto fail;
1907 	hwna = kmalloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
1908 	if (hwna == NULL)
1909 		goto fail;
1910 	hwna->up = *arg;
1911 	if (netmap_attach_common(&hwna->up)) {
1912 		kfree(hwna, M_DEVBUF);
1913 		goto fail;
1914 	}
1915 	netmap_adapter_get(&hwna->up);
1916 
1917 	D("success for %s", NM_IFPNAME(ifp));
1918 	return 0;
1919 
1920 fail:
1921 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
1922 	netmap_detach(ifp);
1923 	return (hwna ? EINVAL : ENOMEM);
1924 }
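
/*
 * Illustrative sketch only (hypothetical driver code, not taken from
 * this file): a NIC driver typically fills a netmap_adapter on the
 * stack at attach time and registers it with netmap_attach().
 * "adapter", foo_netmap_txsync(), foo_netmap_rxsync() and
 * foo_netmap_reg() are made-up names.
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = ifp;
 *	na.num_tx_desc = adapter->num_tx_desc;
 *	na.num_rx_desc = adapter->num_rx_desc;
 *	na.num_tx_rings = na.num_rx_rings = adapter->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	na.nm_register = foo_netmap_reg;
 *	netmap_attach(&na);
 */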
1925 
1926 
1927 void
1928 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
1929 {
1930 	if (!na) {
1931 		return;
1932 	}
1933 
1934 	refcount_acquire(&na->na_refcount);
1935 }
1936 
1937 
1938 /* returns 1 iff the netmap_adapter is destroyed */
1939 int
1940 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
1941 {
1942 	if (!na)
1943 		return 1;
1944 
1945 	if (!refcount_release(&na->na_refcount))
1946 		return 0;
1947 
1948 	if (na->nm_dtor)
1949 		na->nm_dtor(na);
1950 
1951 	netmap_detach_common(na);
1952 
1953 	return 1;
1954 }
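
/*
 * Usage sketch (assumption about the intended discipline): every
 * netmap_adapter_get() must be balanced by a netmap_adapter_put();
 * the final put runs the destructor and frees the adapter.
 *
 *	netmap_adapter_get(na);		// take a reference
 *	... use na ...
 *	if (netmap_adapter_put(na))	// 1 means it was destroyed
 *		na = NULL;		// must not be used anymore
 */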
1955 
1956 
1957 int
1958 netmap_hw_krings_create(struct netmap_adapter *na)
1959 {
1960 	return netmap_krings_create(na,
1961 		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
1962 }
1963 
1964 
1965 
1966 /*
1967  * Free the allocated memory linked to the given ``netmap_adapter``
1968  * object.
1969  */
1970 void
1971 netmap_detach(struct ifnet *ifp)
1972 {
1973 	struct netmap_adapter *na = NA(ifp);
1974 
1975 	if (!na)
1976 		return;
1977 
1978 	NMG_LOCK();
1979 	netmap_disable_all_rings(ifp);
1980 	netmap_adapter_put(na);
1981 	na->ifp = NULL;
1982 	netmap_enable_all_rings(ifp);
1983 	NMG_UNLOCK();
1984 }
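
/*
 * Illustrative sketch only (hypothetical driver detach path):
 * drivers are expected to call netmap_detach() before tearing
 * down the interface, e.g.
 *
 *	static int
 *	foo_detach(device_t dev)	// made-up driver function
 *	{
 *		...
 *		netmap_detach(ifp);
 *		ether_ifdetach(ifp);
 *		...
 *	}
 */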
1985 
1986 
1987 /*
1988  * Intercept packets from the network stack and pass them
1989  * to netmap as incoming packets on the 'software' ring.
1990  * We rely on the OS to make sure that the ifp and na do not go
1991  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
 * In nm_register(), or whenever there is a reinitialization,
 * we make sure to acquire the core lock and per-ring locks
 * so that IFCAP_NETMAP is visible here.
1995  */
1996 int
1997 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
1998 {
1999 	struct netmap_adapter *na = NA(ifp);
2000 	struct netmap_kring *kring;
2001 	u_int i, len = MBUF_LEN(m);
2002 	u_int error = EBUSY, lim;
2003 	struct netmap_slot *slot;
2004 
2005 	// XXX [Linux] we do not need this lock
2006 	// if we follow the down/configure/up protocol -gl
2007 	// mtx_lock(&na->core_lock);
2008 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
2009 		/* interface not in netmap mode anymore */
2010 		error = ENXIO;
2011 		goto done;
2012 	}
2013 
2014 	kring = &na->rx_rings[na->num_rx_rings];
2015 	lim = kring->nkr_num_slots - 1;
2016 	if (netmap_verbose & NM_VERB_HOST)
2017 		D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
2018 			kring->nr_hwcur + kring->nr_hwavail, len);
2019 	// XXX reconsider long packets if we handle fragments
2020 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
2021 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
2022 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
2023 		goto done;
2024 	}
2025 	/* protect against other instances of netmap_transmit,
2026 	 * and userspace invocations of rxsync().
2027 	 */
2028 	// XXX [Linux] there can be no other instances of netmap_transmit
2029 	// on this same ring, but we still need this lock to protect
2030 	// concurrent access from netmap_sw_to_nic() -gl
2031 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
2032 	if (kring->nr_hwavail >= lim) {
2033 		if (netmap_verbose)
2034 			D("stack ring %s full\n", NM_IFPNAME(ifp));
2035 	} else {
2036 		/* compute the insert position */
2037 		i = nm_kr_rxpos(kring);
2038 		slot = &kring->ring->slot[i];
2039 		m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
2040 		slot->len = len;
2041 		slot->flags = kring->nkr_slot_flags;
2042 		kring->nr_hwavail++;
		if (netmap_verbose & NM_VERB_HOST)
2044 			D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
2045 		na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2046 		error = 0;
2047 	}
2048 	lockmgr(&kring->q_lock, LK_RELEASE);
2049 
2050 done:
2051 	// mtx_unlock(&na->core_lock);
2052 
	/* Release the mbuf in both the success and failure cases. As an
2054 	 * alternative, put the mbuf in a free list and free the list
2055 	 * only when really necessary.
2056 	 */
2057 	m_freem(m);
2058 
2059 	return (error);
2060 }
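
/*
 * Illustrative sketch only (hypothetical driver hook): with
 * IFCAP_NETMAP set, the driver's output path hands each mbuf
 * coming from the stack to netmap_transmit() and stops there;
 * as noted above, netmap_transmit() frees the mbuf in both the
 * success and the failure case.
 *
 *	static int
 *	foo_output_one(struct ifnet *ifp, struct mbuf *m)	// made-up
 *	{
 *		if (ifp->if_capenable & IFCAP_NETMAP)
 *			return (netmap_transmit(ifp, m));
 *		... regular hardware transmit path ...
 *	}
 */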
2061 
2062 
2063 /*
2064  * netmap_reset() is called by the driver routines when reinitializing
2065  * a ring. The driver is in charge of locking to protect the kring.
2066  * If native netmap mode is not set just return NULL.
2067  */
2068 struct netmap_slot *
2069 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2070 	u_int new_cur)
2071 {
2072 	struct netmap_kring *kring;
2073 	int new_hwofs, lim;
2074 
2075 	if (na == NULL) {
2076 		D("NULL na, should not happen");
2077 		return NULL;	/* no netmap support here */
2078 	}
2079 	if (!(na->ifp->if_capenable & IFCAP_NETMAP) || nma_is_generic(na)) {
2080 		ND("interface not in netmap mode");
2081 		return NULL;	/* nothing to reinitialize */
2082 	}
2083 
	/* XXX note: in the new scheme, we are not guaranteed to be
	 * under lock (e.g. when called on a device reset).
	 * In this case, we should set a flag and not trust the
	 * values too much. In practice: TODO
	 * - set a RESET flag somewhere in the kring
	 * - do the processing in a conservative way
	 * - let the *sync() routines fix things up at the end.
2091 	 */
2092 	if (tx == NR_TX) {
2093 		if (n >= na->num_tx_rings)
2094 			return NULL;
2095 		kring = na->tx_rings + n;
2096 		new_hwofs = kring->nr_hwcur - new_cur;
2097 	} else {
2098 		if (n >= na->num_rx_rings)
2099 			return NULL;
2100 		kring = na->rx_rings + n;
2101 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
2102 	}
2103 	lim = kring->nkr_num_slots - 1;
2104 	if (new_hwofs > lim)
2105 		new_hwofs -= lim + 1;
2106 
2107 	/* Always set the new offset value and realign the ring. */
2108 	D("%s hwofs %d -> %d, hwavail %d -> %d",
2109 		tx == NR_TX ? "TX" : "RX",
2110 		kring->nkr_hwofs, new_hwofs,
2111 		kring->nr_hwavail,
2112 		tx == NR_TX ? lim : kring->nr_hwavail);
2113 	kring->nkr_hwofs = new_hwofs;
2114 	if (tx == NR_TX)
2115 		kring->nr_hwavail = lim;
2116 	kring->nr_hwreserved = 0;
2117 
2118 	/*
	 * Wakeup on the individual and global selwait.
2120 	 * We do the wakeup here, but the ring is not yet reconfigured.
2121 	 * However, we are under lock so there are no races.
2122 	 */
2123 	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2124 	return kring->ring->slot;
2125 }
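
/*
 * Illustrative sketch only (hypothetical driver ring-init code):
 * the driver calls netmap_reset() while bringing a ring up; a
 * non-NULL return means netmap mode is active and the hardware
 * descriptors should point at the netmap buffers described by the
 * returned slot array, instead of freshly allocated mbufs.
 *
 *	struct netmap_slot *slot;
 *
 *	slot = netmap_reset(na, NR_RX, ring_nr, 0);	// ring_nr: made-up
 *	if (slot != NULL) {
 *		// netmap mode: program descriptor j from slot[...]
 *	} else {
 *		// regular mode: allocate mbuf clusters as usual
 *	}
 */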
2126 
2127 
2128 /*
2129  * Default functions to handle rx/tx interrupts from a physical device.
2130  * "work_done" is non-null on the RX path, NULL for the TX path.
2131  * "generic" is 0 when we are called by a device driver, and 1 when we
2132  * are called by the generic netmap adapter layer.
2133  * We rely on the OS to make sure that there is only one active
2134  * instance per queue, and that there is appropriate locking.
2135  *
2136  * If the card is not in netmap mode, simply return 0,
2137  * so that the caller proceeds with regular processing.
2138  *
2139  * We return 0 also when the card is in netmap mode but the current
2140  * netmap adapter is the generic one, because this function will be
2141  * called by the generic layer.
2142  *
2143  * If the card is connected to a netmap file descriptor,
2144  * do a selwakeup on the individual queue, plus one on the global one
2145  * if needed (multiqueue card _and_ there are multiqueue listeners),
2146  * and return 1.
2147  *
2148  * Finally, if called on rx from an interface connected to a switch,
 * call the proper forwarding routine and return 1.
2150  */
2151 int
2152 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2153 {
2154 	struct netmap_adapter *na = NA(ifp);
2155 	struct netmap_kring *kring;
2156 
2157 	q &= NETMAP_RING_MASK;
2158 
2159 	if (netmap_verbose) {
		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2161 	}
2162 
2163 	if (work_done) { /* RX path */
2164 		if (q >= na->num_rx_rings)
2165 			return 0;	// not a physical queue
2166 		kring = na->rx_rings + q;
2167 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2168 		na->nm_notify(na, q, NR_RX,
2169 			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2170 		*work_done = 1; /* do not fire napi again */
2171 	} else { /* TX path */
2172 		if (q >= na->num_tx_rings)
2173 			return 0;	// not a physical queue
2174 		kring = na->tx_rings + q;
2175 		na->nm_notify(na, q, NR_TX,
2176 			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2177 	}
2178 	return 1;
2179 }
2180 
/*
 * Entry point used by device drivers for RX/TX interrupts.
 * See the comment above netmap_common_irq() for the detailed
 * semantics.  Here we only return 0 (so that the caller proceeds
 * with regular processing) when the interface is not in netmap
 * mode, or when it asked to keep using regular interrupts
 * (NAF_SKIP_INTR).
 */
2200 int
2201 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2202 {
2203 	// XXX could we check NAF_NATIVE_ON ?
2204 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2205 		return 0;
2206 
2207 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2208 		ND("use regular interrupt");
2209 		return 0;
2210 	}
2211 
2212 	return netmap_common_irq(ifp, q, work_done);
2213 }
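
/*
 * Illustrative sketch only (hypothetical interrupt handler): the
 * driver calls netmap_rx_irq() first; a return value of 1 means
 * netmap handled the event and the regular rx processing must be
 * skipped.  foo_rxeof() and rxr are made-up names.
 *
 *	static void
 *	foo_rxeof(struct foo_rx_ring *rxr)
 *	{
 *		u_int work_done;
 *
 *		if (netmap_rx_irq(rxr->ifp, rxr->me, &work_done))
 *			return;		// handled in netmap mode
 *		... regular mbuf-based rx processing ...
 *	}
 */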
2214 
2215 
2216 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2217 
2218 
2219 /*
2220  * Module loader.
2221  *
2222  * Create the /dev/netmap device and initialize all global
2223  * variables.
2224  *
2225  * Return 0 on success, errno on failure.
2226  */
2227 int
2228 netmap_init(void)
2229 {
2230 	int error;
2231 
2232 	NMG_LOCK_INIT();
2233 
2234 	error = netmap_mem_init();
2235 	if (error != 0) {
2236 		kprintf("netmap: unable to initialize the memory allocator.\n");
2237 		return (error);
2238 	}
2239 	kprintf("netmap: loaded module\n");
2240 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2241 			      "netmap");
2242 
2243 	netmap_init_bridges();
2244 	return (error);
2245 }
2246 
2247 
2248 /*
2249  * Module unloader.
2250  *
2251  * Free all the memory, and destroy the ``/dev/netmap`` device.
2252  */
2253 void
2254 netmap_fini(void)
2255 {
2256 	destroy_dev(netmap_dev);
2257 	netmap_mem_fini();
2258 	NMG_LOCK_DESTROY();
2259 	kprintf("netmap: unloaded module.\n");
2260 }
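
/*
 * Illustrative sketch only (assumption, not shown in this file):
 * netmap_init()/netmap_fini() are meant to be driven by the module
 * event handler, roughly:
 *
 *	static int
 *	netmap_loader(struct module *m, int event, void *arg)	// hypothetical
 *	{
 *		int error = 0;
 *
 *		switch (event) {
 *		case MOD_LOAD:
 *			error = netmap_init();
 *			break;
 *		case MOD_UNLOAD:
 *			netmap_fini();
 *			break;
 *		default:
 *			error = EOPNOTSUPP;
 *			break;
 *		}
 *		return (error);
 *	}
 */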
2261