xref: /dflybsd-src/sys/net/netmap/netmap.c (revision ed9bd855a8b93a4d4c9df4cae9d83c7abb3b37a6)
1 /*
2  * Copyright (C) 2011-2013 Matteo Landi, Luigi Rizzo. All rights reserved.
3  *
4  * Redistribution and use in source and binary forms, with or without
5  * modification, are permitted provided that the following conditions
6  * are met:
7  *   1. Redistributions of source code must retain the above copyright
8  *      notice, this list of conditions and the following disclaimer.
9  *   2. Redistributions in binary form must reproduce the above copyright
10  *      notice, this list of conditions and the following disclaimer in the
11  *      documentation and/or other materials provided with the distribution.
12  *
13  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
14  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
15  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
16  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
17  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
18  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
19  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
20  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
21  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
22  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
23  * SUCH DAMAGE.
24  */
25 
26 
27 /*
28  * This module supports memory mapped access to network devices,
29  * see netmap(4).
30  *
31  * The module uses a large memory pool allocated by the kernel
32  * and accessible as mmapped memory by multiple userspace threads/processes.
33  * The memory pool contains packet buffers and "netmap rings",
34  * i.e. user-accessible copies of the interface's queues.
35  *
36  * Access to the network card works like this:
37  * 1. a process/thread issues one or more open() calls on /dev/netmap, to create
38  *    a select()able file descriptor on which events are reported.
39  * 2. on each descriptor, the process issues an ioctl() to identify
40  *    the interface that should report events to the file descriptor.
41  * 3. on each descriptor, the process issues an mmap() request to
42  *    map the shared memory region within the process' address space.
43  *    The list of interesting queues is indicated by a location in
44  *    the shared memory region.
45  * 4. using the functions in the netmap(4) userspace API, a process
46  *    can look up the occupancy state of a queue, access memory buffers,
47  *    and retrieve received packets or enqueue packets to transmit.
48  * 5. using some ioctl()s the process can synchronize the userspace view
49  *    of the queue with the actual status in the kernel. This includes both
50  *    receiving the notification of new packets, and transmitting new
51  *    packets on the output interface.
52  * 6. select() or poll() can be used to wait for events on individual
53  *    transmit or receive queues (or all queues for a given interface).
54  *
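 * A minimal userspace sketch of the sequence above (error handling is
 * omitted, the structures and macros come from <net/netmap.h> and
 * <net/netmap_user.h>, and the interface name is only an example):
 *
 *	struct nmreq nmr = { .nr_version = NETMAP_API };
 *	int fd = open("/dev/netmap", O_RDWR);			// step 1
 *	strncpy(nmr.nr_name, "em0", sizeof(nmr.nr_name));
 *	ioctl(fd, NIOCREGIF, &nmr);				// step 2
 *	char *mem = mmap(NULL, nmr.nr_memsize, PROT_READ | PROT_WRITE,
 *	    MAP_SHARED, fd, 0);					// step 3
 *	struct netmap_if *nifp = NETMAP_IF(mem, nmr.nr_offset);
 *	struct netmap_ring *rxr = NETMAP_RXRING(nifp, 0);	// step 4
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN };
 *	poll(&pfd, 1, -1);					// step 6
 *	ioctl(fd, NIOCRXSYNC, NULL);				// step 5 (explicit sync)
 *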
55 
56 		SYNCHRONIZATION (USER)
57 
58 The netmap rings and data structures may be shared among multiple
59 user threads or even independent processes.
60 Any synchronization among those threads/processes is delegated
61 to the threads themselves. Only one thread at a time can be in
62 a system call on the same netmap ring. The OS does not enforce
63 this and only guarantees against system crashes in case of
64 invalid usage.
65 
66 		LOCKING (INTERNAL)
67 
68 Within the kernel, access to the netmap rings is protected as follows:
69 
70 - a spinlock on each ring, to handle producer/consumer races on
71   RX rings attached to the host stack (against multiple host
72   threads writing from the host stack to the same ring),
73   and on 'destination' rings attached to a VALE switch
74  *   (i.e. RX rings in VALE ports, and TX rings in NIC/host ports),
75  *   protecting multiple active senders for the same destination.
76 
77 - an atomic variable to guarantee that there is at most one
78   instance of *_*xsync() on the ring at any time.
79   For rings connected to user file
80   descriptors, an atomic_test_and_set() protects this, and the
81   lock on the ring is not actually used.
82   For NIC RX rings connected to a VALE switch, an atomic_test_and_set()
83   is also used to prevent multiple executions (the driver might indeed
84   already guarantee this).
85   For NIC TX rings connected to a VALE switch, the lock arbitrates
86   access to the queue (both when allocating buffers and when pushing
87   them out).
88 
89 - *xsync() should be protected against initializations of the card.
90   On FreeBSD most devices have the reset routine protected by
91   a RING lock (ixgbe, igb, em) or core lock (re). lem is missing
92   the RING protection on rx_reset(); this should be added.
93 
94   On linux there is an external lock on the tx path, which probably
95   also arbitrates access to the reset routine. XXX to be revised
96 
97 - a per-interface core_lock protecting access from the host stack
98   while interfaces may be detached from netmap mode.
99   XXX there should be no need for this lock if we detach the interfaces
100   only while they are down.
101 
102 
103 --- VALE SWITCH ---
104 
105 NMG_LOCK() serializes all modifications to switches and ports.
106 A switch cannot be deleted until all ports are gone.
107 
108 For each switch, an SX lock (RWlock on linux) protects
109 deletion of ports. When configuring a new port or deleting an existing one, the
110 lock is acquired in exclusive mode (after holding NMG_LOCK).
111 When forwarding, the lock is acquired in shared mode (without NMG_LOCK).
112 The lock is held throughout the entire forwarding cycle,
113 during which the thread may incur a page fault.
114 Hence it is important that sleepable shared locks are used.
115 
116 On the rx ring, the per-port lock is grabbed initially to reserve
117 a number of slots in the ring, then the lock is released,
118 packets are copied from source to destination, and then
119 the lock is acquired again and the receive ring is updated.
120 (A similar thing is done on the tx ring for NIC and host stack
121 ports attached to the switch)
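In pseudo-code, a forwarding cycle towards one destination ring thus
looks roughly as follows (the names below are purely illustrative and
do not correspond to actual functions in this file):

	lock(dst_ring);
	first = reserve_slots(dst_ring, n);	/* claim n slots */
	unlock(dst_ring);
	copy_packets(src_ring, dst_ring, first, n); /* copy without the lock */
	lock(dst_ring);
	publish_slots(dst_ring, n);		/* update the receive ring */
	unlock(dst_ring);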
122 
123  */
124 
125 /*
126  * OS-specific code that is used only within this file.
127  * Other OS-specific code that must be accessed by drivers
128  * is present in netmap_kern.h
129  */
130 
131 /* __FBSDID("$FreeBSD: head/sys/dev/netmap/netmap.c 257176 2013-10-26 17:58:36Z glebius $"); */
132 #include <sys/types.h>
133 #include <sys/errno.h>
134 #include <sys/param.h>	/* defines used in kernel.h */
135 #include <sys/kernel.h>	/* types used in module initialization */
136 #include <sys/conf.h>	/* cdevsw struct, UID, GID */
137 #include <sys/sockio.h>
138 #include <sys/socketvar.h>	/* struct socket */
139 #include <sys/malloc.h>
140 #include <sys/kernel.h>
141 #include <sys/queue.h>
142 #include <sys/taskqueue.h>
143 #include <sys/poll.h>
144 #include <sys/lock.h>
145 #include <sys/socket.h> /* sockaddrs */
146 #include <sys/sysctl.h>
147 #include <sys/bus.h>	/* bus_dmamap_* */
148 #include <sys/endian.h>
149 #include <sys/refcount.h>
150 #include <net/if.h>
151 #include <net/if_var.h>
152 #include <net/bpf.h>		/* BIOCIMMEDIATE */
153 
154 /* reduce conditional code */
155 #define init_waitqueue_head(x)	// only needed in linux
156 
157 extern struct cdevsw netmap_cdevsw;
158 
159 /*
160  * common headers
161  */
162 #include <net/netmap.h>
163 #include "netmap_kern.h"
164 #include "netmap_mem2.h"
165 
166 
167 MALLOC_DEFINE(M_NETMAP, "netmap", "Network memory map");
168 
169 /*
170  * The following variables are used by the drivers and replicate
171  * fields in the global memory pool. They only refer to buffers
172  * used by physical interfaces.
173  */
174 u_int netmap_total_buffers;
175 u_int netmap_buf_size;
176 char *netmap_buffer_base;	/* also address of an invalid buffer */
177 
178 /* user-controlled variables */
179 int netmap_verbose;
180 
181 static int netmap_no_timestamp; /* don't timestamp on rxsync */
182 
183 SYSCTL_NODE(_dev, OID_AUTO, netmap, CTLFLAG_RW, 0, "Netmap args");
184 SYSCTL_INT(_dev_netmap, OID_AUTO, verbose,
185     CTLFLAG_RW, &netmap_verbose, 0, "Verbose mode");
186 SYSCTL_INT(_dev_netmap, OID_AUTO, no_timestamp,
187     CTLFLAG_RW, &netmap_no_timestamp, 0, "no_timestamp");
188 int netmap_mitigate = 1;
189 SYSCTL_INT(_dev_netmap, OID_AUTO, mitigate, CTLFLAG_RW, &netmap_mitigate, 0, "");
190 int netmap_no_pendintr = 1;
191 SYSCTL_INT(_dev_netmap, OID_AUTO, no_pendintr,
192     CTLFLAG_RW, &netmap_no_pendintr, 0, "Always look for new received packets.");
193 int netmap_txsync_retry = 2;
194 SYSCTL_INT(_dev_netmap, OID_AUTO, txsync_retry, CTLFLAG_RW,
195     &netmap_txsync_retry, 0 , "Number of txsync loops in bridge's flush.");
196 
197 int netmap_flags = 0;	/* debug flags */
198 int netmap_fwd = 0;	/* force transparent mode */
199 int netmap_mmap_unreg = 0; /* allow mmap of unregistered fds */
200 
201 /*
202  * netmap_admode selects the netmap mode to use.
203  * Invalid values are reset to NETMAP_ADMODE_BEST
204  */
205 enum { NETMAP_ADMODE_BEST = 0,	/* use native, fallback to generic */
206 	NETMAP_ADMODE_NATIVE,	/* either native or none */
207 	NETMAP_ADMODE_GENERIC,	/* force generic */
208 	NETMAP_ADMODE_LAST };
209 #define NETMAP_ADMODE_NATIVE        1  /* Force native netmap adapter. */
210 #define NETMAP_ADMODE_GENERIC       2  /* Force generic netmap adapter. */
211 #define NETMAP_ADMODE_BEST          0  /* Priority to native netmap adapter. */
212 static int netmap_admode = NETMAP_ADMODE_BEST;
213 
214 int netmap_generic_mit = 100*1000;   /* Generic mitigation interval in nanoseconds. */
215 int netmap_generic_ringsize = 1024;   /* Generic ringsize. */
216 
217 SYSCTL_INT(_dev_netmap, OID_AUTO, flags, CTLFLAG_RW, &netmap_flags, 0 , "");
218 SYSCTL_INT(_dev_netmap, OID_AUTO, fwd, CTLFLAG_RW, &netmap_fwd, 0 , "");
219 SYSCTL_INT(_dev_netmap, OID_AUTO, mmap_unreg, CTLFLAG_RW, &netmap_mmap_unreg, 0, "");
220 SYSCTL_INT(_dev_netmap, OID_AUTO, admode, CTLFLAG_RW, &netmap_admode, 0 , "");
221 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_mit, CTLFLAG_RW, &netmap_generic_mit, 0 , "");
222 SYSCTL_INT(_dev_netmap, OID_AUTO, generic_ringsize, CTLFLAG_RW, &netmap_generic_ringsize, 0 , "");
223 
224 NMG_LOCK_T	netmap_global_lock;
225 
226 
227 static void
228 nm_kr_get(struct netmap_kring *kr)
229 {
230 	while (NM_ATOMIC_TEST_AND_SET(&kr->nr_busy))
231 		tsleep(kr, 0, "NM_KR_GET", 4);
232 }
233 
234 
235 void
236 netmap_disable_ring(struct netmap_kring *kr)
237 {
238 	kr->nkr_stopped = 1;
239 	nm_kr_get(kr);
240 	lockmgr(&kr->q_lock, LK_EXCLUSIVE);
241 	lockmgr(&kr->q_lock, LK_RELEASE);
242 	nm_kr_put(kr);
243 }
244 
245 
246 static void
247 netmap_set_all_rings(struct ifnet *ifp, int stopped)
248 {
249 	struct netmap_adapter *na;
250 	int i;
251 
252 	if (!(ifp->if_capenable & IFCAP_NETMAP))
253 		return;
254 
255 	na = NA(ifp);
256 
257 	for (i = 0; i <= na->num_tx_rings; i++) {
258 		if (stopped)
259 			netmap_disable_ring(na->tx_rings + i);
260 		else
261 			na->tx_rings[i].nkr_stopped = 0;
262 		na->nm_notify(na, i, NR_TX, NAF_DISABLE_NOTIFY |
263 			(i == na->num_tx_rings ? NAF_GLOBAL_NOTIFY: 0));
264 	}
265 
266 	for (i = 0; i <= na->num_rx_rings; i++) {
267 		if (stopped)
268 			netmap_disable_ring(na->rx_rings + i);
269 		else
270 			na->rx_rings[i].nkr_stopped = 0;
271 		na->nm_notify(na, i, NR_RX, NAF_DISABLE_NOTIFY |
272 			(i == na->num_rx_rings ? NAF_GLOBAL_NOTIFY: 0));
273 	}
274 }
275 
276 
277 void
278 netmap_disable_all_rings(struct ifnet *ifp)
279 {
280 	netmap_set_all_rings(ifp, 1 /* stopped */);
281 }
282 
283 
284 void
285 netmap_enable_all_rings(struct ifnet *ifp)
286 {
287 	netmap_set_all_rings(ifp, 0 /* enabled */);
288 }
289 
290 
291 /*
292  * generic bounds-checking function
293  */
294 u_int
295 nm_bound_var(u_int *v, u_int dflt, u_int lo, u_int hi, const char *msg)
296 {
297 	u_int oldv = *v;
298 	const char *op = NULL;
299 
300 	if (dflt < lo)
301 		dflt = lo;
302 	if (dflt > hi)
303 		dflt = hi;
304 	if (oldv < lo) {
305 		*v = dflt;
306 		op = "Bump";
307 	} else if (oldv > hi) {
308 		*v = hi;
309 		op = "Clamp";
310 	}
311 	if (op && msg)
312 		kprintf("%s %s to %d (was %d)\n", op, msg, *v, oldv);
313 	return *v;
314 }
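/*
 * Example of a hypothetical caller: clamp a user-settable ring size into
 * a supported range, using 1024 as the default (the bounds below are
 * illustrative only):
 *
 *	u_int val = requested_size;
 *	nm_bound_var(&val, 1024, 32, 16384, "ring size");
 *
 * Values below 32 are bumped to the (already clamped) default, values
 * above 16384 are clamped, so the result always lies within [32, 16384];
 * the message is printed only when an adjustment occurs and msg != NULL.
 */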
315 
316 
317 /*
318  * packet-dump function, user-supplied or static buffer.
319  * The destination buffer must be at least 30+4*len bytes long.
320  */
321 const char *
322 nm_dump_buf(char *p, int len, int lim, char *dst)
323 {
324 	static char _dst[8192];
325 	int i, j, i0;
326 	static char hex[] ="0123456789abcdef";
327 	char *o;	/* output position */
328 
329 #define P_HI(x)	hex[((x) & 0xf0)>>4]
330 #define P_LO(x)	hex[((x) & 0xf)]
331 #define P_C(x)	((x) >= 0x20 && (x) <= 0x7e ? (x) : '.')
332 	if (!dst)
333 		dst = _dst;
334 	if (lim <= 0 || lim > len)
335 		lim = len;
336 	o = dst;
337 	ksprintf(o, "buf 0x%p len %d lim %d\n", p, len, lim);
338 	o += strlen(o);
339 	/* hexdump routine */
340 	for (i = 0; i < lim; ) {
341 		ksprintf(o, "%5d: ", i);
342 		o += strlen(o);
343 		memset(o, ' ', 48);
344 		i0 = i;
345 		for (j=0; j < 16 && i < lim; i++, j++) {
346 			o[j*3] = P_HI(p[i]);
347 			o[j*3+1] = P_LO(p[i]);
348 		}
349 		i = i0;
350 		for (j=0; j < 16 && i < lim; i++, j++)
351 			o[j + 48] = P_C(p[i]);
352 		o[j+48] = '\n';
353 		o += j+49;
354 	}
355 	*o = '\0';
356 #undef P_HI
357 #undef P_LO
358 #undef P_C
359 	return dst;
360 }
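/*
 * Example use (illustrative): dump at most the first 128 bytes of a
 * packet into the internal static buffer and log it:
 *
 *	char *buf = ...;	/* pointer to the packet data */
 *	D("%s", nm_dump_buf(buf, len, 128, NULL));
 *
 * Passing dst == NULL selects the static buffer, which is not reentrant;
 * concurrent callers should supply their own destination buffer.
 */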
361 
362 
363 
364 /*
365  * Fetch configuration from the device, to cope with dynamic
366  * reconfigurations after loading the module.
367  */
368 int
369 netmap_update_config(struct netmap_adapter *na)
370 {
371 	struct ifnet *ifp = na->ifp;
372 	u_int txr, txd, rxr, rxd;
373 
374 	txr = txd = rxr = rxd = 0;
375 	if (na->nm_config) {
376 		na->nm_config(na, &txr, &txd, &rxr, &rxd);
377 	} else {
378 		/* take whatever we had at init time */
379 		txr = na->num_tx_rings;
380 		txd = na->num_tx_desc;
381 		rxr = na->num_rx_rings;
382 		rxd = na->num_rx_desc;
383 	}
384 
385 	if (na->num_tx_rings == txr && na->num_tx_desc == txd &&
386 	    na->num_rx_rings == rxr && na->num_rx_desc == rxd)
387 		return 0; /* nothing changed */
388 	if (netmap_verbose || na->active_fds > 0) {
389 		D("stored config %s: txring %d x %d, rxring %d x %d",
390 			NM_IFPNAME(ifp),
391 			na->num_tx_rings, na->num_tx_desc,
392 			na->num_rx_rings, na->num_rx_desc);
393 		D("new config %s: txring %d x %d, rxring %d x %d",
394 			NM_IFPNAME(ifp), txr, txd, rxr, rxd);
395 	}
396 	if (na->active_fds == 0) {
397 		D("configuration changed (but fine)");
398 		na->num_tx_rings = txr;
399 		na->num_tx_desc = txd;
400 		na->num_rx_rings = rxr;
401 		na->num_rx_desc = rxd;
402 		return 0;
403 	}
404 	D("configuration changed while active, this is bad...");
405 	return 1;
406 }
407 
408 
409 int
410 netmap_krings_create(struct netmap_adapter *na, u_int ntx, u_int nrx, u_int tailroom)
411 {
412 	u_int i, len, ndesc;
413 	struct netmap_kring *kring;
414 
415 	len = (ntx + nrx) * sizeof(struct netmap_kring) + tailroom;
416 
417 	na->tx_rings = kmalloc((size_t)len, M_DEVBUF, M_NOWAIT | M_ZERO);
418 	if (na->tx_rings == NULL) {
419 		D("Cannot allocate krings");
420 		return ENOMEM;
421 	}
422 	na->rx_rings = na->tx_rings + ntx;
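	/*
	 * Layout of the single allocation (tailroom, if any, is extra
	 * space requested by the caller at the end of the block):
	 *
	 *	+------------------+------------------+----------+
	 *	| ntx netmap_kring | nrx netmap_kring | tailroom |
	 *	+------------------+------------------+----------+
	 *	^ na->tx_rings     ^ na->rx_rings     ^ na->tailroom
	 */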
423 
424 	ndesc = na->num_tx_desc;
425 	for (i = 0; i < ntx; i++) { /* Transmit rings */
426 		kring = &na->tx_rings[i];
427 		bzero(kring, sizeof(*kring));
428 		kring->na = na;
429 		kring->nkr_num_slots = ndesc;
430 		/*
431 		 * IMPORTANT:
432 		 * Always keep one slot empty, so we can detect new
433 		 * transmissions by comparing cur and nr_hwcur (they are
434 		 * the same only if there are no new transmissions).
435 		 */
436 		kring->nr_hwavail = ndesc - 1;
437 		lockinit(&kring->q_lock, "nm_txq_lock", 0, 0);
438 		init_waitqueue_head(&kring->si);
439 	}
440 
441 	ndesc = na->num_rx_desc;
442 	for (i = 0; i < nrx; i++) { /* Receive rings */
443 		kring = &na->rx_rings[i];
444 		bzero(kring, sizeof(*kring));
445 		kring->na = na;
446 		kring->nkr_num_slots = ndesc;
447 		lockinit(&kring->q_lock, "nm_rxq_lock", 0, 0);
448 		init_waitqueue_head(&kring->si);
449 	}
450 	init_waitqueue_head(&na->tx_si);
451 	init_waitqueue_head(&na->rx_si);
452 
453 	na->tailroom = na->rx_rings + nrx;
454 
455 	return 0;
456 
457 }
458 
459 
460 void
461 netmap_krings_delete(struct netmap_adapter *na)
462 {
463 	int i;
464 
465 	for (i = 0; i < na->num_tx_rings + 1; i++) {
466 		lockuninit(&na->tx_rings[i].q_lock);
467 	}
468 	for (i = 0; i < na->num_rx_rings + 1; i++) {
469 		lockuninit(&na->rx_rings[i].q_lock);
470 	}
471 	kfree(na->tx_rings, M_DEVBUF);
472 	na->tx_rings = na->rx_rings = na->tailroom = NULL;
473 }
474 
475 
476 static struct netmap_if*
477 netmap_if_new(const char *ifname, struct netmap_adapter *na)
478 {
479 	struct netmap_if *nifp;
480 
481 	if (netmap_update_config(na)) {
482 		/* configuration mismatch, report and fail */
483 		return NULL;
484 	}
485 
486 	if (na->active_fds)
487 		goto final;
488 
489 	if (na->nm_krings_create(na))
490 		goto cleanup;
491 
492 	if (netmap_mem_rings_create(na))
493 		goto cleanup;
494 
495 final:
496 
497 	nifp = netmap_mem_if_new(ifname, na);
498 	if (nifp == NULL)
499 		goto cleanup;
500 
501 	return (nifp);
502 
503 cleanup:
504 
505 	if (na->active_fds == 0) {
506 		netmap_mem_rings_delete(na);
507 		na->nm_krings_delete(na);
508 	}
509 
510 	return NULL;
511 }
512 
513 
514 /* grab a reference to the memory allocator, if we don't have one already.  The
515  * reference is taken from the netmap_adapter registered with the priv.
516  *
517  */
518 static int
519 netmap_get_memory_locked(struct netmap_priv_d* p)
520 {
521 	struct netmap_mem_d *nmd;
522 	int error = 0;
523 
524 	if (p->np_na == NULL) {
525 		if (!netmap_mmap_unreg)
526 			return ENODEV;
527 		/* for compatibility with older versions of the API
528  		 * we use the global allocator when no interface has been
529  		 * registered
530  		 */
531 		nmd = &nm_mem;
532 	} else {
533 		nmd = p->np_na->nm_mem;
534 	}
535 	if (p->np_mref == NULL) {
536 		error = netmap_mem_finalize(nmd);
537 		if (!error)
538 			p->np_mref = nmd;
539 	} else if (p->np_mref != nmd) {
540 		/* a virtual port has been registered, but previous
541  		 * syscalls already used the global allocator.
542  		 * We cannot continue
543  		 */
544 		error = ENODEV;
545 	}
546 	return error;
547 }
548 
549 
550 int
551 netmap_get_memory(struct netmap_priv_d* p)
552 {
553 	int error;
554 	NMG_LOCK();
555 	error = netmap_get_memory_locked(p);
556 	NMG_UNLOCK();
557 	return error;
558 }
559 
560 
561 static int
562 netmap_have_memory_locked(struct netmap_priv_d* p)
563 {
564 	return p->np_mref != NULL;
565 }
566 
567 
568 static void
569 netmap_drop_memory_locked(struct netmap_priv_d* p)
570 {
571 	if (p->np_mref) {
572 		netmap_mem_deref(p->np_mref);
573 		p->np_mref = NULL;
574 	}
575 }
576 
577 
578 /*
579  * File descriptor's private data destructor.
580  *
581  * Call nm_register(ifp,0) to stop netmap mode on the interface and
582  * revert to normal operation. We expect that np_na->ifp has not gone away.
583  * The second argument is the nifp to work on. In some cases it is
584  * not attached yet to the netmap_priv_d so we need to pass it as
585  * a separate argument.
586  */
587 /* call with NMG_LOCK held */
588 static void
589 netmap_do_unregif(struct netmap_priv_d *priv, struct netmap_if *nifp)
590 {
591 	struct netmap_adapter *na = priv->np_na;
592 	struct ifnet *ifp = na->ifp;
593 
594 	NMG_LOCK_ASSERT();
595 	na->active_fds--;
596 	if (na->active_fds <= 0) {	/* last instance */
597 
598 		if (netmap_verbose)
599 			D("deleting last instance for %s", NM_IFPNAME(ifp));
600 		/*
601 		 * (TO CHECK) This function is only called
602 		 * when the last reference to this file descriptor goes
603 		 * away. This means we cannot have any pending poll()
604 		 * or interrupt routine operating on the structure.
605 		 * XXX The file may be closed in a thread while
606 		 * another thread is using it.
607 		 * Linux keeps the file opened until the last reference
608 		 * by any outstanding ioctl/poll or mmap is gone.
609 		 * FreeBSD does not track mmap()s (but we do) and
610 		 * wakes up any sleeping poll(). Need to check what
611 		 * happens if the close() occurs while a concurrent
612 		 * syscall is running.
613 		 */
614 		if (ifp)
615 			na->nm_register(na, 0); /* off, clear IFCAP_NETMAP */
616 		/* Wake up any sleeping threads. netmap_poll will
617 		 * then return POLLERR
618 		 * XXX The wake up now must happen during *_down(), when
619 		 * we order all activities to stop. -gl
620 		 */
621 		/* XXX kqueue(9) needed; these will mirror knlist_init. */
622 		/* knlist_destroy(&na->tx_si.si_note); */
623 		/* knlist_destroy(&na->rx_si.si_note); */
624 
625 		/* delete rings and buffers */
626 		netmap_mem_rings_delete(na);
627 		na->nm_krings_delete(na);
628 	}
629 	/* delete the nifp */
630 	netmap_mem_if_delete(na, nifp);
631 }
632 
633 
634 /*
635  * returns 1 if this is the last instance and we can free priv
636  */
637 int
638 netmap_dtor_locked(struct netmap_priv_d *priv)
639 {
640 	struct netmap_adapter *na = priv->np_na;
641 
642 	/*
643 	 * np_refcount is the number of active mmaps on
644 	 * this file descriptor
645 	 */
646 	if (--priv->np_refcount > 0) {
647 		return 0;
648 	}
649 	if (!na) {
650 	    return 1; //XXX is it correct?
651 	}
652 	netmap_do_unregif(priv, priv->np_nifp);
653 	priv->np_nifp = NULL;
654 	netmap_drop_memory_locked(priv);
655 	if (priv->np_na) {
656 		netmap_adapter_put(na);
657 		priv->np_na = NULL;
658 	}
659 	return 1;
660 }
661 
662 
663 void
664 netmap_dtor(void *data)
665 {
666 	struct netmap_priv_d *priv = data;
667 	int last_instance;
668 
669 	NMG_LOCK();
670 	last_instance = netmap_dtor_locked(priv);
671 	NMG_UNLOCK();
672 	if (last_instance) {
673 		bzero(priv, sizeof(*priv));	/* for safety */
674 		kfree(priv, M_DEVBUF);
675 	}
676 }
677 
678 
679 
680 
681 /*
682  * Handlers for synchronization of the queues from/to the host.
683  * Netmap has two operating modes:
684  * - in the default mode, the rings connected to the host stack are
685  *   just another ring pair managed by userspace;
686  * - in transparent mode (XXX to be defined) incoming packets
687  *   (from the host or the NIC) are marked as NS_FORWARD upon
688  *   arrival, and the user application has a chance to reset the
689  *   flag for packets that should be dropped.
690  *   On the RXSYNC or poll(), packets in RX rings between
691  *   kring->nr_hwcur and ring->cur with NS_FORWARD still set are moved
692  *   to the other side.
693  * The transfer NIC --> host is relatively easy, just encapsulate
694  * into mbufs and we are done. The host --> NIC side is slightly
695  * harder because there might not be room in the tx ring so it
696  * might take a while before releasing the buffer.
697  */
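/*
 * From the application's point of view, transparent mode can be used
 * roughly as follows (a sketch only: should_drop() stands for whatever
 * policy the application implements, and the ring must have NR_FORWARD
 * set or netmap_fwd enabled):
 *
 *	u_int i = ring->cur;
 *	while (ring->avail > 0) {
 *		struct netmap_slot *slot = &ring->slot[i];
 *		if (should_drop(slot))
 *			slot->flags &= ~NS_FORWARD;	// not forwarded
 *		i = (i + 1 == ring->num_slots) ? 0 : i + 1;
 *		ring->avail--;
 *	}
 *	ring->cur = i;	// on the next rxsync/poll the marked packets move
 */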
698 
699 
700 /*
701  * pass a chain of buffers to the host stack as coming from 'dst'
702  */
703 static void
704 netmap_send_up(struct ifnet *dst, struct mbq *q)
705 {
706 	struct mbuf *m;
707 
708 	/* send packets up, outside the lock */
709 	while ((m = mbq_dequeue(q)) != NULL) {
710 		if (netmap_verbose & NM_VERB_HOST)
711 			D("sending up pkt %p size %d", m, MBUF_LEN(m));
712 		NM_SEND_UP(dst, m);
713 	}
714 	mbq_destroy(q);
715 }
716 
717 
718 /*
719  * put a copy of the buffers marked NS_FORWARD into an mbuf chain.
720  * Run from hwcur to cur - reserved
721  */
722 static void
723 netmap_grab_packets(struct netmap_kring *kring, struct mbq *q, int force)
724 {
725 	/* Take packets from hwcur to cur-reserved and pass them up.
726 	 * In case of no buffers we give up. At the end of the loop,
727 	 * the queue is drained in all cases.
728 	 * XXX handle reserved
729 	 */
730 	u_int lim = kring->nkr_num_slots - 1;
731 	struct mbuf *m;
732 	u_int k = kring->ring->cur, n = kring->ring->reserved;
733 	struct netmap_adapter *na = kring->na;
734 
735 	/* compute the final position, ring->cur - ring->reserved */
736 	if (n > 0) {
737 		if (k < n)
738 			k += kring->nkr_num_slots;
739 		k += n;
740 	}
741 	for (n = kring->nr_hwcur; n != k;) {
742 		struct netmap_slot *slot = &kring->ring->slot[n];
743 
744 		n = nm_next(n, lim);
745 		if ((slot->flags & NS_FORWARD) == 0 && !force)
746 			continue;
747 		if (slot->len < 14 || slot->len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) {
748 			D("bad pkt at %d len %d", n, slot->len);
749 			continue;
750 		}
751 		slot->flags &= ~NS_FORWARD; // XXX needed ?
752 		/* XXX adapt to the case of a multisegment packet */
753 		m = m_devget(BDG_NMB(na, slot), slot->len, 0, na->ifp, NULL);
754 
755 		if (m == NULL)
756 			break;
757 		mbq_enqueue(q, m);
758 	}
759 }
760 
761 
762 /*
763  * The host ring has packets from nr_hwcur to (cur - reserved)
764  * to be sent down to the NIC.
765  * We need to use the queue lock on the source (host RX ring)
766  * to protect against netmap_transmit.
767  * If the user is well behaved we do not need to acquire locks
768  * on the destination(s),
769  * so we only need to make sure that there are no panics because
770  * of user errors.
771  * XXX verify
772  *
773  * We scan the tx rings, which have just been
774  * flushed so nr_hwcur == cur. Pushing packets down means
775  * incrementing cur and decrementing avail.
776  * XXX to be verified
777  */
778 static void
779 netmap_sw_to_nic(struct netmap_adapter *na)
780 {
781 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
782 	struct netmap_kring *k1 = &na->tx_rings[0];
783 	u_int i, howmany, src_lim, dst_lim;
784 
785 	/* XXX we should also check that the carrier is on */
786 	if (kring->nkr_stopped)
787 		return;
788 
789 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
790 
791 	if (kring->nkr_stopped)
792 		goto out;
793 
794 	howmany = kring->nr_hwavail;	/* XXX otherwise cur - reserved - nr_hwcur */
795 
796 	src_lim = kring->nkr_num_slots - 1;
797 	for (i = 0; howmany > 0 && i < na->num_tx_rings; i++, k1++) {
798 		ND("%d packets left to ring %d (space %d)", howmany, i, k1->nr_hwavail);
799 		dst_lim = k1->nkr_num_slots - 1;
800 		while (howmany > 0 && k1->ring->avail > 0) {
801 			struct netmap_slot *src, *dst, tmp;
802 			src = &kring->ring->slot[kring->nr_hwcur];
803 			dst = &k1->ring->slot[k1->ring->cur];
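			/*
			 * Swap buffers instead of copying: the host slot
			 * inherits the (free) NIC buffer, while the NIC slot
			 * takes the host buffer carrying the packet. Both
			 * sides get NS_BUF_CHANGED so the new buffer
			 * addresses are picked up on the next sync.
			 */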
804 			tmp = *src;
805 			src->buf_idx = dst->buf_idx;
806 			src->flags = NS_BUF_CHANGED;
807 
808 			dst->buf_idx = tmp.buf_idx;
809 			dst->len = tmp.len;
810 			dst->flags = NS_BUF_CHANGED;
811 			ND("out len %d buf %d from %d to %d",
812 				dst->len, dst->buf_idx,
813 				kring->nr_hwcur, k1->ring->cur);
814 
815 			kring->nr_hwcur = nm_next(kring->nr_hwcur, src_lim);
816 			howmany--;
817 			kring->nr_hwavail--;
818 			k1->ring->cur = nm_next(k1->ring->cur, dst_lim);
819 			k1->ring->avail--;
820 		}
821 		kring->ring->cur = kring->nr_hwcur; // XXX
822 		k1++; // XXX why?
823 	}
824 out:
825 	lockmgr(&kring->q_lock, LK_RELEASE);
826 }
827 
828 
829 /*
830  * netmap_txsync_to_host() passes packets up. We are called from a
831  * system call in user process context, and the only contention
832  * can be among multiple user threads erroneously calling
833  * this routine concurrently.
834  */
835 void
836 netmap_txsync_to_host(struct netmap_adapter *na)
837 {
838 	struct netmap_kring *kring = &na->tx_rings[na->num_tx_rings];
839 	struct netmap_ring *ring = kring->ring;
840 	u_int k, lim = kring->nkr_num_slots - 1;
841 	struct mbq q;
842 	int error;
843 
844 	error = nm_kr_tryget(kring);
845 	if (error) {
846 		if (error == NM_KR_BUSY)
847 			D("ring %p busy (user error)", kring);
848 		return;
849 	}
850 	k = ring->cur;
851 	if (k > lim) {
852 		D("invalid ring index in stack TX kring %p", kring);
853 		netmap_ring_reinit(kring);
854 		nm_kr_put(kring);
855 		return;
856 	}
857 
858 	/* Take packets from hwcur to cur and pass them up.
859 	 * In case of no buffers we give up. At the end of the loop,
860 	 * the queue is drained in all cases.
861 	 */
862 	mbq_init(&q);
863 	netmap_grab_packets(kring, &q, 1);
864 	kring->nr_hwcur = k;
865 	kring->nr_hwavail = ring->avail = lim;
866 
867 	nm_kr_put(kring);
868 	netmap_send_up(na->ifp, &q);
869 }
870 
871 
872 /*
873  * rxsync backend for packets coming from the host stack.
874  * They have been put in the queue by netmap_transmit() so we
875  * need to protect access to the kring using a lock.
876  *
877  * This routine also does the selrecord if called from the poll handler
878  * (we know because td != NULL).
879  *
880  * NOTE: on linux, selrecord() is defined as a macro and uses pwait
881  *     as an additional hidden argument.
882  */
883 static void
884 netmap_rxsync_from_host(struct netmap_adapter *na, struct thread *td, void *pwait)
885 {
886 	struct netmap_kring *kring = &na->rx_rings[na->num_rx_rings];
887 	struct netmap_ring *ring = kring->ring;
888 	u_int j, n, lim = kring->nkr_num_slots;
889 	u_int k = ring->cur, resvd = ring->reserved;
890 
891 	(void)pwait;	/* disable unused warnings */
892 
893 	if (kring->nkr_stopped) /* check a first time without lock */
894 		return;
895 
896 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
897 
898 	if (kring->nkr_stopped)  /* check again with lock held */
899 		goto unlock_out;
900 
901 	if (k >= lim) {
902 		netmap_ring_reinit(kring);
903 		goto unlock_out;
904 	}
905 	/* new packets are already set in nr_hwavail */
906 	/* skip past packets that userspace has released */
907 	j = kring->nr_hwcur;
908 	if (resvd > 0) {
909 		if (resvd + ring->avail >= lim + 1) {
910 			D("XXX invalid reserve/avail %d %d", resvd, ring->avail);
911 			ring->reserved = resvd = 0; // XXX panic...
912 		}
913 		k = (k >= resvd) ? k - resvd : k + lim - resvd;
914 	}
915 	if (j != k) {
916 		n = k >= j ? k - j : k + lim - j;
917 		kring->nr_hwavail -= n;
918 		kring->nr_hwcur = k;
919 	}
920 	k = ring->avail = kring->nr_hwavail - resvd;
921 	if (k == 0 && td)
922 		selrecord(td, &kring->si);
923 	if (k && (netmap_verbose & NM_VERB_HOST))
924 		D("%d pkts from stack", k);
925 unlock_out:
926 
927 	lockmgr(&kring->q_lock, LK_RELEASE);
928 }
929 
930 
931 /* Get a netmap adapter for the port.
932  *
933  * If it is possible to satisfy the request, return 0
934  * with *na containing the netmap adapter found.
935  * Otherwise return an error code, with *na containing NULL.
936  *
937  * When the port is attached to a bridge, we always return
938  * EBUSY.
939  * Otherwise, if the port is already bound to a file descriptor,
940  * then we unconditionally return the existing adapter into *na.
941  * In all the other cases, we return (into *na) either native,
942  * generic or NULL, according to the following table:
943  *
944  *					native_support
945  * active_fds   dev.netmap.admode         YES     NO
946  * -------------------------------------------------------
947  *    >0              *                 NA(ifp) NA(ifp)
948  *
949  *     0        NETMAP_ADMODE_BEST      NATIVE  GENERIC
950  *     0        NETMAP_ADMODE_NATIVE    NATIVE   NULL
951  *     0        NETMAP_ADMODE_GENERIC   GENERIC GENERIC
952  *
953  */
954 
955 int
956 netmap_get_hw_na(struct ifnet *ifp, struct netmap_adapter **na)
957 {
958 	/* generic support */
959 	int i = netmap_admode;	/* Take a snapshot. */
960 	int error = 0;
961 	struct netmap_adapter *prev_na;
962 	struct netmap_generic_adapter *gna;
963 
964 	*na = NULL; /* default */
965 
966 	/* reset in case of invalid value */
967 	if (i < NETMAP_ADMODE_BEST || i >= NETMAP_ADMODE_LAST)
968 		i = netmap_admode = NETMAP_ADMODE_BEST;
969 
970 	if (NETMAP_CAPABLE(ifp)) {
971 		/* If an adapter already exists, but is
972 		 * attached to a vale port, we report that the
973 		 * port is busy.
974 		 */
975 		if (NETMAP_OWNED_BY_KERN(NA(ifp)))
976 			return EBUSY;
977 
978 		/* If an adapter already exists, return it if
979 		 * there are active file descriptors or if
980 		 * netmap is not forced to use generic
981 		 * adapters.
982 		 */
983 		if (NA(ifp)->active_fds > 0 ||
984 				i != NETMAP_ADMODE_GENERIC) {
985 			*na = NA(ifp);
986 			return 0;
987 		}
988 	}
989 
990 	/* If there isn't native support and netmap is not allowed
991 	 * to use generic adapters, we cannot satisfy the request.
992 	 */
993 	if (!NETMAP_CAPABLE(ifp) && i == NETMAP_ADMODE_NATIVE)
994 		return EINVAL;
995 
996 	/* Otherwise, create a generic adapter and return it,
997 	 * saving the previously used netmap adapter, if any.
998 	 *
999 	 * Note that here 'prev_na', if not NULL, MUST be a
1000 	 * native adapter, and CANNOT be a generic one. This is
1001 	 * true because generic adapters are created on demand, and
1002 	 * destroyed when not used anymore. Therefore, if the adapter
1003 	 * currently attached to an interface 'ifp' is generic, it
1004 	 * must be that
1005 	 * (NA(ifp)->active_fds > 0 || NETMAP_OWNED_BY_KERN(NA(ifp))).
1006 	 * Consequently, if NA(ifp) is generic, we will enter one of
1007 	 * the branches above. This ensures that we never override
1008 	 * a generic adapter with another generic adapter.
1009 	 */
1010 	prev_na = NA(ifp);
1011 	error = generic_netmap_attach(ifp);
1012 	if (error)
1013 		return error;
1014 
1015 	*na = NA(ifp);
1016 	gna = (struct netmap_generic_adapter*)NA(ifp);
1017 	gna->prev = prev_na; /* save old na */
1018 	if (prev_na != NULL) {
1019 		ifunit_ref(ifp->if_xname);
1020 		// XXX add a refcount ?
1021 		netmap_adapter_get(prev_na);
1022 	}
1023 	D("Created generic NA %p (prev %p)", gna, gna->prev);
1024 
1025 	return 0;
1026 }
1027 
1028 
1029 /*
1030  * MUST BE CALLED UNDER NMG_LOCK()
1031  *
1032  * get a refcounted reference to an interface.
1033  * This is always called in the execution of an ioctl().
1034  *
1035  * Return ENXIO if the interface does not exist, EINVAL if netmap
1036  * is not supported by the interface.
1037  * If successful, hold a reference.
1038  *
1039  * When the NIC is attached to a bridge, the reference is managed
1040  * at na->na_bdg_refcount using ADD/DROP_BDG_REF(), just as for
1041  * virtual ports.  Hence, on the final DROP_BDG_REF(), the NIC
1042  * is detached from the bridge and then ifp's refcount is dropped
1043  * (for virtual ports this is equivalent to destroying the ifp).
1044  *
1045  * This function uses if_rele() when we want to prevent the NIC from
1046  * being detached from the bridge in error handling.  But once refcount
1047  * is acquired by this function, it must be released using nm_if_rele().
1048  */
1049 int
1050 netmap_get_na(struct nmreq *nmr, struct netmap_adapter **na, int create)
1051 {
1052 	struct ifnet *ifp;
1053 	int error = 0;
1054 	struct netmap_adapter *ret;
1055 
1056 	*na = NULL;     /* default return value */
1057 
1058 	/* first try to see if this is a bridge port. */
1059 	NMG_LOCK_ASSERT();
1060 
1061 	error = netmap_get_bdg_na(nmr, na, create);
1062 	if (error || *na != NULL) /* valid match in netmap_get_bdg_na() */
1063 		return error;
1064 
1065 	ifp = ifunit_ref(nmr->nr_name);
1066 	if (ifp == NULL) {
1067 	        return ENXIO;
1068 	}
1069 
1070 	error = netmap_get_hw_na(ifp, &ret);
1071 	if (error)
1072 		goto out;
1073 
1074 	if (ret != NULL) {
1075 		/* Users cannot use the NIC attached to a bridge directly */
1076 		if (NETMAP_OWNED_BY_KERN(ret)) {
1077 			error = EINVAL;
1078 			goto out;
1079 		}
1080 		error = 0;
1081 		*na = ret;
1082 		netmap_adapter_get(ret);
1083 	}
1084 out:
1085 	if_rele(ifp);
1086 
1087 	return error;
1088 }
1089 
1090 
1091 /*
1092  * Error routine called when txsync/rxsync detects an error.
1093  * Can't do much more than resetting cur = hwcur, avail = hwavail.
1094  * Return 1 on reinit.
1095  *
1096  * This routine is only called by the upper half of the kernel.
1097  * It only reads hwcur (which is changed only by the upper half, too)
1098  * and hwavail (which may be changed by the lower half, but only on
1099  * a tx ring and only to increase it, so any error will be recovered
1100  * on the next call). For the above, we don't strictly need to call
1101  * it under lock.
1102  */
1103 int
1104 netmap_ring_reinit(struct netmap_kring *kring)
1105 {
1106 	struct netmap_ring *ring = kring->ring;
1107 	u_int i, lim = kring->nkr_num_slots - 1;
1108 	int errors = 0;
1109 
1110 	// XXX KASSERT nm_kr_tryget
1111 	RD(10, "called for %s", NM_IFPNAME(kring->na->ifp));
1112 	if (ring->cur > lim)
1113 		errors++;
1114 	for (i = 0; i <= lim; i++) {
1115 		u_int idx = ring->slot[i].buf_idx;
1116 		u_int len = ring->slot[i].len;
1117 		if (idx < 2 || idx >= netmap_total_buffers) {
1118 			if (!errors++)
1119 				D("bad buffer at slot %d idx %d len %d ", i, idx, len);
1120 			ring->slot[i].buf_idx = 0;
1121 			ring->slot[i].len = 0;
1122 		} else if (len > NETMAP_BDG_BUF_SIZE(kring->na->nm_mem)) {
1123 			ring->slot[i].len = 0;
1124 			if (!errors++)
1125 				D("bad len %d at slot %d idx %d",
1126 					len, i, idx);
1127 		}
1128 	}
1129 	if (errors) {
1130 		int pos = kring - kring->na->tx_rings;
1131 		int n = kring->na->num_tx_rings + 1;
1132 
1133 		RD(10, "total %d errors", errors);
1134 		errors++;
1135 		RD(10, "%s %s[%d] reinit, cur %d -> %d avail %d -> %d",
1136 			NM_IFPNAME(kring->na->ifp),
1137 			pos < n ?  "TX" : "RX", pos < n ? pos : pos - n,
1138 			ring->cur, kring->nr_hwcur,
1139 			ring->avail, kring->nr_hwavail);
1140 		ring->cur = kring->nr_hwcur;
1141 		ring->avail = kring->nr_hwavail;
1142 	}
1143 	return (errors ? 1 : 0);
1144 }
1145 
1146 
1147 /*
1148  * Set the ring ID. For devices with a single queue, a request
1149  * for all rings is the same as a single ring.
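 *
 * Examples of the encoding handled below (values are illustrative):
 *	ringid = 0;				bind all hardware rings
 *	ringid = NETMAP_HW_RING | 2;		bind hardware ring 2 only
 *	ringid = NETMAP_SW_RING;		bind the host stack ring
 *	ringid |= NETMAP_NO_TX_POLL;		do not txsync on poll()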
1150  */
1151 static int
1152 netmap_set_ringid(struct netmap_priv_d *priv, u_int ringid)
1153 {
1154 	struct netmap_adapter *na = priv->np_na;
1155 	struct ifnet *ifp = na->ifp;
1156 	u_int i = ringid & NETMAP_RING_MASK;
1157 	/* initially (np_qfirst == np_qlast) we don't want to lock */
1158 	u_int lim = na->num_rx_rings;
1159 
1160 	if (na->num_tx_rings > lim)
1161 		lim = na->num_tx_rings;
1162 	if ( (ringid & NETMAP_HW_RING) && i >= lim) {
1163 		D("invalid ring id %d", i);
1164 		return (EINVAL);
1165 	}
1166 	priv->np_ringid = ringid;
1167 	if (ringid & NETMAP_SW_RING) {
1168 		priv->np_qfirst = NETMAP_SW_RING;
1169 		priv->np_qlast = 0;
1170 	} else if (ringid & NETMAP_HW_RING) {
1171 		priv->np_qfirst = i;
1172 		priv->np_qlast = i + 1;
1173 	} else {
1174 		priv->np_qfirst = 0;
1175 		priv->np_qlast = NETMAP_HW_RING ;
1176 	}
1177 	priv->np_txpoll = (ringid & NETMAP_NO_TX_POLL) ? 0 : 1;
1178     if (netmap_verbose) {
1179 	if (ringid & NETMAP_SW_RING)
1180 		D("ringid %s set to SW RING", NM_IFPNAME(ifp));
1181 	else if (ringid & NETMAP_HW_RING)
1182 		D("ringid %s set to HW RING %d", NM_IFPNAME(ifp),
1183 			priv->np_qfirst);
1184 	else
1185 		D("ringid %s set to all %d HW RINGS", NM_IFPNAME(ifp), lim);
1186     }
1187 	return 0;
1188 }
1189 
1190 
1191 /*
1192  * possibly move the interface to netmap-mode.
1193  * On success it returns a pointer to the netmap_if, otherwise NULL.
1194  * This must be called with NMG_LOCK held.
1195  */
1196 struct netmap_if *
1197 netmap_do_regif(struct netmap_priv_d *priv, struct netmap_adapter *na,
1198 	uint16_t ringid, int *err)
1199 {
1200 	struct ifnet *ifp = na->ifp;
1201 	struct netmap_if *nifp = NULL;
1202 	int error, need_mem = 0;
1203 
1204 	NMG_LOCK_ASSERT();
1205 	/* ring configuration may have changed, fetch from the card */
1206 	netmap_update_config(na);
1207 	priv->np_na = na;     /* store the reference */
1208 	error = netmap_set_ringid(priv, ringid);
1209 	if (error)
1210 		goto out;
1211 	/* ensure allocators are ready */
1212 	need_mem = !netmap_have_memory_locked(priv);
1213 	if (need_mem) {
1214 		error = netmap_get_memory_locked(priv);
1215 		ND("get_memory returned %d", error);
1216 		if (error)
1217 			goto out;
1218 	}
1219 	nifp = netmap_if_new(NM_IFPNAME(ifp), na);
1220 	if (nifp == NULL) { /* allocation failed */
1221 		/* we should drop the allocator, but only
1222 		 * if we were the ones who grabbed it
1223 		 */
1224 		error = ENOMEM;
1225 		goto out;
1226 	}
1227 	na->active_fds++;
1228 	if (ifp->if_capenable & IFCAP_NETMAP) {
1229 		/* was already set */
1230 	} else {
1231 		/* Otherwise set the card in netmap mode
1232 		 * and make it use the shared buffers.
1233 		 *
1234 		 * do not core lock because the race is harmless here,
1235 		 * there cannot be any traffic to netmap_transmit()
1236 		 */
1237 		na->na_lut = na->nm_mem->pools[NETMAP_BUF_POOL].lut;
1238 		ND("%p->na_lut == %p", na, na->na_lut);
1239 		na->na_lut_objtotal = na->nm_mem->pools[NETMAP_BUF_POOL].objtotal;
1240 		error = na->nm_register(na, 1); /* mode on */
1241 		if (error) {
1242 			netmap_do_unregif(priv, nifp);
1243 			nifp = NULL;
1244 		}
1245 	}
1246 out:
1247 	*err = error;
1248 	if (error) {
1249 		priv->np_na = NULL;
1250 		if (need_mem)
1251 			netmap_drop_memory_locked(priv);
1252 	}
1253 	if (nifp != NULL) {
1254 		/*
1255 		 * advertise that the interface is ready by setting np_nifp.
1256 		 * The barrier is needed because readers (poll and *SYNC)
1257 		 * check for priv->np_nifp != NULL without locking
1258 		 */
1259 		wmb(); /* make sure previous writes are visible to all CPUs */
1260 		priv->np_nifp = nifp;
1261 	}
1262 	return nifp;
1263 }
1264 
1265 
1266 
1267 /*
1268  * ioctl(2) support for the "netmap" device.
1269  *
1270  * The following is the list of accepted commands:
1271  * - NIOCGINFO
1272  * - SIOCGIFADDR	just for convenience
1273  * - NIOCREGIF
1274  * - NIOCUNREGIF
1275  * - NIOCTXSYNC
1276  * - NIOCRXSYNC
1277  *
1278  * Return 0 on success, errno otherwise.
1279  */
1280 int
1281 netmap_ioctl(struct cdev *dev, u_long cmd, caddr_t data,
1282 	int fflag, struct thread *td)
1283 {
1284 	struct netmap_priv_d *priv = NULL;
1285 	struct ifnet *ifp = NULL;
1286 	struct nmreq *nmr = (struct nmreq *) data;
1287 	struct netmap_adapter *na = NULL;
1288 	int error;
1289 	u_int i, lim;
1290 	struct netmap_if *nifp;
1291 	struct netmap_kring *krings;
1292 
1293 	(void)dev;	/* UNUSED */
1294 	(void)fflag;	/* UNUSED */
1295 
1296 	CURVNET_SET(TD_TO_VNET(td));
1297 
1298 	error = devfs_get_cdevpriv((void **)&priv);
1299 	if (error) {
1300 		CURVNET_RESTORE();
1301 		/* XXX ENOENT should be impossible, since the priv
1302 		 * is now created in the open */
1303 		return (error == ENOENT ? ENXIO : error);
1304 	}
1305 
1306 	nmr->nr_name[sizeof(nmr->nr_name) - 1] = '\0';	/* truncate name */
1307 	switch (cmd) {
1308 	case NIOCGINFO:		/* return capabilities etc */
1309 		if (nmr->nr_version != NETMAP_API) {
1310 			D("API mismatch got %d have %d",
1311 				nmr->nr_version, NETMAP_API);
1312 			nmr->nr_version = NETMAP_API;
1313 			error = EINVAL;
1314 			break;
1315 		}
1316 		if (nmr->nr_cmd == NETMAP_BDG_LIST) {
1317 			error = netmap_bdg_ctl(nmr, NULL);
1318 			break;
1319 		}
1320 
1321 		NMG_LOCK();
1322 		do {
1323 			/* memsize is always valid */
1324 			struct netmap_mem_d *nmd = &nm_mem;
1325 			u_int memflags;
1326 
1327 			if (nmr->nr_name[0] != '\0') {
1328 				/* get a refcount */
1329 				error = netmap_get_na(nmr, &na, 1 /* create */);
1330 				if (error)
1331 					break;
1332 				nmd = na->nm_mem; /* get memory allocator */
1333 			}
1334 
1335 			error = netmap_mem_get_info(nmd, &nmr->nr_memsize, &memflags);
1336 			if (error)
1337 				break;
1338 			if (na == NULL) /* only memory info */
1339 				break;
1340 			nmr->nr_offset = 0;
1341 			nmr->nr_rx_slots = nmr->nr_tx_slots = 0;
1342 			netmap_update_config(na);
1343 			nmr->nr_rx_rings = na->num_rx_rings;
1344 			nmr->nr_tx_rings = na->num_tx_rings;
1345 			nmr->nr_rx_slots = na->num_rx_desc;
1346 			nmr->nr_tx_slots = na->num_tx_desc;
1347 			if (memflags & NETMAP_MEM_PRIVATE)
1348 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1349 			netmap_adapter_put(na);
1350 		} while (0);
1351 		NMG_UNLOCK();
1352 		break;
1353 
1354 	case NIOCREGIF:
1355 		if (nmr->nr_version != NETMAP_API) {
1356 			nmr->nr_version = NETMAP_API;
1357 			error = EINVAL;
1358 			break;
1359 		}
1360 		/* possibly attach/detach NIC and VALE switch */
1361 		i = nmr->nr_cmd;
1362 		if (i == NETMAP_BDG_ATTACH || i == NETMAP_BDG_DETACH) {
1363 			error = netmap_bdg_ctl(nmr, NULL);
1364 			break;
1365 		} else if (i != 0) {
1366 			D("nr_cmd must be 0 not %d", i);
1367 			error = EINVAL;
1368 			break;
1369 		}
1370 
1371 		/* protect access to priv from concurrent NIOCREGIF */
1372 		NMG_LOCK();
1373 		do {
1374 			u_int memflags;
1375 
1376 			if (priv->np_na != NULL) {	/* thread already registered */
1377 				error = netmap_set_ringid(priv, nmr->nr_ringid);
1378 				break;
1379 			}
1380 			/* find the interface and a reference */
1381 			error = netmap_get_na(nmr, &na, 1 /* create */); /* keep reference */
1382 			if (error)
1383 				break;
1384 			ifp = na->ifp;
1385 			if (NETMAP_OWNED_BY_KERN(na)) {
1386 				netmap_adapter_put(na);
1387 				error = EBUSY;
1388 				break;
1389 			}
1390 			nifp = netmap_do_regif(priv, na, nmr->nr_ringid, &error);
1391 			if (!nifp) {    /* reg. failed, release priv and ref */
1392 				netmap_adapter_put(na);
1393 				priv->np_nifp = NULL;
1394 				break;
1395 			}
1396 
1397 			/* return the offset of the netmap_if object */
1398 			nmr->nr_rx_rings = na->num_rx_rings;
1399 			nmr->nr_tx_rings = na->num_tx_rings;
1400 			nmr->nr_rx_slots = na->num_rx_desc;
1401 			nmr->nr_tx_slots = na->num_tx_desc;
1402 			error = netmap_mem_get_info(na->nm_mem, &nmr->nr_memsize, &memflags);
1403 			if (error) {
1404 				netmap_adapter_put(na);
1405 				break;
1406 			}
1407 			if (memflags & NETMAP_MEM_PRIVATE) {
1408 				nmr->nr_ringid |= NETMAP_PRIV_MEM;
1409 				*(uint32_t *)(uintptr_t)&nifp->ni_flags |= NI_PRIV_MEM;
1410 			}
1411 			nmr->nr_offset = netmap_mem_if_offset(na->nm_mem, nifp);
1412 		} while (0);
1413 		NMG_UNLOCK();
1414 		break;
1415 
1416 	case NIOCUNREGIF:
1417 		// XXX we have no data here ?
1418 		D("deprecated, data is %p", nmr);
1419 		error = EINVAL;
1420 		break;
1421 
1422 	case NIOCTXSYNC:
1423 	case NIOCRXSYNC:
1424 		nifp = priv->np_nifp;
1425 
1426 		if (nifp == NULL) {
1427 			error = ENXIO;
1428 			break;
1429 		}
1430 		rmb(); /* make sure following reads are not from cache */
1431 
1432 		na = priv->np_na;      /* we have a reference */
1433 
1434 		if (na == NULL) {
1435 			D("Internal error: nifp != NULL && na == NULL");
1436 			error = ENXIO;
1437 			break;
1438 		}
1439 
1440 		ifp = na->ifp;
1441 		if (ifp == NULL) {
1442 			RD(1, "the ifp is gone");
1443 			error = ENXIO;
1444 			break;
1445 		}
1446 
1447 		if (priv->np_qfirst == NETMAP_SW_RING) { /* host rings */
1448 			if (cmd == NIOCTXSYNC)
1449 				netmap_txsync_to_host(na);
1450 			else
1451 				netmap_rxsync_from_host(na, NULL, NULL);
1452 			break;
1453 		}
1454 		/* find the last ring to scan */
1455 		lim = priv->np_qlast;
1456 		if (lim == NETMAP_HW_RING)
1457 			lim = (cmd == NIOCTXSYNC) ?
1458 			    na->num_tx_rings : na->num_rx_rings;
1459 
1460 		krings = (cmd == NIOCTXSYNC) ? na->tx_rings : na->rx_rings;
1461 		for (i = priv->np_qfirst; i < lim; i++) {
1462 			struct netmap_kring *kring = krings + i;
1463 			if (nm_kr_tryget(kring)) {
1464 				error = EBUSY;
1465 				goto out;
1466 			}
1467 			if (cmd == NIOCTXSYNC) {
1468 				if (netmap_verbose & NM_VERB_TXSYNC)
1469 					D("pre txsync ring %d cur %d hwcur %d",
1470 					    i, kring->ring->cur,
1471 					    kring->nr_hwcur);
1472 				na->nm_txsync(na, i, NAF_FORCE_RECLAIM);
1473 				if (netmap_verbose & NM_VERB_TXSYNC)
1474 					D("post txsync ring %d cur %d hwcur %d",
1475 					    i, kring->ring->cur,
1476 					    kring->nr_hwcur);
1477 			} else {
1478 				na->nm_rxsync(na, i, NAF_FORCE_READ);
1479 				microtime(&na->rx_rings[i].ring->ts);
1480 			}
1481 			nm_kr_put(kring);
1482 		}
1483 
1484 		break;
1485 	case BIOCIMMEDIATE:
1486 	case BIOCGHDRCMPLT:
1487 	case BIOCSHDRCMPLT:
1488 	case BIOCSSEESENT:
1489 		D("ignore BIOCIMMEDIATE/BIOCGHDRCMPLT/BIOCSHDRCMPLT/BIOCSSEESENT");
1490 		break;
1491 
1492 	default:	/* allow device-specific ioctls */
1493 	    {
1494 		struct socket so;
1495 
1496 		bzero(&so, sizeof(so));
1497 		NMG_LOCK();
1498 		error = netmap_get_na(nmr, &na, 0 /* don't create */); /* keep reference */
1499 		if (error) {
1500 			netmap_adapter_put(na);
1501 			NMG_UNLOCK();
1502 			break;
1503 		}
1504 		ifp = na->ifp;
1505 		so.so_vnet = ifp->if_vnet;
1506 		// so->so_proto not null.
1507 		error = ifioctl(&so, cmd, data, td);
1508 		netmap_adapter_put(na);
1509 		NMG_UNLOCK();
1510 		break;
1511 	    }
1512 	}
1513 out:
1514 
1515 	CURVNET_RESTORE();
1516 	return (error);
1517 }
1518 
1519 
1520 /*
1521  * select(2) and poll(2) handlers for the "netmap" device.
1522  *
1523  * Can be called for one or more queues.
1524  * Return the event mask corresponding to ready events.
1525  * If there are no ready events, do a selrecord on either the individual
1526  * selinfo or the global one.
1527  * Device-dependent parts (locking and sync of tx/rx rings)
1528  * are done through callbacks.
1529  *
1530  * On linux, the arguments are really pwait, the poll table, and 'td' is a struct file *.
1531  * The first one is remapped to pwait as selrecord() uses the name as a
1532  * hidden argument.
1533  */
1534 int
1535 netmap_poll(struct cdev *dev, int events, struct thread *td)
1536 {
1537 	struct netmap_priv_d *priv = NULL;
1538 	struct netmap_adapter *na;
1539 	struct ifnet *ifp;
1540 	struct netmap_kring *kring;
1541 	u_int i, check_all_tx, check_all_rx, want_tx, want_rx, revents = 0;
1542 	u_int lim_tx, lim_rx, host_forwarded = 0;
1543 	struct mbq q;
1544 	void *pwait = dev;	/* linux compatibility */
1545 
1546 	/*
1547 	 * In order to avoid nested locks, we need to "double check"
1548 	 * txsync and rxsync if we decide to do a selrecord().
1549 	 * retry_tx (and retry_rx, later) prevent looping forever.
1550 	 */
1551 	int retry_tx = 1;
1552 
1553 	(void)pwait;
1554 	mbq_init(&q);
1555 
1556 	if (devfs_get_cdevpriv((void **)&priv) != 0 || priv == NULL)
1557 		return POLLERR;
1558 
1559 	if (priv->np_nifp == NULL) {
1560 		D("No if registered");
1561 		return POLLERR;
1562 	}
1563 	rmb(); /* make sure following reads are not from cache */
1564 
1565 	na = priv->np_na;
1566 	ifp = na->ifp;
1567 	// check for deleted
1568 	if (ifp == NULL) {
1569 		RD(1, "the ifp is gone");
1570 		return POLLERR;
1571 	}
1572 
1573 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0)
1574 		return POLLERR;
1575 
1576 	if (netmap_verbose & 0x8000)
1577 		D("device %s events 0x%x", NM_IFPNAME(ifp), events);
1578 	want_tx = events & (POLLOUT | POLLWRNORM);
1579 	want_rx = events & (POLLIN | POLLRDNORM);
1580 
1581 	lim_tx = na->num_tx_rings;
1582 	lim_rx = na->num_rx_rings;
1583 
1584 	if (priv->np_qfirst == NETMAP_SW_RING) {
1585 		/* handle the host stack ring */
1586 		if (priv->np_txpoll || want_tx) {
1587 			/* push any packets up, then we are always ready */
1588 			netmap_txsync_to_host(na);
1589 			revents |= want_tx;
1590 		}
1591 		if (want_rx) {
1592 			kring = &na->rx_rings[lim_rx];
1593 			if (kring->ring->avail == 0)
1594 				netmap_rxsync_from_host(na, td, dev);
1595 			if (kring->ring->avail > 0) {
1596 				revents |= want_rx;
1597 			}
1598 		}
1599 		return (revents);
1600 	}
1601 
1602 	/*
1603 	 * If we are in transparent mode, check also the host rx ring
1604 	 * XXX Transparent mode at the moment requires binding all
1605 	 * rings to a single file descriptor.
1606 	 */
1607 	kring = &na->rx_rings[lim_rx];
1608 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1609 			&& want_rx
1610 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD) ) {
1611 		if (kring->ring->avail == 0)
1612 			netmap_rxsync_from_host(na, td, dev);
1613 		if (kring->ring->avail > 0)
1614 			revents |= want_rx;
1615 	}
1616 
1617 	/*
1618 	 * check_all_{tx|rx} are set if the card has more than one queue AND
1619 	 * the file descriptor is bound to all of them. If so, we sleep on
1620 	 * the "global" selinfo, otherwise we sleep on individual selinfo
1621 	 * (FreeBSD only allows two selinfo's per file descriptor).
1622 	 * The interrupt routine in the driver wakes one or the other
1623 	 * (or both) depending on which clients are active.
1624 	 *
1625 	 * rxsync() is only called if we run out of buffers on a POLLIN.
1626 	 * txsync() is called if we run out of buffers on POLLOUT, or
1627 	 * there are pending packets to send. The latter can be disabled
1628 	 * passing NETMAP_NO_TX_POLL in the NIOCREG call.
1629 	 */
1630 	check_all_tx = (priv->np_qlast == NETMAP_HW_RING) && (lim_tx > 1);
1631 	check_all_rx = (priv->np_qlast == NETMAP_HW_RING) && (lim_rx > 1);
1632 
1633 	if (priv->np_qlast != NETMAP_HW_RING) {
1634 		lim_tx = lim_rx = priv->np_qlast;
1635 	}
1636 
1637 	/*
1638 	 * We start with a lock-free round, which is cheap if we have
1639 	 * slots available. If this fails, then lock and call the sync
1640 	 * routines.
1641 	 */
1642 	for (i = priv->np_qfirst; want_rx && i < lim_rx; i++) {
1643 		kring = &na->rx_rings[i];
1644 		if (kring->ring->avail > 0) {
1645 			revents |= want_rx;
1646 			want_rx = 0;	/* also breaks the loop */
1647 		}
1648 	}
1649 	for (i = priv->np_qfirst; want_tx && i < lim_tx; i++) {
1650 		kring = &na->tx_rings[i];
1651 		if (kring->ring->avail > 0) {
1652 			revents |= want_tx;
1653 			want_tx = 0;	/* also breaks the loop */
1654 		}
1655 	}
1656 
1657 	/*
1658 	 * If we need to push packets out (priv->np_txpoll) or want_tx is
1659 	 * still set, we do need to run the txsync calls (on all rings,
1660 	 * to avoid that the tx rings stall).
1661 	 * XXX should also check cur != hwcur on the tx rings.
1662 	 * Fortunately, normal tx mode has np_txpoll set.
1663 	 */
1664 	if (priv->np_txpoll || want_tx) {
1665 		/* If we really want to be woken up (want_tx),
1666 		 * do a selrecord, either on the global or on
1667 		 * the private structure.  Then issue the txsync
1668 		 * so there is no race in the selrecord/selwait
1669 		 */
1670 flush_tx:
1671 		for (i = priv->np_qfirst; i < lim_tx; i++) {
1672 			kring = &na->tx_rings[i];
1673 			/*
1674 			 * Skip this ring if want_tx == 0
1675 			 * (we have already done a successful sync on
1676 			 * a previous ring) AND kring->cur == kring->hwcur
1677 			 * (there are no pending transmissions for this ring).
1678 			 */
1679 			if (!want_tx && kring->ring->cur == kring->nr_hwcur)
1680 				continue;
1681 			/* make sure only one user thread is doing this */
1682 			if (nm_kr_tryget(kring)) {
1683 				ND("ring %p busy is %d",
1684 				    kring, (int)kring->nr_busy);
1685 				revents |= POLLERR;
1686 				goto out;
1687 			}
1688 
1689 			if (netmap_verbose & NM_VERB_TXSYNC)
1690 				D("send %d on %s %d",
1691 					kring->ring->cur, NM_IFPNAME(ifp), i);
1692 			if (na->nm_txsync(na, i, 0))
1693 				revents |= POLLERR;
1694 
1695 			/* Check avail/call selrecord only if called with POLLOUT */
1696 			if (want_tx) {
1697 				if (kring->ring->avail > 0) {
1698 					/* stop at the first ring. We don't risk
1699 					 * starvation.
1700 					 */
1701 					revents |= want_tx;
1702 					want_tx = 0;
1703 				}
1704 			}
1705 			nm_kr_put(kring);
1706 		}
1707 		if (want_tx && retry_tx) {
1708 			selrecord(td, check_all_tx ?
1709 			    &na->tx_si : &na->tx_rings[priv->np_qfirst].si);
1710 			retry_tx = 0;
1711 			goto flush_tx;
1712 		}
1713 	}
1714 
1715 	/*
1716 	 * now if want_rx is still set we need to lock and rxsync.
1717 	 * Do it on all rings because otherwise we starve.
1718 	 */
1719 	if (want_rx) {
1720 		int retry_rx = 1;
1721 do_retry_rx:
1722 		for (i = priv->np_qfirst; i < lim_rx; i++) {
1723 			kring = &na->rx_rings[i];
1724 
1725 			if (nm_kr_tryget(kring)) {
1726 				revents |= POLLERR;
1727 				goto out;
1728 			}
1729 
1730 			/* XXX NR_FORWARD should only be read on
1731 			 * physical or NIC ports
1732 			 */
1733 			if (netmap_fwd || kring->ring->flags & NR_FORWARD) {
1734 				ND(10, "forwarding some buffers up %d to %d",
1735 				    kring->nr_hwcur, kring->ring->cur);
1736 				netmap_grab_packets(kring, &q, netmap_fwd);
1737 			}
1738 
1739 			if (na->nm_rxsync(na, i, 0))
1740 				revents |= POLLERR;
1741 			if (netmap_no_timestamp == 0 ||
1742 					kring->ring->flags & NR_TIMESTAMP) {
1743 				microtime(&kring->ring->ts);
1744 			}
1745 
1746 			if (kring->ring->avail > 0) {
1747 				revents |= want_rx;
1748 				retry_rx = 0;
1749 			}
1750 			nm_kr_put(kring);
1751 		}
1752 		if (retry_rx) {
1753 			retry_rx = 0;
1754 			selrecord(td, check_all_rx ?
1755 			    &na->rx_si : &na->rx_rings[priv->np_qfirst].si);
1756 			goto do_retry_rx;
1757 		}
1758 	}
1759 
1760 	/* forward packets from the host stack ring to the NIC rings.
1761 	 * I am accessing nr_hwavail without lock, but netmap_transmit
1762 	 * can only increment it, so the operation is safe.
1763 	 */
1764 	kring = &na->rx_rings[lim_rx];
1765 	if ( (priv->np_qlast == NETMAP_HW_RING) // XXX check_all
1766 			&& (netmap_fwd || kring->ring->flags & NR_FORWARD)
1767 			 && kring->nr_hwavail > 0 && !host_forwarded) {
1768 		netmap_sw_to_nic(na);
1769 		host_forwarded = 1; /* prevent another pass */
1770 		want_rx = 0;
1771 		goto flush_tx;
1772 	}
1773 
1774 	if (q.head)
1775 		netmap_send_up(na->ifp, &q);
1776 
1777 out:
1778 
1779 	return (revents);
1780 }
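
/*
 * For reference, a minimal userspace sketch of the poll()-driven loop
 * serviced by the routine above (a hedged example: it assumes the file
 * descriptor has already been bound to an interface with NIOCREGIF and
 * the shared memory mapped with mmap(), which is not shown):
 *
 *	struct pollfd pfd = { .fd = fd, .events = POLLIN | POLLOUT };
 *
 *	for (;;) {
 *		poll(&pfd, 1, -1);
 *		if (pfd.revents & POLLIN) {
 *			// consume up to ring->avail received slots
 *			// starting at ring->cur, then advance ring->cur
 *			// and decrease ring->avail accordingly
 *		}
 *		if (pfd.revents & POLLOUT) {
 *			// fill free tx slots the same way; the next
 *			// poll() (np_txpoll is set by default) or an
 *			// explicit NIOCTXSYNC pushes them out
 *		}
 *	}
 */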
1781 
1782 /*------- driver support routines ------*/
1783 
1784 static int netmap_hw_krings_create(struct netmap_adapter *);
1785 
1786 static int
1787 netmap_notify(struct netmap_adapter *na, u_int n_ring, enum txrx tx, int flags)
1788 {
1789 	struct netmap_kring *kring;
1790 
1791 	if (tx == NR_TX) {
1792 		kring = na->tx_rings + n_ring;
1793 		selwakeuppri(&kring->si, PI_NET);
1794 		if (flags & NAF_GLOBAL_NOTIFY)
1795 			selwakeuppri(&na->tx_si, PI_NET);
1796 	} else {
1797 		kring = na->rx_rings + n_ring;
1798 		selwakeuppri(&kring->si, PI_NET);
1799 		if (flags & NAF_GLOBAL_NOTIFY)
1800 			selwakeuppri(&na->rx_si, PI_NET);
1801 	}
1802 	return 0;
1803 }
1804 
1805 
1806 // XXX check handling of failures
1807 int
1808 netmap_attach_common(struct netmap_adapter *na)
1809 {
1810 	struct ifnet *ifp = na->ifp;
1811 
1812 	if (na->num_tx_rings == 0 || na->num_rx_rings == 0) {
1813 		D("%s: invalid rings tx %d rx %d",
1814 			ifp->if_xname, na->num_tx_rings, na->num_rx_rings);
1815 		return EINVAL;
1816 	}
1817 	WNA(ifp) = na;
1818 	NETMAP_SET_CAPABLE(ifp);
1819 	if (na->nm_krings_create == NULL) {
1820 		na->nm_krings_create = netmap_hw_krings_create;
1821 		na->nm_krings_delete = netmap_krings_delete;
1822 	}
1823 	if (na->nm_notify == NULL)
1824 		na->nm_notify = netmap_notify;
1825 	na->active_fds = 0;
1826 
1827 	if (na->nm_mem == NULL)
1828 		na->nm_mem = &nm_mem;
1829 	return 0;
1830 }
1831 
1832 
1833 void
1834 netmap_detach_common(struct netmap_adapter *na)
1835 {
1836 	if (na->ifp)
1837 		WNA(na->ifp) = NULL; /* XXX do we need this? */
1838 
1839 	if (na->tx_rings) { /* XXX should not happen */
1840 		D("freeing leftover tx_rings");
1841 		na->nm_krings_delete(na);
1842 	}
1843 	if (na->na_flags & NAF_MEM_OWNER)
1844 		netmap_mem_private_delete(na->nm_mem);
1845 	bzero(na, sizeof(*na));
1846 	kfree(na, M_DEVBUF);
1847 }
1848 
1849 
1850 /*
1851  * Initialize a ``netmap_adapter`` object created by a driver on attach.
1852  * We allocate a block of memory with room for a struct netmap_adapter
1853  * plus two sets of N+2 struct netmap_kring (where N is the number
1854  * of hardware rings):
1855  * krings	0..N-1	are for the hardware queues.
1856  * kring	N	is for the host stack queue
1857  * kring	N+1	is only used for the selinfo for all queues.
1858  * Return 0 on success, ENOMEM otherwise.
1859  *
1860  * By default the receive and transmit adapter ring counts are both initialized
1861  * to num_queues.  na->num_tx_rings can be set for cards with different tx/rx
1862  * setups.
1863  */
1864 int
1865 netmap_attach(struct netmap_adapter *arg)
1866 {
1867 	struct netmap_hw_adapter *hwna = NULL;
1868 	// XXX when is arg == NULL ?
1869 	struct ifnet *ifp = arg ? arg->ifp : NULL;
1870 
1871 	if (arg == NULL || ifp == NULL)
1872 		goto fail;
1873 	hwna = kmalloc(sizeof(*hwna), M_DEVBUF, M_NOWAIT | M_ZERO);
1874 	if (hwna == NULL)
1875 		goto fail;
1876 	hwna->up = *arg;
1877 	if (netmap_attach_common(&hwna->up)) {
1878 		kfree(hwna, M_DEVBUF);
1879 		goto fail;
1880 	}
1881 	netmap_adapter_get(&hwna->up);
1882 
1883 	D("success for %s", NM_IFPNAME(ifp));
1884 	return 0;
1885 
1886 fail:
1887 	D("fail, arg %p ifp %p na %p", arg, ifp, hwna);
1888 	netmap_detach(ifp);
1889 	return (hwna ? EINVAL : ENOMEM);
1890 }
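
/*
 * Typical driver-side usage of netmap_attach() (a hedged sketch:
 * "sc" and foo_netmap_txsync()/foo_netmap_rxsync() are hypothetical
 * driver names, and only fields that appear in this file are shown;
 * real drivers set a few more, e.g. the per-ring descriptor counts):
 *
 *	struct netmap_adapter na;
 *
 *	bzero(&na, sizeof(na));
 *	na.ifp = sc->ifp;
 *	na.num_tx_rings = na.num_rx_rings = sc->num_queues;
 *	na.nm_txsync = foo_netmap_txsync;
 *	na.nm_rxsync = foo_netmap_rxsync;
 *	netmap_attach(&na);
 *
 * netmap_attach() copies the argument into a freshly allocated
 * netmap_hw_adapter, so the structure above may live on the stack
 * of the driver attach routine.
 */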
1891 
1892 
1893 void
1894 NM_DBG(netmap_adapter_get)(struct netmap_adapter *na)
1895 {
1896 	if (!na) {
1897 		return;
1898 	}
1899 
1900 	refcount_acquire(&na->na_refcount);
1901 }
1902 
1903 
1904 /* returns 1 iff the netmap_adapter is destroyed */
1905 int
1906 NM_DBG(netmap_adapter_put)(struct netmap_adapter *na)
1907 {
1908 	if (!na)
1909 		return 1;
1910 
1911 	if (!refcount_release(&na->na_refcount))
1912 		return 0;
1913 
1914 	if (na->nm_dtor)
1915 		na->nm_dtor(na);
1916 
1917 	netmap_detach_common(na);
1918 
1919 	return 1;
1920 }
1921 
1922 
1923 int
1924 netmap_hw_krings_create(struct netmap_adapter *na)
1925 {
1926 	return netmap_krings_create(na,
1927 		na->num_tx_rings + 1, na->num_rx_rings + 1, 0);
1928 }
1929 
1930 
1931 
1932 /*
1933  * Free the allocated memory linked to the given ``netmap_adapter``
1934  * object.
1935  */
1936 void
1937 netmap_detach(struct ifnet *ifp)
1938 {
1939 	struct netmap_adapter *na = NA(ifp);
1940 
1941 	if (!na)
1942 		return;
1943 
1944 	NMG_LOCK();
1945 	netmap_disable_all_rings(ifp);
1946 	netmap_adapter_put(na);
1947 	na->ifp = NULL;
1948 	netmap_enable_all_rings(ifp);
1949 	NMG_UNLOCK();
1950 }
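
/*
 * Drivers are expected to call netmap_detach() from their detach path,
 * before the ifnet itself is torn down. A hedged sketch (the surrounding
 * driver code is hypothetical):
 *
 *	netmap_detach(ifp);
 *	ether_ifdetach(ifp);
 *	// ... then release the remaining driver resources
 */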
1951 
1952 
1953 /*
1954  * Intercept packets from the network stack and pass them
1955  * to netmap as incoming packets on the 'software' ring.
1956  * We rely on the OS to make sure that the ifp and na do not go
1957  * away (typically the caller checks for IFF_DRV_RUNNING or the like).
1958  * In nm_register() or whenever there is a reinitialization,
1959  * we make sure to access the core lock and per-ring locks
1960  * so that IFCAP_NETMAP is visible here.
1961  */
1962 int
1963 netmap_transmit(struct ifnet *ifp, struct mbuf *m)
1964 {
1965 	struct netmap_adapter *na = NA(ifp);
1966 	struct netmap_kring *kring;
1967 	u_int i, len = MBUF_LEN(m);
1968 	u_int error = EBUSY, lim;
1969 	struct netmap_slot *slot;
1970 
1971 	// XXX [Linux] we do not need this lock
1972 	// if we follow the down/configure/up protocol -gl
1973 	// mtx_lock(&na->core_lock);
1974 	if ( (ifp->if_capenable & IFCAP_NETMAP) == 0) {
1975 		/* interface not in netmap mode anymore */
1976 		error = ENXIO;
1977 		goto done;
1978 	}
1979 
1980 	kring = &na->rx_rings[na->num_rx_rings];
1981 	lim = kring->nkr_num_slots - 1;
1982 	if (netmap_verbose & NM_VERB_HOST)
1983 		D("%s packet %d len %d from the stack", NM_IFPNAME(ifp),
1984 			kring->nr_hwcur + kring->nr_hwavail, len);
1985 	// XXX reconsider long packets if we handle fragments
1986 	if (len > NETMAP_BDG_BUF_SIZE(na->nm_mem)) { /* too long for us */
1987 		D("%s from_host, drop packet size %d > %d", NM_IFPNAME(ifp),
1988 			len, NETMAP_BDG_BUF_SIZE(na->nm_mem));
1989 		goto done;
1990 	}
1991 	/* protect against other instances of netmap_transmit,
1992 	 * and userspace invocations of rxsync().
1993 	 */
1994 	// XXX [Linux] there can be no other instances of netmap_transmit
1995 	// on this same ring, but we still need this lock to protect
1996 	// concurrent access from netmap_sw_to_nic() -gl
1997 	lockmgr(&kring->q_lock, LK_EXCLUSIVE);
1998 	if (kring->nr_hwavail >= lim) {
1999 		if (netmap_verbose)
2000 			D("stack ring %s full", NM_IFPNAME(ifp));
2001 	} else {
2002 		/* compute the insert position */
2003 		i = nm_kr_rxpos(kring);
2004 		slot = &kring->ring->slot[i];
2005 		m_copydata(m, 0, (int)len, BDG_NMB(na, slot));
2006 		slot->len = len;
2007 		slot->flags = kring->nkr_slot_flags;
2008 		kring->nr_hwavail++;
2009 		if (netmap_verbose & NM_VERB_HOST)
2010 			D("wake up host ring %s %d", NM_IFPNAME(na->ifp), na->num_rx_rings);
2011 		na->nm_notify(na, na->num_rx_rings, NR_RX, 0);
2012 		error = 0;
2013 	}
2014 	lockmgr(&kring->q_lock, LK_RELEASE);
2015 
2016 done:
2017 	// mtx_unlock(&na->core_lock);
2018 
2019 	/* release the mbuf in either case, success or failure. As an
2020 	 * alternative, put the mbuf in a free list and free the list
2021 	 * only when really necessary.
2022 	 */
2023 	m_freem(m);
2024 
2025 	return (error);
2026 }
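
/*
 * Driver-side usage of netmap_transmit() (a hedged sketch; the
 * surrounding transmit routine is hypothetical): when the interface is
 * in netmap mode, the driver diverts stack-originated mbufs to the host
 * rx ring instead of the hardware queue. netmap_transmit() always
 * consumes (frees) the mbuf, whatever the return value:
 *
 *	if (ifp->if_capenable & IFCAP_NETMAP)
 *		return netmap_transmit(ifp, m);
 *	// ... otherwise the normal hardware transmit path follows
 */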
2027 
2028 
2029 /*
2030  * netmap_reset() is called by the driver routines when reinitializing
2031  * a ring. The driver is in charge of locking to protect the kring.
2032  * If native netmap mode is not set just return NULL.
2033  * If native netmap mode is not set, just return NULL.
2034 struct netmap_slot *
2035 netmap_reset(struct netmap_adapter *na, enum txrx tx, u_int n,
2036 	u_int new_cur)
2037 {
2038 	struct netmap_kring *kring;
2039 	int new_hwofs, lim;
2040 
2041 	if (na == NULL) {
2042 		D("NULL na, should not happen");
2043 		return NULL;	/* no netmap support here */
2044 	}
2045 	if (!(na->ifp->if_capenable & IFCAP_NETMAP) || nma_is_generic(na)) {
2046 		ND("interface not in netmap mode");
2047 		return NULL;	/* nothing to reinitialize */
2048 	}
2049 
2050 	/* XXX note: in the new scheme, we are not guaranteed to be
2051 	 * under lock (e.g. when called on a device reset).
2052 	 * In this case, we should set a flag and not trust the
2053 	 * values too much. In practice: TODO
2054 	 * - set a RESET flag somewhere in the kring
2055 	 * - do the processing in a conservative way
2056 	 * - let the *sync() fixup at the end.
2057 	 */
2058 	if (tx == NR_TX) {
2059 		if (n >= na->num_tx_rings)
2060 			return NULL;
2061 		kring = na->tx_rings + n;
2062 		new_hwofs = kring->nr_hwcur - new_cur;
2063 	} else {
2064 		if (n >= na->num_rx_rings)
2065 			return NULL;
2066 		kring = na->rx_rings + n;
2067 		new_hwofs = kring->nr_hwcur + kring->nr_hwavail - new_cur;
2068 	}
2069 	lim = kring->nkr_num_slots - 1;
2070 	if (new_hwofs > lim)
2071 		new_hwofs -= lim + 1;
2072 
2073 	/* Always set the new offset value and realign the ring. */
2074 	D("%s hwofs %d -> %d, hwavail %d -> %d",
2075 		tx == NR_TX ? "TX" : "RX",
2076 		kring->nkr_hwofs, new_hwofs,
2077 		kring->nr_hwavail,
2078 		tx == NR_TX ? lim : kring->nr_hwavail);
2079 	kring->nkr_hwofs = new_hwofs;
2080 	if (tx == NR_TX)
2081 		kring->nr_hwavail = lim;
2082 	kring->nr_hwreserved = 0;
2083 
2084 	/*
2085 	 * Wakeup on the individual and global selwait
2086 	 * We do the wakeup here, but the ring is not yet reconfigured.
2087 	 * However, we are under lock so there are no races.
2088 	 */
2089 	na->nm_notify(na, n, tx, NAF_GLOBAL_NOTIFY);
2090 	return kring->ring->slot;
2091 }
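
/*
 * Driver-side usage of netmap_reset() (a hedged sketch; the descriptor
 * programming is driver specific and only outlined in the comments):
 *
 *	slot = netmap_reset(na, NR_RX, ring_nr, 0);
 *	if (slot != NULL) {
 *		// netmap mode is active on this ring: point the hardware
 *		// descriptors at the netmap buffers referenced by slot[]
 *		// instead of allocating fresh mbufs.
 *	}
 *
 * A NULL return means netmap is not active here and the driver should
 * proceed with its normal ring initialization.
 */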
2092 
2093 
2094 /*
2095  * Default functions to handle rx/tx interrupts from a physical device.
2096  * "work_done" is non-null on the RX path, NULL for the TX path.
2097  * This function may be called both by device drivers (through
2098  * netmap_rx_irq() below) and directly by the generic netmap adapter layer.
2099  * We rely on the OS to make sure that there is only one active
2100  * instance per queue, and that there is appropriate locking.
2101  *
2102  * If the card is not in netmap mode, simply return 0,
2103  * so that the caller proceeds with regular processing.
2104  *
2105  * We return 0 also when the card is in netmap mode but the current
2106  * netmap adapter is the generic one, because this function will be
2107  * called by the generic layer.
2108  *
2109  * If the card is connected to a netmap file descriptor,
2110  * do a selwakeup on the individual queue, plus one on the global one
2111  * if needed (multiqueue card _and_ there are multiqueue listeners),
2112  * and return 1.
2113  *
2114  * Finally, if called on rx from an interface connected to a switch,
2115  * call the proper forwarding routine and return 1.
2116  */
2117 int
2118 netmap_common_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2119 {
2120 	struct netmap_adapter *na = NA(ifp);
2121 	struct netmap_kring *kring;
2122 
2123 	q &= NETMAP_RING_MASK;
2124 
2125 	if (netmap_verbose) {
2126 		RD(5, "received %s queue %d", work_done ? "RX" : "TX", q);
2127 	}
2128 
2129 	if (work_done) { /* RX path */
2130 		if (q >= na->num_rx_rings)
2131 			return 0;	// not a physical queue
2132 		kring = na->rx_rings + q;
2133 		kring->nr_kflags |= NKR_PENDINTR;	// XXX atomic ?
2134 		na->nm_notify(na, q, NR_RX,
2135 			(na->num_rx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2136 		*work_done = 1; /* do not fire napi again */
2137 	} else { /* TX path */
2138 		if (q >= na->num_tx_rings)
2139 			return 0;	// not a physical queue
2140 		kring = na->tx_rings + q;
2141 		na->nm_notify(na, q, NR_TX,
2142 			(na->num_tx_rings > 1 ? NAF_GLOBAL_NOTIFY : 0));
2143 	}
2144 	return 1;
2145 }
2146 
2147 /*
2148  * Entry point used by device drivers to report rx/tx interrupts.
2149  * Return 0 (so that the caller proceeds with its regular interrupt
2150  * processing) when the interface is not in netmap mode or when
2151  * NAF_SKIP_INTR is set; otherwise dispatch to netmap_common_irq()
2152  * above and return its result.
2153  */
2166 int
2167 netmap_rx_irq(struct ifnet *ifp, u_int q, u_int *work_done)
2168 {
2169 	// XXX could we check NAF_NATIVE_ON ?
2170 	if (!(ifp->if_capenable & IFCAP_NETMAP))
2171 		return 0;
2172 
2173 	if (NA(ifp)->na_flags & NAF_SKIP_INTR) {
2174 		ND("use regular interrupt");
2175 		return 0;
2176 	}
2177 
2178 	return netmap_common_irq(ifp, q, work_done);
2179 }
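
/*
 * Driver-side usage of netmap_rx_irq() (a hedged sketch;
 * foo_rxeof()/foo_txeof() stand for the driver's regular cleanup
 * routines): the interrupt handler tries netmap first and skips the
 * normal processing when netmap reports the event as handled.
 *
 *	u_int work_done;
 *
 *	// rx interrupt on queue q
 *	if (netmap_rx_irq(ifp, q, &work_done))
 *		return;			// handled in netmap mode
 *	foo_rxeof(sc, q);
 *
 *	// tx interrupt on queue q: pass a NULL work_done
 *	if (netmap_rx_irq(ifp, q, NULL))
 *		return;
 *	foo_txeof(sc, q);
 */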
2180 
2181 
2182 static struct cdev *netmap_dev; /* /dev/netmap character device. */
2183 
2184 
2185 /*
2186  * Module loader.
2187  *
2188  * Create the /dev/netmap device and initialize all global
2189  * variables.
2190  *
2191  * Return 0 on success, errno on failure.
2192  */
2193 int
2194 netmap_init(void)
2195 {
2196 	int error;
2197 
2198 	NMG_LOCK_INIT();
2199 
2200 	error = netmap_mem_init();
2201 	if (error != 0) {
2202 		kprintf("netmap: unable to initialize the memory allocator.\n");
2203 		return (error);
2204 	}
2205 	kprintf("netmap: loaded module\n");
2206 	netmap_dev = make_dev(&netmap_cdevsw, 0, UID_ROOT, GID_WHEEL, 0660,
2207 			      "netmap");
2208 
2209 	netmap_init_bridges();
2210 	return (error);
2211 }
2212 
2213 
2214 /*
2215  * Module unloader.
2216  *
2217  * Free all the memory, and destroy the ``/dev/netmap`` device.
2218  */
2219 void
2220 netmap_fini(void)
2221 {
2222 	destroy_dev(netmap_dev);
2223 	netmap_mem_fini();
2224 	NMG_LOCK_DESTROY();
2225 	kprintf("netmap: unloaded module.\n");
2226 }
2227