xref: /netbsd-src/sys/dev/hyperv/if_hvn.c (revision 901e7e84758515fbf39dfc064cb0b45ab146d8b0)
1 /*	$NetBSD: if_hvn.c,v 1.24 2022/09/18 16:59:35 thorpej Exp $	*/
2 /*	$OpenBSD: if_hvn.c,v 1.39 2018/03/11 14:31:34 mikeb Exp $	*/
3 
4 /*-
5  * Copyright (c) 2009-2012,2016 Microsoft Corp.
6  * Copyright (c) 2010-2012 Citrix Inc.
7  * Copyright (c) 2012 NetApp Inc.
8  * Copyright (c) 2016 Mike Belopuhov <mike@esdenera.com>
9  * All rights reserved.
10  *
11  * Redistribution and use in source and binary forms, with or without
12  * modification, are permitted provided that the following conditions
13  * are met:
14  * 1. Redistributions of source code must retain the above copyright
15  *    notice unmodified, this list of conditions, and the following
16  *    disclaimer.
17  * 2. Redistributions in binary form must reproduce the above copyright
18  *    notice, this list of conditions and the following disclaimer in the
19  *    documentation and/or other materials provided with the distribution.
20  *
21  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
22  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
23  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
24  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
25  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
26  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
27  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
28  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
29  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
30  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
31  */
32 
33 /*
34  * The OpenBSD port was done under funding by Esdenera Networks GmbH.
35  */
36 
37 #include <sys/cdefs.h>
38 __KERNEL_RCSID(0, "$NetBSD: if_hvn.c,v 1.24 2022/09/18 16:59:35 thorpej Exp $");
39 
40 #ifdef _KERNEL_OPT
41 #include "opt_if_hvn.h"
42 #include "opt_inet.h"
43 #include "opt_inet6.h"
44 #include "opt_net_mpsafe.h"
45 #endif
46 
47 #include <sys/param.h>
48 #include <sys/systm.h>
49 #include <sys/kernel.h>
50 #include <sys/device.h>
51 #include <sys/bitops.h>
52 #include <sys/bus.h>
53 #include <sys/condvar.h>
54 #include <sys/cpu.h>
55 #include <sys/evcnt.h>
56 #include <sys/intr.h>
57 #include <sys/kmem.h>
58 #include <sys/kthread.h>
59 #include <sys/mutex.h>
60 #include <sys/pcq.h>
61 #include <sys/sysctl.h>
62 #include <sys/workqueue.h>
63 
64 #include <net/if.h>
65 #include <net/if_ether.h>
66 #include <net/if_media.h>
67 #include <net/if_vlanvar.h>
68 #include <net/rss_config.h>
69 #include <netinet/in.h>
70 #include <netinet/ip.h>
71 #include <netinet/ip6.h>
72 #include <netinet/udp.h>
73 
74 #include <net/bpf.h>
75 
76 #include <dev/ic/ndisreg.h>
77 #include <dev/ic/rndisreg.h>
78 
79 #include <dev/hyperv/vmbusvar.h>
80 #include <dev/hyperv/if_hvnreg.h>
81 
82 #ifndef EVL_PRIO_BITS
83 #define EVL_PRIO_BITS	13
84 #endif
85 #ifndef EVL_CFI_BITS
86 #define EVL_CFI_BITS	12
87 #endif
88 
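/*
 * "Chimney" is Hyper-V's pre-posted send buffer: packets small enough
 * to fit are copied into this shared region and referenced by slot
 * index instead of being DMA-mapped per packet (see hvn_try_txagg()
 * and hvn_rndis_output_chim() below).
 */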
89 #define HVN_CHIM_SIZE			(15 * 1024 * 1024)
90 
91 #define HVN_NVS_MSGSIZE			32
92 #define HVN_NVS_BUFSIZE			PAGE_SIZE
93 
94 #define HVN_RING_BUFSIZE		(128 * PAGE_SIZE)
95 #define HVN_RING_IDX2CPU(sc, idx)	((idx) % ncpu)
96 
97 #ifndef HVN_CHANNEL_MAX_COUNT_DEFAULT
98 #define HVN_CHANNEL_MAX_COUNT_DEFAULT	8
99 #endif
100 
101 #ifndef HVN_LINK_STATE_CHANGE_DELAY
102 #define HVN_LINK_STATE_CHANGE_DELAY	5000
103 #endif
104 
105 #define HVN_WORKQUEUE_PRI		PRI_SOFTNET
106 
107 /*
108  * RNDIS control interface
109  */
110 #define HVN_RNDIS_CTLREQS		4
111 #define HVN_RNDIS_BUFSIZE		512
112 
113 struct rndis_cmd {
114 	uint32_t			rc_id;
115 	struct hvn_nvs_rndis		rc_msg;
116 	void				*rc_req;
117 	bus_dmamap_t			rc_dmap;
118 	bus_dma_segment_t		rc_segs;
119 	int				rc_nsegs;
120 	uint64_t			rc_gpa;
121 	struct rndis_packet_msg		rc_cmp;
122 	uint32_t			rc_cmplen;
123 	uint8_t				rc_cmpbuf[HVN_RNDIS_BUFSIZE];
124 	int				rc_done;
125 	TAILQ_ENTRY(rndis_cmd)		rc_entry;
126 	kmutex_t			rc_lock;
127 	kcondvar_t			rc_cv;
128 };
129 TAILQ_HEAD(rndis_queue, rndis_cmd);
130 
131 #define HVN_MTU_MIN			68
132 #define HVN_MTU_MAX			(65535 - ETHER_ADDR_LEN)
133 
134 #define HVN_RNDIS_XFER_SIZE		2048
135 
136 #define HVN_NDIS_TXCSUM_CAP_IP4 \
137 	(NDIS_TXCSUM_CAP_IP4 | NDIS_TXCSUM_CAP_IP4OPT)
138 #define HVN_NDIS_TXCSUM_CAP_TCP4 \
139 	(NDIS_TXCSUM_CAP_TCP4 | NDIS_TXCSUM_CAP_TCP4OPT)
140 #define HVN_NDIS_TXCSUM_CAP_TCP6 \
141 	(NDIS_TXCSUM_CAP_TCP6 | NDIS_TXCSUM_CAP_TCP6OPT | \
142 	    NDIS_TXCSUM_CAP_IP6EXT)
143 #define HVN_NDIS_TXCSUM_CAP_UDP6 \
144 	(NDIS_TXCSUM_CAP_UDP6 | NDIS_TXCSUM_CAP_IP6EXT)
145 #define HVN_NDIS_LSOV2_CAP_IP6 \
146 	(NDIS_LSOV2_CAP_IP6EXT | NDIS_LSOV2_CAP_TCP6OPT)
147 
148 #define HVN_RNDIS_CMD_NORESP	__BIT(0)
149 
150 #define HVN_NVS_CMD_NORESP	__BIT(0)
151 
152 /*
153  * Tx ring
154  */
155 #define HVN_TX_DESC			512
156 #define HVN_TX_FRAGS			15		/* 31 is the max */
157 #define HVN_TX_FRAG_SIZE		PAGE_SIZE
158 #define HVN_TX_PKT_SIZE			16384
159 
160 #define HVN_RNDIS_PKT_LEN					\
161 	(sizeof(struct rndis_packet_msg) +			\
162 	 sizeof(struct rndis_pktinfo) + NDIS_VLAN_INFO_SIZE +	\
163 	 sizeof(struct rndis_pktinfo) + NDIS_TXCSUM_INFO_SIZE)
164 
165 #define HVN_PKTSIZE_MIN(align)						\
166 	roundup2(ETHER_MIN_LEN + ETHER_VLAN_ENCAP_LEN - ETHER_CRC_LEN +	\
167 	HVN_RNDIS_PKT_LEN, (align))
168 #define HVN_PKTSIZE(m, align)						\
169 	roundup2((m)->m_pkthdr.len + HVN_RNDIS_PKT_LEN, (align))
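/*
 * Worked example (illustrative only): HVN_RNDIS_PKT_LEN is the fixed
 * per-packet RNDIS overhead, i.e. the packet message plus one pktinfo
 * each for VLAN and checksum metadata.  For a 1514-byte frame and a
 * 32-byte aggregation alignment, HVN_PKTSIZE() evaluates to
 * roundup2(1514 + HVN_RNDIS_PKT_LEN, 32): the frame plus the RNDIS
 * headroom, rounded up to the chimney alignment.
 */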
170 
171 struct hvn_tx_desc {
172 	uint32_t			txd_id;
173 	struct vmbus_gpa		txd_sgl[HVN_TX_FRAGS + 1];
174 	int				txd_nsge;
175 	struct mbuf			*txd_buf;
176 	bus_dmamap_t			txd_dmap;
177 	struct vmbus_gpa		txd_gpa;
178 	struct rndis_packet_msg		*txd_req;
179 	TAILQ_ENTRY(hvn_tx_desc)	txd_entry;
180 	u_int				txd_refs;
181 	uint32_t			txd_flags;
182 #define HVN_TXD_FLAG_ONAGG		__BIT(0)
183 #define HVN_TXD_FLAG_DMAMAP		__BIT(1)
184 	uint32_t			txd_chim_index;
185 	int				txd_chim_size;
186 	STAILQ_ENTRY(hvn_tx_desc)	txd_agg_entry;
187 	STAILQ_HEAD(, hvn_tx_desc)	txd_agg_list;
188 };
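/*
 * Note: txd_sgl has HVN_TX_FRAGS + 1 entries because slot 0 always
 * carries the RNDIS packet message itself; the mbuf chain's DMA
 * segments are filled in starting at slot 1 (see hvn_encap()).
 */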
189 
190 struct hvn_softc;
191 struct hvn_rx_ring;
192 
193 struct hvn_tx_ring {
194 	struct hvn_softc		*txr_softc;
195 	struct vmbus_channel		*txr_chan;
196 	struct hvn_rx_ring		*txr_rxr;
197 	void				*txr_si;
198 	char				txr_name[16];
199 
200 	int				txr_id;
201 	int				txr_oactive;
202 	int				txr_suspended;
203 	int				txr_csum_assist;
204 	uint64_t			txr_caps_assist;
205 	uint32_t			txr_flags;
206 #define HVN_TXR_FLAG_UDP_HASH		__BIT(0)
207 
208 	struct evcnt			txr_evpkts;
209 	struct evcnt			txr_evsends;
210 	struct evcnt			txr_evnodesc;
211 	struct evcnt			txr_evdmafailed;
212 	struct evcnt			txr_evdefrag;
213 	struct evcnt			txr_evpcqdrop;
214 	struct evcnt			txr_evtransmitdefer;
215 	struct evcnt			txr_evflushfailed;
216 	struct evcnt			txr_evchimneytried;
217 	struct evcnt			txr_evchimney;
218 	struct evcnt			txr_evvlanfixup;
219 	struct evcnt			txr_evvlanhwtagging;
220 	struct evcnt			txr_evvlantap;
221 
222 	kmutex_t			txr_lock;
223 	pcq_t				*txr_interq;
224 
225 	uint32_t			txr_avail;
226 	TAILQ_HEAD(, hvn_tx_desc)	txr_list;
227 	struct hvn_tx_desc		txr_desc[HVN_TX_DESC];
228 	uint8_t				*txr_msgs;
229 	struct hyperv_dma		txr_dma;
230 
231 	int				txr_chim_size;
232 
233 	/* Applied packet transmission aggregation limits. */
234 	int				txr_agg_szmax;
235 	short				txr_agg_pktmax;
236 	short				txr_agg_align;
237 
238 	/* Packet transmission aggregation states. */
239 	struct hvn_tx_desc		*txr_agg_txd;
240 	int				txr_agg_szleft;
241 	short				txr_agg_pktleft;
242 	struct rndis_packet_msg		*txr_agg_prevpkt;
243 
244 	/* Temporary stats for each send. */
245 	int				txr_stat_pkts;
246 	int				txr_stat_size;
247 	int				txr_stat_mcasts;
248 
249 	int				(*txr_sendpkt)(struct hvn_tx_ring *,
250 					    struct hvn_tx_desc *);
251 } __aligned(CACHE_LINE_SIZE);
252 
253 struct hvn_rx_ring {
254 	struct hvn_softc		*rxr_softc;
255 	struct vmbus_channel		*rxr_chan;
256 	struct hvn_tx_ring		*rxr_txr;
257 	void				*rxr_si;
258 	bool				rxr_workqueue;
259 	char				rxr_name[16];
260 
261 	struct work			rxr_wk;
262 	volatile bool			rxr_onlist;
263 	volatile bool			rxr_onproc;
264 	kmutex_t			rxr_onwork_lock;
265 	kcondvar_t			rxr_onwork_cv;
266 
267 	uint32_t			rxr_flags;
268 #define HVN_RXR_FLAG_UDP_HASH		__BIT(0)
269 
270 	kmutex_t			rxr_lock;
271 
272 	struct evcnt			rxr_evpkts;
273 	struct evcnt			rxr_evcsum_ip;
274 	struct evcnt			rxr_evcsum_tcp;
275 	struct evcnt			rxr_evcsum_udp;
276 	struct evcnt			rxr_evvlanhwtagging;
277 	struct evcnt			rxr_evintr;
278 	struct evcnt			rxr_evdefer;
279 	struct evcnt			rxr_evdeferreq;
280 	struct evcnt			rxr_evredeferreq;
281 
282 	/* NVS */
283 	uint8_t				*rxr_nvsbuf;
284 } __aligned(CACHE_LINE_SIZE);
285 
286 struct hvn_softc {
287 	device_t			sc_dev;
288 
289 	struct vmbus_softc		*sc_vmbus;
290 	struct vmbus_channel		*sc_prichan;
291 	bus_dma_tag_t			sc_dmat;
292 
293 	struct ethercom			sc_ec;
294 	struct ifmedia			sc_media;
295 	struct if_percpuq		*sc_ipq;
296 	struct workqueue		*sc_wq;
297 	bool				sc_txrx_workqueue;
298 	kmutex_t			sc_core_lock;
299 
300 	kmutex_t			sc_link_lock;
301 	kcondvar_t			sc_link_cv;
302 	callout_t			sc_link_tmout;
303 	lwp_t				*sc_link_lwp;
304 	uint32_t			sc_link_ev;
305 #define HVN_LINK_EV_STATE_CHANGE	__BIT(0)
306 #define HVN_LINK_EV_NETWORK_CHANGE_TMOUT __BIT(1)
307 #define HVN_LINK_EV_NETWORK_CHANGE	__BIT(2)
308 #define HVN_LINK_EV_RESUME_NETWORK	__BIT(3)
309 #define HVN_LINK_EV_EXIT_THREAD		__BIT(4)
310 	int				sc_link_state;
311 	bool				sc_link_onproc;
312 	bool				sc_link_pending;
313 	bool				sc_link_suspend;
314 
315 	int				sc_tx_process_limit;
316 	int				sc_rx_process_limit;
317 	int				sc_tx_intr_process_limit;
318 	int				sc_rx_intr_process_limit;
319 
320 	struct sysctllog		*sc_sysctllog;
321 
322 	uint32_t			sc_caps;
323 #define HVN_CAPS_VLAN			__BIT(0)
324 #define HVN_CAPS_MTU			__BIT(1)
325 #define HVN_CAPS_IPCS			__BIT(2)
326 #define HVN_CAPS_TCP4CS			__BIT(3)
327 #define HVN_CAPS_TCP6CS			__BIT(4)
328 #define HVN_CAPS_UDP4CS			__BIT(5)
329 #define HVN_CAPS_UDP6CS			__BIT(6)
330 #define HVN_CAPS_TSO4			__BIT(7)
331 #define HVN_CAPS_TSO6			__BIT(8)
332 #define HVN_CAPS_HASHVAL		__BIT(9)
333 #define HVN_CAPS_UDPHASH		__BIT(10)
334 
335 	uint32_t			sc_flags;
336 #define HVN_SCF_ATTACHED		__BIT(0)
337 #define HVN_SCF_RXBUF_CONNECTED		__BIT(1)
338 #define HVN_SCF_CHIM_CONNECTED		__BIT(2)
339 #define HVN_SCF_REVOKED			__BIT(3)
340 #define HVN_SCF_HAS_RSSKEY		__BIT(4)
341 #define HVN_SCF_HAS_RSSIND		__BIT(5)
342 
343 	/* NVS protocol */
344 	int				sc_proto;
345 	uint32_t			sc_nvstid;
346 	uint8_t				sc_nvsrsp[HVN_NVS_MSGSIZE];
347 	int				sc_nvsdone;
348 	kmutex_t			sc_nvsrsp_lock;
349 	kcondvar_t			sc_nvsrsp_cv;
350 
351 	/* RNDIS protocol */
352 	int				sc_ndisver;
353 	uint32_t			sc_rndisrid;
354 	int				sc_tso_szmax;
355 	int				sc_tso_sgmin;
356 	uint32_t			sc_rndis_agg_size;
357 	uint32_t			sc_rndis_agg_pkts;
358 	uint32_t			sc_rndis_agg_align;
359 	struct rndis_queue		sc_cntl_sq; /* submission queue */
360 	kmutex_t			sc_cntl_sqlck;
361 	struct rndis_queue		sc_cntl_cq; /* completion queue */
362 	kmutex_t			sc_cntl_cqlck;
363 	struct rndis_queue		sc_cntl_fq; /* free queue */
364 	kmutex_t			sc_cntl_fqlck;
365 	kcondvar_t			sc_cntl_fqcv;
366 	struct rndis_cmd		sc_cntl_msgs[HVN_RNDIS_CTLREQS];
367 	struct hvn_nvs_rndis		sc_data_msg;
368 
369 	int				sc_rss_ind_size;
370 	uint32_t			sc_rss_hash; /* setting, NDIS_HASH_ */
371 	uint32_t			sc_rss_hcap; /* caps, NDIS_HASH_ */
372 	struct ndis_rssprm_toeplitz	sc_rss;
373 
374 	/* Rx ring */
375 	uint8_t				*sc_rx_ring;
376 	int				sc_rx_size;
377 	uint32_t			sc_rx_hndl;
378 	struct hyperv_dma		sc_rx_dma;
379 	struct hvn_rx_ring		*sc_rxr;
380 	int				sc_nrxr;
381 	int				sc_nrxr_inuse;
382 
383 	/* Tx ring */
384 	struct hvn_tx_ring		*sc_txr;
385 	int				sc_ntxr;
386 	int				sc_ntxr_inuse;
387 
388 	/* chimney sending buffers */
389 	uint8_t				*sc_chim;
390 	uint32_t			sc_chim_hndl;
391 	struct hyperv_dma		sc_chim_dma;
392 	kmutex_t			sc_chim_bmap_lock;
393 	u_long				*sc_chim_bmap;
394 	int				sc_chim_bmap_cnt;
395 	int				sc_chim_cnt;
396 	int				sc_chim_szmax;
397 
398 	/* Packet transmission aggregation user settings. */
399 	int				sc_agg_size;
400 	int				sc_agg_pkts;
401 };
402 
403 #define SC2IFP(_sc_)	(&(_sc_)->sc_ec.ec_if)
404 #define IFP2SC(_ifp_)	((_ifp_)->if_softc)
405 
406 #ifndef HVN_TX_PROCESS_LIMIT_DEFAULT
407 #define HVN_TX_PROCESS_LIMIT_DEFAULT		128
408 #endif
409 #ifndef HVN_RX_PROCESS_LIMIT_DEFAULT
410 #define HVN_RX_PROCESS_LIMIT_DEFAULT		128
411 #endif
412 #ifndef HVN_TX_INTR_PROCESS_LIMIT_DEFAULT
413 #define HVN_TX_INTR_PROCESS_LIMIT_DEFAULT	256
414 #endif
415 #ifndef HVN_RX_INTR_PROCESS_LIMIT_DEFAULT
416 #define HVN_RX_INTR_PROCESS_LIMIT_DEFAULT	256
417 #endif
418 
419 /*
420  * See hvn_set_hlen().
421  *
422  * This value is for Azure.  For Hyper-V, set this above
423  * 65536 to disable UDP datagram checksum fixup.
424  */
425 #ifndef HVN_UDP_CKSUM_FIXUP_MTU_DEFAULT
426 #define HVN_UDP_CKSUM_FIXUP_MTU_DEFAULT	1420
427 #endif
428 static int hvn_udpcs_fixup_mtu = HVN_UDP_CKSUM_FIXUP_MTU_DEFAULT;
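/*
 * Illustration (assuming the 1420 default and an untagged Ethernet
 * header): hvn_set_hlen() falls back to a software UDP checksum when
 * m_pkthdr.len > 1420 + 14 and the IPv4 header has IP_DF clear;
 * shorter datagrams, or those with IP_DF set, keep hardware offload.
 */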
429 
430 /* Limit chimney send size */
431 static int hvn_tx_chimney_size = 0;
432 
433 /* # of channels to use; each channel has one RX ring and one TX ring */
434 #ifndef HVN_CHANNEL_COUNT_DEFAULT
435 #define HVN_CHANNEL_COUNT_DEFAULT	0
436 #endif
437 static int hvn_channel_cnt = HVN_CHANNEL_COUNT_DEFAULT;
438 
439 /* # of transmit rings to use */
440 #ifndef HVN_TX_RING_COUNT_DEFAULT
441 #define HVN_TX_RING_COUNT_DEFAULT	0
442 #endif
443 static int hvn_tx_ring_cnt = HVN_TX_RING_COUNT_DEFAULT;
444 
445 /* Packet transmission aggregation size limit */
446 static int hvn_tx_agg_size = -1;
447 
448 /* Packet transmission aggregation count limit */
449 static int hvn_tx_agg_pkts = -1;
450 
451 static int	hvn_match(device_t, cfdata_t, void *);
452 static void	hvn_attach(device_t, device_t, void *);
453 static int	hvn_detach(device_t, int);
454 
455 CFATTACH_DECL_NEW(hvn, sizeof(struct hvn_softc),
456     hvn_match, hvn_attach, hvn_detach, NULL);
457 
458 static int	hvn_ioctl(struct ifnet *, u_long, void *);
459 static int	hvn_media_change(struct ifnet *);
460 static void	hvn_media_status(struct ifnet *, struct ifmediareq *);
461 static void	hvn_link_task(void *);
462 static void	hvn_link_event(struct hvn_softc *, uint32_t);
463 static void	hvn_link_netchg_tmout_cb(void *);
464 static int	hvn_init(struct ifnet *);
465 static int	hvn_init_locked(struct ifnet *);
466 static void	hvn_stop(struct ifnet *, int);
467 static void	hvn_stop_locked(struct ifnet *);
468 static void	hvn_start(struct ifnet *);
469 static int	hvn_transmit(struct ifnet *, struct mbuf *);
470 static void	hvn_deferred_transmit(void *);
471 static int	hvn_flush_txagg(struct hvn_tx_ring *);
472 static int	hvn_encap(struct hvn_tx_ring *, struct hvn_tx_desc *,
473 		    struct mbuf *, int);
474 static int	hvn_txpkt(struct hvn_tx_ring *, struct hvn_tx_desc *);
475 static void	hvn_txeof(struct hvn_tx_ring *, uint64_t);
476 static int	hvn_rx_ring_create(struct hvn_softc *, int);
477 static int	hvn_rx_ring_destroy(struct hvn_softc *);
478 static void	hvn_fixup_rx_data(struct hvn_softc *);
479 static int	hvn_tx_ring_create(struct hvn_softc *, int);
480 static void	hvn_tx_ring_destroy(struct hvn_softc *);
481 static void	hvn_set_chim_size(struct hvn_softc *, int);
482 static uint32_t	hvn_chim_alloc(struct hvn_softc *);
483 static void	hvn_chim_free(struct hvn_softc *, uint32_t);
484 static void	hvn_fixup_tx_data(struct hvn_softc *);
485 static struct mbuf *
486 		hvn_set_hlen(struct mbuf *, int *);
487 static int	hvn_txd_peek(struct hvn_tx_ring *);
488 static struct hvn_tx_desc *
489 		hvn_txd_get(struct hvn_tx_ring *);
490 static void	hvn_txd_put(struct hvn_tx_ring *, struct hvn_tx_desc *);
491 static void	hvn_txd_gc(struct hvn_tx_ring *, struct hvn_tx_desc *);
492 static void	hvn_txd_hold(struct hvn_tx_desc *);
493 static void	hvn_txd_agg(struct hvn_tx_desc *, struct hvn_tx_desc *);
494 static int	hvn_tx_ring_pending(struct hvn_tx_ring *);
495 static void	hvn_tx_ring_qflush(struct hvn_softc *, struct hvn_tx_ring *);
496 static int	hvn_get_rsscaps(struct hvn_softc *, int *);
497 static int	hvn_set_rss(struct hvn_softc *, uint16_t);
498 static void	hvn_fixup_rss_ind(struct hvn_softc *);
499 static int	hvn_get_hwcaps(struct hvn_softc *, struct ndis_offload *);
500 static int	hvn_set_capabilities(struct hvn_softc *, int);
501 static int	hvn_get_lladdr(struct hvn_softc *, uint8_t *);
502 static void	hvn_update_link_status(struct hvn_softc *);
503 static int	hvn_get_mtu(struct hvn_softc *, uint32_t *);
504 static int	hvn_channel_attach(struct hvn_softc *, struct vmbus_channel *);
505 static void	hvn_channel_detach(struct hvn_softc *, struct vmbus_channel *);
506 static void	hvn_channel_detach_all(struct hvn_softc *);
507 static int	hvn_subchannel_attach(struct hvn_softc *);
508 static int	hvn_synth_alloc_subchannels(struct hvn_softc *, int *);
509 static int	hvn_synth_attachable(const struct hvn_softc *);
510 static int	hvn_synth_attach(struct hvn_softc *, int);
511 static void	hvn_synth_detach(struct hvn_softc *);
512 static void	hvn_set_ring_inuse(struct hvn_softc *, int);
513 static void	hvn_disable_rx(struct hvn_softc *);
514 static void	hvn_drain_rxtx(struct hvn_softc *, int);
515 static void	hvn_suspend_data(struct hvn_softc *);
516 static void	hvn_suspend_mgmt(struct hvn_softc *);
517 static void	hvn_suspend(struct hvn_softc *) __unused;
518 static void	hvn_resume_tx(struct hvn_softc *, int);
519 static void	hvn_resume_data(struct hvn_softc *);
520 static void	hvn_resume_mgmt(struct hvn_softc *);
521 static void	hvn_resume(struct hvn_softc *) __unused;
522 static void	hvn_init_sysctls(struct hvn_softc *);
523 
524 /* NVSP */
525 static int	hvn_nvs_init(struct hvn_softc *);
526 static void	hvn_nvs_destroy(struct hvn_softc *);
527 static int	hvn_nvs_attach(struct hvn_softc *, int);
528 static int	hvn_nvs_connect_rxbuf(struct hvn_softc *);
529 static int	hvn_nvs_disconnect_rxbuf(struct hvn_softc *);
530 static int	hvn_nvs_connect_chim(struct hvn_softc *);
531 static int	hvn_nvs_disconnect_chim(struct hvn_softc *);
532 static void	hvn_handle_ring_work(struct work *, void *);
533 static void	hvn_nvs_softintr(void *);
534 static void	hvn_nvs_intr(void *);
535 static void	hvn_nvs_intr1(struct hvn_rx_ring *, int, int);
536 static int	hvn_nvs_cmd(struct hvn_softc *, void *, size_t, uint64_t,
537 		    u_int);
538 static int	hvn_nvs_ack(struct hvn_rx_ring *, uint64_t);
539 static void	hvn_nvs_detach(struct hvn_softc *);
540 static int	hvn_nvs_alloc_subchannels(struct hvn_softc *, int *);
541 
542 /* RNDIS */
543 static int	hvn_rndis_init(struct hvn_softc *);
544 static void	hvn_rndis_destroy(struct hvn_softc *);
545 static int	hvn_rndis_attach(struct hvn_softc *, int);
546 static int	hvn_rndis_cmd(struct hvn_softc *, struct rndis_cmd *, u_int);
547 static int	hvn_rndis_input(struct hvn_rx_ring *, uint64_t, void *);
548 static int	hvn_rxeof(struct hvn_rx_ring *, uint8_t *, uint32_t);
549 static void	hvn_rndis_complete(struct hvn_softc *, uint8_t *, uint32_t);
550 static int	hvn_rndis_output_sgl(struct hvn_tx_ring *,
551 		    struct hvn_tx_desc *);
552 static int	hvn_rndis_output_chim(struct hvn_tx_ring *,
553 		    struct hvn_tx_desc *);
554 static void	hvn_rndis_status(struct hvn_softc *, uint8_t *, uint32_t);
555 static int	hvn_rndis_query(struct hvn_softc *, uint32_t, void *, size_t *);
556 static int	hvn_rndis_query2(struct hvn_softc *, uint32_t, const void *,
557 		    size_t, void *, size_t *, size_t);
558 static int	hvn_rndis_set(struct hvn_softc *, uint32_t, void *, size_t);
559 static int	hvn_rndis_open(struct hvn_softc *);
560 static int	hvn_rndis_close(struct hvn_softc *);
561 static void	hvn_rndis_detach(struct hvn_softc *);
562 
563 static int
564 hvn_match(device_t parent, cfdata_t match, void *aux)
565 {
566 	struct vmbus_attach_args *aa = aux;
567 
568 	if (memcmp(aa->aa_type, &hyperv_guid_network, sizeof(*aa->aa_type)))
569 		return 0;
570 	return 1;
571 }
572 
573 static void
574 hvn_attach(device_t parent, device_t self, void *aux)
575 {
576 	struct hvn_softc *sc = device_private(self);
577 	struct vmbus_attach_args *aa = aux;
578 	struct ifnet *ifp = SC2IFP(sc);
579 	char xnamebuf[32];
580 	uint8_t enaddr[ETHER_ADDR_LEN];
581 	uint32_t mtu;
582 	int tx_ring_cnt, ring_cnt;
583 	int error;
584 
585 	sc->sc_dev = self;
586 	sc->sc_vmbus = (struct vmbus_softc *)device_private(parent);
587 	sc->sc_prichan = aa->aa_chan;
588 	sc->sc_dmat = sc->sc_vmbus->sc_dmat;
589 
590 	aprint_naive("\n");
591 	aprint_normal(": Hyper-V NetVSC\n");
592 
593 	sc->sc_txrx_workqueue = true;
594 	sc->sc_tx_process_limit = HVN_TX_PROCESS_LIMIT_DEFAULT;
595 	sc->sc_rx_process_limit = HVN_RX_PROCESS_LIMIT_DEFAULT;
596 	sc->sc_tx_intr_process_limit = HVN_TX_INTR_PROCESS_LIMIT_DEFAULT;
597 	sc->sc_rx_intr_process_limit = HVN_RX_INTR_PROCESS_LIMIT_DEFAULT;
598 	sc->sc_agg_size = hvn_tx_agg_size;
599 	sc->sc_agg_pkts = hvn_tx_agg_pkts;
600 
601 	mutex_init(&sc->sc_core_lock, MUTEX_DEFAULT, IPL_SOFTNET);
602 	mutex_init(&sc->sc_link_lock, MUTEX_DEFAULT, IPL_NET);
603 	cv_init(&sc->sc_link_cv, "hvnknkcv");
604 	callout_init(&sc->sc_link_tmout, CALLOUT_MPSAFE);
605 	callout_setfunc(&sc->sc_link_tmout, hvn_link_netchg_tmout_cb, sc);
606 	if (kthread_create(PRI_NONE, KTHREAD_MUSTJOIN | KTHREAD_MPSAFE, NULL,
607 	    hvn_link_task, sc, &sc->sc_link_lwp, "%slink",
608 	    device_xname(self))) {
609 		aprint_error_dev(self, "failed to create link thread\n");
610 		return;
611 	}
612 
613 	snprintf(xnamebuf, sizeof(xnamebuf), "%srxtx", device_xname(self));
614 	if (workqueue_create(&sc->sc_wq, xnamebuf, hvn_handle_ring_work,
615 	    sc, HVN_WORKQUEUE_PRI, IPL_NET, WQ_PERCPU | WQ_MPSAFE)) {
616 		aprint_error_dev(self, "failed to create workqueue\n");
617 		sc->sc_wq = NULL;
618 		goto destroy_link_thread;
619 	}
620 
621 	ring_cnt = hvn_channel_cnt;
622 	if (ring_cnt <= 0) {
623 		ring_cnt = ncpu;
624 		if (ring_cnt > HVN_CHANNEL_MAX_COUNT_DEFAULT)
625 			ring_cnt = HVN_CHANNEL_MAX_COUNT_DEFAULT;
626 	} else if (ring_cnt > ncpu)
627 		ring_cnt = ncpu;
628 
629 	tx_ring_cnt = hvn_tx_ring_cnt;
630 	if (tx_ring_cnt <= 0 || tx_ring_cnt > ring_cnt)
631 		tx_ring_cnt = ring_cnt;
632 
633 	if (hvn_tx_ring_create(sc, tx_ring_cnt)) {
634 		aprint_error_dev(self, "failed to create Tx ring\n");
635 		goto destroy_wq;
636 	}
637 
638 	if (hvn_rx_ring_create(sc, ring_cnt)) {
639 		aprint_error_dev(self, "failed to create Rx ring\n");
640 		goto destroy_tx_ring;
641 	}
642 
643 	strlcpy(ifp->if_xname, device_xname(sc->sc_dev), IFNAMSIZ);
644 	ifp->if_softc = sc;
645 	ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST;
646 	ifp->if_extflags = IFEF_MPSAFE;
647 	ifp->if_ioctl = hvn_ioctl;
648 	ifp->if_start = hvn_start;
649 	ifp->if_transmit = hvn_transmit;
650 	ifp->if_init = hvn_init;
651 	ifp->if_stop = hvn_stop;
652 	ifp->if_baudrate = IF_Gbps(10);
653 
654 	IFQ_SET_MAXLEN(&ifp->if_snd, uimax(HVN_TX_DESC - 1, IFQ_MAXLEN));
655 	IFQ_SET_READY(&ifp->if_snd);
656 
657 	/* Initialize ifmedia structures. */
658 	sc->sc_ec.ec_ifmedia = &sc->sc_media;
659 	ifmedia_init_with_lock(&sc->sc_media, IFM_IMASK,
660 	    hvn_media_change, hvn_media_status, &sc->sc_core_lock);
661 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_AUTO, 0, NULL);
662 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_T | IFM_FDX, 0, NULL);
663 	ifmedia_add(&sc->sc_media, IFM_ETHER | IFM_10G_T, 0, NULL);
664 	ifmedia_set(&sc->sc_media, IFM_ETHER | IFM_AUTO);
665 
666 	if_initialize(ifp);
667 	sc->sc_ipq = if_percpuq_create(ifp);
668 	if_deferred_start_init(ifp, NULL);
669 
670 	hvn_nvs_init(sc);
671 	hvn_rndis_init(sc);
672 	if (hvn_synth_attach(sc, ETHERMTU)) {
673 		aprint_error_dev(self, "failed to attach synth\n");
674 		goto destroy_if_percpuq;
675 	}
676 
677 	aprint_normal_dev(self, "NVS %d.%d NDIS %d.%d\n",
678 	    sc->sc_proto >> 16, sc->sc_proto & 0xffff,
679 	    sc->sc_ndisver >> 16, sc->sc_ndisver & 0xffff);
680 
681 	if (hvn_get_lladdr(sc, enaddr)) {
682 		aprint_error_dev(self,
683 		    "failed to obtain an ethernet address\n");
684 		goto detach_synth;
685 	}
686 	aprint_normal_dev(self, "Ethernet address %s\n", ether_sprintf(enaddr));
687 
688 	/*
689 	 * Fix up TX/RX stuff after the synthetic parts are attached.
690 	 */
691 	hvn_fixup_tx_data(sc);
692 	hvn_fixup_rx_data(sc);
693 
694 	ifp->if_capabilities |= sc->sc_txr[0].txr_caps_assist &
695 		(IFCAP_CSUM_IPv4_Tx | IFCAP_CSUM_IPv4_Rx |
696 		 IFCAP_CSUM_TCPv4_Tx | IFCAP_CSUM_TCPv4_Rx |
697 		 IFCAP_CSUM_TCPv6_Tx | IFCAP_CSUM_TCPv6_Rx |
698 		 IFCAP_CSUM_UDPv4_Tx | IFCAP_CSUM_UDPv4_Rx |
699 		 IFCAP_CSUM_UDPv6_Tx | IFCAP_CSUM_UDPv6_Rx);
700 	/* XXX TSOv4, TSOv6 */
701 	if (sc->sc_caps & HVN_CAPS_VLAN) {
702 		/* XXX not sure about VLAN_MTU. */
703 		sc->sc_ec.ec_capabilities |= ETHERCAP_VLAN_HWTAGGING;
704 		sc->sc_ec.ec_capabilities |= ETHERCAP_VLAN_MTU;
705 	}
706 	sc->sc_ec.ec_capabilities |= ETHERCAP_JUMBO_MTU;
707 
708 	ether_ifattach(ifp, enaddr);
709 
710 	error = hvn_get_mtu(sc, &mtu);
711 	if (error)
712 		mtu = ETHERMTU;
713 	if (mtu < ETHERMTU) {
714 		DPRINTF("%s: fixup mtu %u -> %u\n", device_xname(sc->sc_dev),
715 		    ETHERMTU, mtu);
716 		ifp->if_mtu = mtu;
717 	}
718 
719 	if_register(ifp);
720 
721 	/*
722 	 * Kick off link status check.
723 	 */
724 	hvn_link_event(sc, HVN_LINK_EV_STATE_CHANGE);
725 
726 	hvn_init_sysctls(sc);
727 
728 	if (pmf_device_register(self, NULL, NULL))
729 		pmf_class_network_register(self, ifp);
730 	else
731 		aprint_error_dev(self, "couldn't establish power handler\n");
732 
733 	SET(sc->sc_flags, HVN_SCF_ATTACHED);
734 	return;
735 
736 detach_synth:
737 	hvn_synth_detach(sc);
738 	hvn_rndis_destroy(sc);
739 	hvn_nvs_destroy(sc);
740 destroy_if_percpuq:
741 	if_percpuq_destroy(sc->sc_ipq);
742 	hvn_rx_ring_destroy(sc);
743 destroy_tx_ring:
744 	hvn_tx_ring_destroy(sc);
745 destroy_wq:
746 	workqueue_destroy(sc->sc_wq);
747 	sc->sc_wq = NULL;
748 destroy_link_thread:
749 	hvn_link_event(sc, HVN_LINK_EV_EXIT_THREAD);
750 	kthread_join(sc->sc_link_lwp);
751 	callout_destroy(&sc->sc_link_tmout);
752 	cv_destroy(&sc->sc_link_cv);
753 	mutex_destroy(&sc->sc_link_lock);
754 	mutex_destroy(&sc->sc_core_lock);
755 }
756 
757 static int
758 hvn_detach(device_t self, int flags)
759 {
760 	struct hvn_softc *sc = device_private(self);
761 	struct ifnet *ifp = SC2IFP(sc);
762 
763 	if (!ISSET(sc->sc_flags, HVN_SCF_ATTACHED))
764 		return 0;
765 
766 	if (vmbus_channel_is_revoked(sc->sc_prichan))
767 		SET(sc->sc_flags, HVN_SCF_REVOKED);
768 
769 	pmf_device_deregister(self);
770 
771 	mutex_enter(&sc->sc_core_lock);
772 
773 	if (ifp->if_flags & IFF_RUNNING)
774 		hvn_stop_locked(ifp);
775 	/*
776 	 * NOTE:
777 	 * hvn_stop() only suspends data, so management
778 	 * stuff has to be suspended manually here.
779 	 */
780 	hvn_suspend_mgmt(sc);
781 
782 	ether_ifdetach(ifp);
783 	if_detach(ifp);
784 	if_percpuq_destroy(sc->sc_ipq);
785 
786 	hvn_link_event(sc, HVN_LINK_EV_EXIT_THREAD);
787 	kthread_join(sc->sc_link_lwp);
788 	callout_halt(&sc->sc_link_tmout, NULL);
789 
790 	hvn_synth_detach(sc);
791 	hvn_rndis_destroy(sc);
792 	hvn_nvs_destroy(sc);
793 
794 	mutex_exit(&sc->sc_core_lock);
795 
796 	hvn_rx_ring_destroy(sc);
797 	hvn_tx_ring_destroy(sc);
798 
799 	workqueue_destroy(sc->sc_wq);
800 	callout_destroy(&sc->sc_link_tmout);
801 	cv_destroy(&sc->sc_link_cv);
802 	mutex_destroy(&sc->sc_link_lock);
803 	mutex_destroy(&sc->sc_core_lock);
804 
805 	sysctl_teardown(&sc->sc_sysctllog);
806 
807 	return 0;
808 }
809 
810 static int
811 hvn_ioctl(struct ifnet *ifp, u_long command, void * data)
812 {
813 	struct hvn_softc *sc = IFP2SC(ifp);
814 	struct ifreq *ifr = (struct ifreq *)data;
815 	uint32_t mtu;
816 	int s, error = 0;
817 
818 	switch (command) {
819 	case SIOCSIFMTU:
820 		if (ifr->ifr_mtu < HVN_MTU_MIN || ifr->ifr_mtu > HVN_MTU_MAX) {
821 			error = EINVAL;
822 			break;
823 		}
824 
825 		mutex_enter(&sc->sc_core_lock);
826 
827 		if (!(sc->sc_caps & HVN_CAPS_MTU)) {
828 			/* Can't change MTU */
829 			mutex_exit(&sc->sc_core_lock);
830 			error = EOPNOTSUPP;
831 			break;
832 		}
833 
834 		if (ifp->if_mtu == ifr->ifr_mtu) {
835 			mutex_exit(&sc->sc_core_lock);
836 			break;
837 		}
838 
839 		/*
840 		 * Suspend this interface before the synthetic parts
841 	 * are torn down.
842 		 */
843 		hvn_suspend(sc);
844 
845 		/*
846 	 * Detach the synthetic parts, i.e. NVS and RNDIS.
847 		 */
848 		hvn_synth_detach(sc);
849 
850 		/*
851 		 * Reattach the synthetic parts, i.e. NVS and RNDIS,
852 		 * with the new MTU setting.
853 		 */
854 		error = hvn_synth_attach(sc, ifr->ifr_mtu);
855 		if (error) {
856 			mutex_exit(&sc->sc_core_lock);
857 			break;
858 		}
859 
860 		error = hvn_get_mtu(sc, &mtu);
861 		if (error)
862 			mtu = ifr->ifr_mtu;
863 		DPRINTF("%s: RNDIS mtu=%u\n", device_xname(sc->sc_dev), mtu);
864 
865 		/*
866 		 * Commit the requested MTU, after the synthetic parts
867 		 * have been successfully attached.
868 		 */
869 		if (mtu >= ifr->ifr_mtu) {
870 			mtu = ifr->ifr_mtu;
871 		} else {
872 			DPRINTF("%s: fixup mtu %d -> %u\n",
873 			    device_xname(sc->sc_dev), ifr->ifr_mtu, mtu);
874 		}
875 		ifp->if_mtu = mtu;
876 
877 		/*
878 		 * Synthetic parts' reattach may change the chimney
879 		 * sending size; update it.
880 		 */
881 		if (sc->sc_txr[0].txr_chim_size > sc->sc_chim_szmax)
882 			hvn_set_chim_size(sc, sc->sc_chim_szmax);
883 
884 		/*
885 		 * All done!  Resume the interface now.
886 		 */
887 		hvn_resume(sc);
888 
889 		mutex_exit(&sc->sc_core_lock);
890 		break;
891 	default:
892 		s = splnet();
893 		if (command == SIOCGIFMEDIA || command == SIOCSIFMEDIA)
894 			error = ifmedia_ioctl(ifp, ifr, &sc->sc_media, command);
895 		else
896 			error = ether_ioctl(ifp, command, data);
897 		splx(s);
898 		if (error == ENETRESET) {
899 			mutex_enter(&sc->sc_core_lock);
900 			if (ifp->if_flags & IFF_RUNNING)
901 				hvn_init_locked(ifp);
902 			mutex_exit(&sc->sc_core_lock);
903 			error = 0;
904 		}
905 		break;
906 	}
907 
908 	return error;
909 }
910 
911 static int
912 hvn_media_change(struct ifnet *ifp)
913 {
914 	struct hvn_softc *sc = IFP2SC(ifp);
915 	struct ifmedia *ifm = &sc->sc_media;
916 
917 	if (IFM_TYPE(ifm->ifm_media) != IFM_ETHER)
918 		return EINVAL;
919 
920 	switch (IFM_SUBTYPE(ifm->ifm_media)) {
921 	case IFM_AUTO:
922 		break;
923 	default:
924 		device_printf(sc->sc_dev, "Only auto media type is supported\n");
925 		return EINVAL;
926 	}
927 	return 0;
928 }
929 
930 static void
931 hvn_media_status(struct ifnet *ifp, struct ifmediareq *ifmr)
932 {
933 	struct hvn_softc *sc = IFP2SC(ifp);
934 
935 	ifmr->ifm_status = IFM_AVALID;
936 	ifmr->ifm_active = IFM_ETHER;
937 
938 	if (sc->sc_link_state != LINK_STATE_UP) {
939 		ifmr->ifm_active |= IFM_NONE;
940 		return;
941 	}
942 
943 	ifmr->ifm_status |= IFM_ACTIVE;
944 	ifmr->ifm_active |= IFM_10G_T | IFM_FDX;
945 }
946 
947 static void
948 hvn_link_task(void *arg)
949 {
950 	struct hvn_softc *sc = arg;
951 	struct ifnet *ifp = SC2IFP(sc);
952 	uint32_t event;
953 	int old_link_state;
954 
955 	mutex_enter(&sc->sc_link_lock);
956 	sc->sc_link_onproc = false;
957 	for (;;) {
958 		if (sc->sc_link_ev == 0) {
959 			cv_wait(&sc->sc_link_cv, &sc->sc_link_lock);
960 			continue;
961 		}
962 
963 		sc->sc_link_onproc = true;
964 		event = sc->sc_link_ev;
965 		sc->sc_link_ev = 0;
966 		mutex_exit(&sc->sc_link_lock);
967 
968 		if (event & HVN_LINK_EV_EXIT_THREAD)
969 			break;
970 
971 		if (sc->sc_link_suspend)
972 			goto next;
973 
974 		if (event & HVN_LINK_EV_RESUME_NETWORK) {
975 			if (sc->sc_link_pending)
976 				event |= HVN_LINK_EV_NETWORK_CHANGE;
977 			else
978 				event |= HVN_LINK_EV_STATE_CHANGE;
979 		}
980 
981 		if (event & HVN_LINK_EV_NETWORK_CHANGE) {
982 			/* Prevent any link status checks from running. */
983 			sc->sc_link_pending = true;
984 
985 			/*
986 			 * Fake up a [link down --> link up] state change;
987 			 * a 5 second delay is used, which closely simulates
988 			 * the miibus reaction to a link down event.
989 			 */
990 			old_link_state = sc->sc_link_state;
991 			sc->sc_link_state = LINK_STATE_DOWN;
992 			if (old_link_state != sc->sc_link_state) {
993 				if_link_state_change(ifp, LINK_STATE_DOWN);
994 			}
995 #if defined(HVN_LINK_STATE_CHANGE_DELAY) && HVN_LINK_STATE_CHANGE_DELAY > 0
996 			callout_schedule(&sc->sc_link_tmout,
997 			    mstohz(HVN_LINK_STATE_CHANGE_DELAY));
998 #else
999 			hvn_link_event(sc, HVN_LINK_EV_NETWORK_CHANGE_TMOUT);
1000 #endif
1001 		} else if (event & HVN_LINK_EV_NETWORK_CHANGE_TMOUT) {
1002 			/* Re-allow link status checks. */
1003 			sc->sc_link_pending = false;
1004 			hvn_update_link_status(sc);
1005 		} else if (event & HVN_LINK_EV_STATE_CHANGE) {
1006 			if (!sc->sc_link_pending)
1007 				hvn_update_link_status(sc);
1008 		}
1009  next:
1010 		mutex_enter(&sc->sc_link_lock);
1011 		sc->sc_link_onproc = false;
1012 	}
1013 
1014 	mutex_enter(&sc->sc_link_lock);
1015 	sc->sc_link_onproc = false;
1016 	mutex_exit(&sc->sc_link_lock);
1017 
1018 	kthread_exit(0);
1019 }
1020 
1021 static void
1022 hvn_link_event(struct hvn_softc *sc, uint32_t ev)
1023 {
1024 
1025 	mutex_enter(&sc->sc_link_lock);
1026 	SET(sc->sc_link_ev, ev);
1027 	cv_signal(&sc->sc_link_cv);
1028 	mutex_exit(&sc->sc_link_lock);
1029 }
1030 
1031 static void
1032 hvn_link_netchg_tmout_cb(void *arg)
1033 {
1034 	struct hvn_softc *sc = arg;
1035 
1036 	hvn_link_event(sc, HVN_LINK_EV_NETWORK_CHANGE_TMOUT);
1037 }
1038 
1039 static int
1040 hvn_init(struct ifnet *ifp)
1041 {
1042 	struct hvn_softc *sc = IFP2SC(ifp);
1043 	int error;
1044 
1045 	mutex_enter(&sc->sc_core_lock);
1046 	error = hvn_init_locked(ifp);
1047 	mutex_exit(&sc->sc_core_lock);
1048 
1049 	return error;
1050 }
1051 
1052 static int
1053 hvn_init_locked(struct ifnet *ifp)
1054 {
1055 	struct hvn_softc *sc = IFP2SC(ifp);
1056 	int error;
1057 
1058 	KASSERT(mutex_owned(&sc->sc_core_lock));
1059 
1060 	hvn_stop_locked(ifp);
1061 
1062 	error = hvn_rndis_open(sc);
1063 	if (error)
1064 		return error;
1065 
1066 	/* Clear TX 'suspended' bit. */
1067 	hvn_resume_tx(sc, sc->sc_ntxr_inuse);
1068 
1069 	/* Everything is ready; unleash! */
1070 	ifp->if_flags |= IFF_RUNNING;
1071 
1072 	return 0;
1073 }
1074 
1075 static void
1076 hvn_stop(struct ifnet *ifp, int disable)
1077 {
1078 	struct hvn_softc *sc = IFP2SC(ifp);
1079 
1080 	mutex_enter(&sc->sc_core_lock);
1081 	hvn_stop_locked(ifp);
1082 	mutex_exit(&sc->sc_core_lock);
1083 }
1084 
1085 static void
1086 hvn_stop_locked(struct ifnet *ifp)
1087 {
1088 	struct hvn_softc *sc = IFP2SC(ifp);
1089 	int i;
1090 
1091 	KASSERT(mutex_owned(&sc->sc_core_lock));
1092 
1093 	/* Clear RUNNING bit ASAP. */
1094 	ifp->if_flags &= ~IFF_RUNNING;
1095 
1096 	/* Suspend data transfers. */
1097 	hvn_suspend_data(sc);
1098 
1099 	/* Clear OACTIVE state. */
1100 	for (i = 0; i < sc->sc_ntxr_inuse; i++)
1101 		sc->sc_txr[i].txr_oactive = 0;
1102 }
1103 
1104 static void
1105 hvn_transmit_common(struct ifnet *ifp, struct hvn_tx_ring *txr,
1106     bool is_transmit)
1107 {
1108 	struct hvn_tx_desc *txd;
1109 	struct mbuf *m;
1110 	int l2hlen = ETHER_HDR_LEN;
1111 
1112 	KASSERT(mutex_owned(&txr->txr_lock));
1113 
1114 	if (!(ifp->if_flags & IFF_RUNNING))
1115 		return;
1116 	if (txr->txr_oactive)
1117 		return;
1118 	if (txr->txr_suspended)
1119 		return;
1120 
1121 	for (;;) {
1122 		if (!hvn_txd_peek(txr)) {
1123 			/* transient */
1124 			txr->txr_oactive = 1;
1125 			txr->txr_evnodesc.ev_count++;
1126 			break;
1127 		}
1128 
1129 		if (is_transmit)
1130 			m = pcq_get(txr->txr_interq);
1131 		else
1132 			IFQ_DEQUEUE(&ifp->if_snd, m);
1133 		if (m == NULL)
1134 			break;
1135 
1136 #if defined(INET) || defined(INET6)
1137 		if (m->m_pkthdr.csum_flags &
1138 		    (M_CSUM_TCPv4|M_CSUM_UDPv4|M_CSUM_TCPv6|M_CSUM_UDPv6)) {
1139 			m = hvn_set_hlen(m, &l2hlen);
1140 			if (__predict_false(m == NULL)) {
1141 				if_statinc(ifp, if_oerrors);
1142 				continue;
1143 			}
1144 		}
1145 #endif
1146 
1147 		txd = hvn_txd_get(txr);
1148 		if (hvn_encap(txr, txd, m, l2hlen)) {
1149 			/* the chain is too large */
1150 			if_statinc(ifp, if_oerrors);
1151 			hvn_txd_put(txr, txd);
1152 			m_freem(m);
1153 			continue;
1154 		}
1155 
1156 		if (txr->txr_agg_pktleft == 0) {
1157 			if (txr->txr_agg_txd != NULL) {
1158 				hvn_flush_txagg(txr);
1159 			} else {
1160 				if (hvn_txpkt(txr, txd)) {
1161 					/* txd is freed, but m is not. */
1162 					m_freem(m);
1163 					if_statinc(ifp, if_oerrors);
1164 				}
1165 			}
1166 		}
1167 	}
1168 
1169 	/* Flush any pending aggregated transmission. */
1170 	if (txr->txr_agg_txd != NULL)
1171 		hvn_flush_txagg(txr);
1172 }
1173 
1174 static void
1175 hvn_start(struct ifnet *ifp)
1176 {
1177 	struct hvn_softc *sc = IFP2SC(ifp);
1178 	struct hvn_tx_ring *txr = &sc->sc_txr[0];
1179 
1180 	mutex_enter(&txr->txr_lock);
1181 	hvn_transmit_common(ifp, txr, false);
1182 	mutex_exit(&txr->txr_lock);
1183 }
1184 
1185 static int
1186 hvn_select_txqueue(struct ifnet *ifp, struct mbuf *m __unused)
1187 {
1188 	struct hvn_softc *sc = IFP2SC(ifp);
1189 	u_int cpu;
1190 
1191 	cpu = cpu_index(curcpu());
1192 
1193 	return cpu % sc->sc_ntxr_inuse;
1194 }
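/*
 * Queue selection is simply the current CPU index modulo the number of
 * TX rings in use, so a given CPU consistently feeds the same ring.
 */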
1195 
1196 static int
1197 hvn_transmit(struct ifnet *ifp, struct mbuf *m)
1198 {
1199 	struct hvn_softc *sc = IFP2SC(ifp);
1200 	struct hvn_tx_ring *txr;
1201 	int qid;
1202 
1203 	qid = hvn_select_txqueue(ifp, m);
1204 	txr = &sc->sc_txr[qid];
1205 
1206 	if (__predict_false(!pcq_put(txr->txr_interq, m))) {
1207 		mutex_enter(&txr->txr_lock);
1208 		txr->txr_evpcqdrop.ev_count++;
1209 		mutex_exit(&txr->txr_lock);
1210 		m_freem(m);
1211 		return ENOBUFS;
1212 	}
1213 
1214 	kpreempt_disable();
1215 	softint_schedule(txr->txr_si);
1216 	kpreempt_enable();
1217 	return 0;
1218 }
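/*
 * Design note: hvn_transmit() only enqueues the mbuf on the ring's
 * lockless pcq and schedules the per-ring softint; the actual
 * encapsulation and channel I/O happen in hvn_deferred_transmit(),
 * serialized under txr_lock.  This keeps the if_transmit fast path
 * cheap and MP-safe.
 */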
1219 
1220 static void
1221 hvn_deferred_transmit(void *arg)
1222 {
1223 	struct hvn_tx_ring *txr = arg;
1224 	struct hvn_softc *sc = txr->txr_softc;
1225 	struct ifnet *ifp = SC2IFP(sc);
1226 
1227 	mutex_enter(&txr->txr_lock);
1228 	txr->txr_evtransmitdefer.ev_count++;
1229 	hvn_transmit_common(ifp, txr, true);
1230 	mutex_exit(&txr->txr_lock);
1231 }
1232 
1233 static inline char *
1234 hvn_rndis_pktinfo_append(struct rndis_packet_msg *pkt, size_t pktsize,
1235     size_t datalen, uint32_t type)
1236 {
1237 	struct rndis_pktinfo *pi;
1238 	size_t pi_size = sizeof(*pi) + datalen;
1239 	char *cp;
1240 
1241 	KASSERT(pkt->rm_pktinfooffset + pkt->rm_pktinfolen + pi_size <=
1242 	    pktsize);
1243 
1244 	cp = (char *)pkt + pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1245 	pi = (struct rndis_pktinfo *)cp;
1246 	pi->rm_size = pi_size;
1247 	pi->rm_type = type;
1248 	pi->rm_pktinfooffset = sizeof(*pi);
1249 	pkt->rm_pktinfolen += pi_size;
1250 	pkt->rm_dataoffset += pi_size;
1251 	pkt->rm_len += pi_size;
1252 
1253 	return (char *)pi->rm_data;
1254 }
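/*
 * Resulting layout after one append (informal sketch; the packet data
 * itself follows the pktinfo area at rm_dataoffset):
 *
 *	+--------------------------+ <- pkt
 *	| struct rndis_packet_msg  |
 *	+--------------------------+ <- rm_pktinfooffset
 *	| struct rndis_pktinfo     |
 *	| rm_data[datalen]         | <- returned pointer
 *	+--------------------------+
 *
 * Each call grows rm_pktinfolen, rm_dataoffset and rm_len by
 * sizeof(struct rndis_pktinfo) + datalen.
 */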
1255 
1256 static struct mbuf *
1257 hvn_pullup_hdr(struct mbuf *m, int len)
1258 {
1259 	struct mbuf *mn;
1260 
1261 	if (__predict_false(m->m_len < len)) {
1262 		mn = m_pullup(m, len);
1263 		if (mn == NULL)
1264 			return NULL;
1265 		m = mn;
1266 	}
1267 	return m;
1268 }
1269 
1270 /*
1271  * NOTE: If this function fails, the mbuf is freed.
1272  */
1273 static struct mbuf *
1274 hvn_set_hlen(struct mbuf *m, int *l2hlenp)
1275 {
1276 	const struct ether_header *eh;
1277 	int l2hlen, off;
1278 
1279 	m = hvn_pullup_hdr(m, sizeof(*eh));
1280 	if (m == NULL)
1281 		return NULL;
1282 
1283 	eh = mtod(m, const struct ether_header *);
1284 	if (eh->ether_type == ntohs(ETHERTYPE_VLAN))
1285 		l2hlen = ETHER_HDR_LEN + ETHER_VLAN_ENCAP_LEN;
1286 	else
1287 		l2hlen = ETHER_HDR_LEN;
1288 
1289 #if defined(INET)
1290 	if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv4 | M_CSUM_UDPv4)) {
1291 		const struct ip *ip;
1292 
1293 		off = l2hlen + sizeof(*ip);
1294 		m = hvn_pullup_hdr(m, off);
1295 		if (m == NULL)
1296 			return NULL;
1297 
1298 		ip = (struct ip *)((mtod(m, uint8_t *)) + l2hlen);
1299 
1300 		/*
1301 		 * UDP checksum offload does not work on Azure if the
1302 		 * following conditions are met:
1303 		 * - sizeof(IP hdr + UDP hdr + payload) > 1420.
1304 		 * - IP_DF is not set in the IP hdr.
1305 		 *
1306 		 * Fall back to software checksum for these UDP datagrams.
1307 		 */
1308 		if ((m->m_pkthdr.csum_flags & M_CSUM_UDPv4) &&
1309 		    m->m_pkthdr.len > hvn_udpcs_fixup_mtu + l2hlen &&
1310 		    !(ntohs(ip->ip_off) & IP_DF)) {
1311 			uint16_t *csump;
1312 
1313 			off = l2hlen +
1314 			    M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data);
1315 			m = hvn_pullup_hdr(m, off + sizeof(struct udphdr));
1316 			if (m == NULL)
1317 				return NULL;
1318 
1319 			csump = (uint16_t *)(mtod(m, uint8_t *) + off +
1320 			    M_CSUM_DATA_IPv4_OFFSET(m->m_pkthdr.csum_data));
1321 			*csump = cpu_in_cksum(m, m->m_pkthdr.len - off, off, 0);
1322 			m->m_pkthdr.csum_flags &= ~M_CSUM_UDPv4;
1323 		}
1324 	}
1325 #endif	/* INET */
1326 #if defined(INET) && defined(INET6)
1327 	else
1328 #endif	/* INET && INET6 */
1329 #if defined(INET6)
1330 	{
1331 		const struct ip6_hdr *ip6;
1332 
1333 		off = l2hlen + sizeof(*ip6);
1334 		m = hvn_pullup_hdr(m, off);
1335 		if (m == NULL)
1336 			return NULL;
1337 
1338 		ip6 = (struct ip6_hdr *)((mtod(m, uint8_t *)) + l2hlen);
1339 		if (ip6->ip6_nxt != IPPROTO_TCP &&
1340 		    ip6->ip6_nxt != IPPROTO_UDP) {
1341 			m_freem(m);
1342 			return NULL;
1343 		}
1344 	}
1345 #endif	/* INET6 */
1346 
1347 	*l2hlenp = l2hlen;
1348 
1349 	return m;
1350 }
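/*
 * Example (illustrative): for an untagged IPv4/TCP frame, *l2hlenp is
 * set to ETHER_HDR_LEN (14); hvn_encap() then adds the IP header
 * length from M_CSUM_DATA_IPv4_IPHL() (typically 20) to build a
 * NDIS_TXCSUM_INFO_MKTCPCS() offset of 34.
 */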
1351 
1352 static int
1353 hvn_flush_txagg(struct hvn_tx_ring *txr)
1354 {
1355 	struct hvn_softc *sc = txr->txr_softc;
1356 	struct ifnet *ifp = SC2IFP(sc);
1357 	struct hvn_tx_desc *txd;
1358 	struct mbuf *m;
1359 	int error, pkts;
1360 
1361 	txd = txr->txr_agg_txd;
1362 	KASSERTMSG(txd != NULL, "no aggregate txdesc");
1363 
1364 	/*
1365 	 * Since hvn_txpkt() will reset this temporary stat, save
1366 	 * it now, so that oerrors can be updated properly if
1367 	 * hvn_txpkt() ever fails.
1368 	 */
1369 	pkts = txr->txr_stat_pkts;
1370 
1371 	/*
1372 	 * Since txd's mbuf will _not_ be freed upon hvn_txpkt()
1373 	 * failure, save it here so that it can be freed if
1374 	 * hvn_txpkt() fails.
1375 	 */
1376 	m = txd->txd_buf;
1377 	error = hvn_txpkt(txr, txd);
1378 	if (__predict_false(error)) {
1379 		/* txd is freed, but m is not. */
1380 		m_freem(m);
1381 		txr->txr_evflushfailed.ev_count++;
1382 		if_statadd(ifp, if_oerrors, pkts);
1383 	}
1384 
1385 	/* Reset all aggregation states. */
1386 	txr->txr_agg_txd = NULL;
1387 	txr->txr_agg_szleft = 0;
1388 	txr->txr_agg_pktleft = 0;
1389 	txr->txr_agg_prevpkt = NULL;
1390 
1391 	return error;
1392 }
1393 
1394 static void *
1395 hvn_try_txagg(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd, int pktsz)
1396 {
1397 	struct hvn_softc *sc = txr->txr_softc;
1398 	struct hvn_tx_desc *agg_txd;
1399 	struct rndis_packet_msg *pkt;
1400 	void *chim;
1401 	int olen;
1402 
1403 	if (txr->txr_agg_txd != NULL) {
1404 		if (txr->txr_agg_pktleft > 0 && txr->txr_agg_szleft > pktsz) {
1405 			agg_txd = txr->txr_agg_txd;
1406 			pkt = txr->txr_agg_prevpkt;
1407 
1408 			/*
1409 			 * Update the previous RNDIS packet's total length;
1410 			 * it can be increased due to the mandatory alignment
1411 			 * padding for this RNDIS packet.  Also update the
1412 			 * aggregating txdesc's chimney sending buffer size
1413 			 * accordingly.
1414 			 *
1415 			 * XXX
1416 			 * Zero-out the padding, as required by the RNDIS spec.
1417 			 */
1418 			olen = pkt->rm_len;
1419 			pkt->rm_len = roundup2(olen, txr->txr_agg_align);
1420 			agg_txd->txd_chim_size += pkt->rm_len - olen;
1421 
1422 			/* Link this txdesc to the parent. */
1423 			hvn_txd_agg(agg_txd, txd);
1424 
1425 			chim = (uint8_t *)pkt + pkt->rm_len;
1426 			/* Save the current packet for later fixup. */
1427 			txr->txr_agg_prevpkt = chim;
1428 
1429 			txr->txr_agg_pktleft--;
1430 			txr->txr_agg_szleft -= pktsz;
1431 			if (txr->txr_agg_szleft <=
1432 			    HVN_PKTSIZE_MIN(txr->txr_agg_align)) {
1433 				/*
1434 				 * Probably can't aggregate more packets;
1435 				 * flush this aggregating txdesc proactively.
1436 				 */
1437 				txr->txr_agg_pktleft = 0;
1438 			}
1439 
1440 			/* Done! */
1441 			return chim;
1442 		}
1443 		hvn_flush_txagg(txr);
1444 	}
1445 
1446 	txr->txr_evchimneytried.ev_count++;
1447 	txd->txd_chim_index = hvn_chim_alloc(sc);
1448 	if (txd->txd_chim_index == HVN_NVS_CHIM_IDX_INVALID)
1449 		return NULL;
1450 	txr->txr_evchimney.ev_count++;
1451 
1452 	chim = sc->sc_chim + (txd->txd_chim_index * sc->sc_chim_szmax);
1453 
1454 	if (txr->txr_agg_pktmax > 1 &&
1455 	    txr->txr_agg_szmax > pktsz + HVN_PKTSIZE_MIN(txr->txr_agg_align)) {
1456 		txr->txr_agg_txd = txd;
1457 		txr->txr_agg_pktleft = txr->txr_agg_pktmax - 1;
1458 		txr->txr_agg_szleft = txr->txr_agg_szmax - pktsz;
1459 		txr->txr_agg_prevpkt = chim;
1460 	}
1461 
1462 	return chim;
1463 }
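/*
 * Aggregation walk-through (informal): the first packet that fits the
 * chimney claims a chimney slot and, if txr_agg_pktmax/txr_agg_szmax
 * allow, becomes txr_agg_txd.  Follow-up packets are appended at the
 * alignment-rounded end of the previous RNDIS packet until the packet
 * or size budget runs out, at which point hvn_flush_txagg() sends the
 * whole slot in one NVS transaction.
 */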
1464 
1465 static int
1466 hvn_encap(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd, struct mbuf *m,
1467     int l2hlen)
1468 {
1469 	/* Used to pad ethernet frames with < ETHER_MIN_LEN bytes */
1470 	static const char zero_pad[ETHER_MIN_LEN];
1471 	struct hvn_softc *sc = txr->txr_softc;
1472 	struct rndis_packet_msg *pkt;
1473 	bus_dma_segment_t *seg;
1474 	void *chim = NULL;
1475 	size_t pktlen, pktsize;
1476 	int l3hlen;
1477 	int i, rv;
1478 
1479 	if (ISSET(sc->sc_caps, HVN_CAPS_VLAN) && !vlan_has_tag(m)) {
1480 		struct ether_vlan_header *evl;
1481 
1482 		m = hvn_pullup_hdr(m, sizeof(*evl));
1483 		if (m == NULL) {
1484 			DPRINTF("%s: failed to pullup mbuf\n",
1485 			    device_xname(sc->sc_dev));
1486 			return -1;
1487 		}
1488 
1489 		evl = mtod(m, struct ether_vlan_header *);
1490 		if (evl->evl_encap_proto == htons(ETHERTYPE_VLAN)) {
1491 			struct ether_header *eh;
1492 			uint16_t proto = evl->evl_proto;
1493 
1494 			vlan_set_tag(m, ntohs(evl->evl_tag));
1495 
1496 			/*
1497 			 * Trim VLAN tag from header.
1498 			 */
1499 			memmove((uint8_t *)evl + ETHER_VLAN_ENCAP_LEN,
1500 			    evl, ETHER_HDR_LEN);
1501 			m_adj(m, ETHER_VLAN_ENCAP_LEN);
1502 
1503 			eh = mtod(m, struct ether_header *);
1504 			eh->ether_type = proto;
1505 
1506 			/*
1507 			 * Re-padding.  See sys/net/if_vlan.c:vlan_start().
1508 			 */
1509 			if (m->m_pkthdr.len < (ETHER_MIN_LEN - ETHER_CRC_LEN +
1510 			    ETHER_VLAN_ENCAP_LEN)) {
1511 				m_copyback(m, m->m_pkthdr.len,
1512 				    (ETHER_MIN_LEN - ETHER_CRC_LEN +
1513 				     ETHER_VLAN_ENCAP_LEN) -
1514 				    m->m_pkthdr.len, zero_pad);
1515 			}
1516 
1517 			txr->txr_evvlanfixup.ev_count++;
1518 		}
1519 	}
1520 
1521 	pkt = txd->txd_req;
1522 	pktsize = HVN_PKTSIZE(m, txr->txr_agg_align);
1523 	if (pktsize < txr->txr_chim_size) {
1524 		chim = hvn_try_txagg(txr, txd, pktsize);
1525 		if (chim != NULL)
1526 			pkt = chim;
1527 	} else {
1528 		if (txr->txr_agg_txd != NULL)
1529 			hvn_flush_txagg(txr);
1530 	}
1531 
1532 	memset(pkt, 0, HVN_RNDIS_PKT_LEN);
1533 	pkt->rm_type = REMOTE_NDIS_PACKET_MSG;
1534 	pkt->rm_len = sizeof(*pkt) + m->m_pkthdr.len;
1535 	pkt->rm_dataoffset = RNDIS_DATA_OFFSET;
1536 	pkt->rm_datalen = m->m_pkthdr.len;
1537 	pkt->rm_pktinfooffset = sizeof(*pkt); /* adjusted below */
1538 	pkt->rm_pktinfolen = 0;
1539 
1540 	if (txr->txr_flags & HVN_TXR_FLAG_UDP_HASH) {
1541 		char *cp;
1542 
1543 		/*
1544 		 * Set the hash value for this packet, so that the host can
1545 		 * dispatch the TX done event for this packet back to this TX
1546 		 * ring's channel.
1547 		 */
1548 		cp = hvn_rndis_pktinfo_append(pkt, HVN_RNDIS_PKT_LEN,
1549 		    HVN_NDIS_HASH_VALUE_SIZE, HVN_NDIS_PKTINFO_TYPE_HASHVAL);
1550 		memcpy(cp, &txr->txr_id, HVN_NDIS_HASH_VALUE_SIZE);
1551 	}
1552 
1553 	if (vlan_has_tag(m)) {
1554 		uint32_t vlan;
1555 		char *cp;
1556 		uint16_t tag;
1557 
1558 		tag = vlan_get_tag(m);
1559 		vlan = NDIS_VLAN_INFO_MAKE(EVL_VLANOFTAG(tag),
1560 		    EVL_PRIOFTAG(tag), EVL_CFIOFTAG(tag));
1561 		cp = hvn_rndis_pktinfo_append(pkt, HVN_RNDIS_PKT_LEN,
1562 		    NDIS_VLAN_INFO_SIZE, NDIS_PKTINFO_TYPE_VLAN);
1563 		memcpy(cp, &vlan, NDIS_VLAN_INFO_SIZE);
1564 		txr->txr_evvlanhwtagging.ev_count++;
1565 	}
1566 
1567 	if (m->m_pkthdr.csum_flags & txr->txr_csum_assist) {
1568 		uint32_t csum;
1569 		char *cp;
1570 
1571 		if (m->m_pkthdr.csum_flags & (M_CSUM_TCPv6 | M_CSUM_UDPv6)) {
1572 			csum = NDIS_TXCSUM_INFO_IPV6;
1573 			l3hlen = M_CSUM_DATA_IPv6_IPHL(m->m_pkthdr.csum_data);
1574 			if (m->m_pkthdr.csum_flags & M_CSUM_TCPv6)
1575 				csum |= NDIS_TXCSUM_INFO_MKTCPCS(l2hlen +
1576 				    l3hlen);
1577 			if (m->m_pkthdr.csum_flags & M_CSUM_UDPv6)
1578 				csum |= NDIS_TXCSUM_INFO_MKUDPCS(l2hlen +
1579 				    l3hlen);
1580 		} else {
1581 			csum = NDIS_TXCSUM_INFO_IPV4;
1582 			l3hlen = M_CSUM_DATA_IPv4_IPHL(m->m_pkthdr.csum_data);
1583 			if (m->m_pkthdr.csum_flags & M_CSUM_IPv4)
1584 				csum |= NDIS_TXCSUM_INFO_IPCS;
1585 			if (m->m_pkthdr.csum_flags & M_CSUM_TCPv4)
1586 				csum |= NDIS_TXCSUM_INFO_MKTCPCS(l2hlen +
1587 				    l3hlen);
1588 			if (m->m_pkthdr.csum_flags & M_CSUM_UDPv4)
1589 				csum |= NDIS_TXCSUM_INFO_MKUDPCS(l2hlen +
1590 				    l3hlen);
1591 		}
1592 		cp = hvn_rndis_pktinfo_append(pkt, HVN_RNDIS_PKT_LEN,
1593 		    NDIS_TXCSUM_INFO_SIZE, NDIS_PKTINFO_TYPE_CSUM);
1594 		memcpy(cp, &csum, NDIS_TXCSUM_INFO_SIZE);
1595 	}
1596 
1597 	pktlen = pkt->rm_pktinfooffset + pkt->rm_pktinfolen;
1598 	pkt->rm_pktinfooffset -= RNDIS_HEADER_OFFSET;
1599 
1600 	/*
1601 	 * Fast path: Chimney sending.
1602 	 */
1603 	if (chim != NULL) {
1604 		struct hvn_tx_desc *tgt_txd;
1605 
1606 		tgt_txd = (txr->txr_agg_txd != NULL) ? txr->txr_agg_txd : txd;
1607 
1608 		KASSERTMSG(pkt == chim,
1609 		    "RNDIS pkt not in chimney sending buffer");
1610 		KASSERTMSG(tgt_txd->txd_chim_index != HVN_NVS_CHIM_IDX_INVALID,
1611 		    "chimney sending buffer is not used");
1612 
1613 		tgt_txd->txd_chim_size += pkt->rm_len;
1614 		m_copydata(m, 0, m->m_pkthdr.len, (uint8_t *)chim + pktlen);
1615 
1616 		txr->txr_sendpkt = hvn_rndis_output_chim;
1617 		goto done;
1618 	}
1619 
1620 	KASSERTMSG(txr->txr_agg_txd == NULL, "aggregating sglist txdesc");
1621 	KASSERTMSG(txd->txd_chim_index == HVN_NVS_CHIM_IDX_INVALID,
1622 	    "chimney buffer is used");
1623 	KASSERTMSG(pkt == txd->txd_req, "RNDIS pkt not in txdesc");
1624 
1625 	rv = bus_dmamap_load_mbuf(sc->sc_dmat, txd->txd_dmap, m, BUS_DMA_READ |
1626 	    BUS_DMA_NOWAIT);
1627 	switch (rv) {
1628 	case 0:
1629 		break;
1630 	case EFBIG:
1631 		if (m_defrag(m, M_NOWAIT) != NULL) {
1632 			txr->txr_evdefrag.ev_count++;
1633 			if (bus_dmamap_load_mbuf(sc->sc_dmat, txd->txd_dmap, m,
1634 			    BUS_DMA_READ | BUS_DMA_NOWAIT) == 0)
1635 				break;
1636 		}
1637 		/* FALLTHROUGH */
1638 	default:
1639 		DPRINTF("%s: failed to load mbuf\n", device_xname(sc->sc_dev));
1640 		txr->txr_evdmafailed.ev_count++;
1641 		return -1;
1642 	}
1643 	bus_dmamap_sync(sc->sc_dmat, txd->txd_dmap,
1644 	    0, txd->txd_dmap->dm_mapsize, BUS_DMASYNC_PREWRITE);
1645 	SET(txd->txd_flags, HVN_TXD_FLAG_DMAMAP);
1646 
1647 	/* Attach an RNDIS message to the first slot */
1648 	txd->txd_sgl[0].gpa_page = txd->txd_gpa.gpa_page;
1649 	txd->txd_sgl[0].gpa_ofs = txd->txd_gpa.gpa_ofs;
1650 	txd->txd_sgl[0].gpa_len = pktlen;
1651 	txd->txd_nsge = txd->txd_dmap->dm_nsegs + 1;
1652 
1653 	for (i = 0; i < txd->txd_dmap->dm_nsegs; i++) {
1654 		seg = &txd->txd_dmap->dm_segs[i];
1655 		txd->txd_sgl[1 + i].gpa_page = atop(seg->ds_addr);
1656 		txd->txd_sgl[1 + i].gpa_ofs = seg->ds_addr & PAGE_MASK;
1657 		txd->txd_sgl[1 + i].gpa_len = seg->ds_len;
1658 	}
1659 
1660 	txd->txd_chim_index = HVN_NVS_CHIM_IDX_INVALID;
1661 	txd->txd_chim_size = 0;
1662 	txr->txr_sendpkt = hvn_rndis_output_sgl;
1663 done:
1664 	txd->txd_buf = m;
1665 
1666 	/* Update temporary stats for later use. */
1667 	txr->txr_stat_pkts++;
1668 	txr->txr_stat_size += m->m_pkthdr.len;
1669 	if (m->m_flags & M_MCAST)
1670 		txr->txr_stat_mcasts++;
1671 
1672 	return 0;
1673 }
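/*
 * Summary (informal): packets whose RNDIS-framed size fits under
 * txr_chim_size are copied into the shared chimney buffer and sent by
 * hvn_rndis_output_chim(); larger packets are DMA-mapped and described
 * by a gather list for hvn_rndis_output_sgl().
 */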
1674 
1675 static void
1676 hvn_bpf_mtap(struct hvn_tx_ring *txr, struct mbuf *m, u_int direction)
1677 {
1678 	struct hvn_softc *sc = txr->txr_softc;
1679 	struct ifnet *ifp = SC2IFP(sc);
1680 	struct ether_header *eh;
1681 	struct ether_vlan_header evl;
1682 
1683 	if (!vlan_has_tag(m)) {
1684 		bpf_mtap(ifp, m, direction);
1685 		return;
1686 	}
1687 
1688 	if (ifp->if_bpf == NULL)
1689 		return;
1690 
1691 	txr->txr_evvlantap.ev_count++;
1692 
1693 	/*
1694 	 * Restore a VLAN tag for bpf.
1695 	 *
1696 	 * Do not modify contents of the original mbuf,
1697 	 * because Tx processing on the mbuf is still in progress.
1698 	 */
1699 
1700 	eh = mtod(m, struct ether_header *);
1701 	memcpy(evl.evl_dhost, eh->ether_dhost, ETHER_ADDR_LEN * 2);
1702 	evl.evl_encap_proto = htons(ETHERTYPE_VLAN);
1703 	evl.evl_tag = htons(vlan_get_tag(m));
1704 	evl.evl_proto = eh->ether_type;
1705 
1706 	/* Do not tap ether header of the original mbuf. */
1707 	m_adj(m, sizeof(*eh));
1708 
1709 	bpf_mtap2(ifp->if_bpf, &evl, sizeof(evl), m, direction);
1710 
1711 	/* Cannot restore the ether header of the original mbuf,
1712 	 * but do not worry about it; the mbuf is about to be freed anyway. */
1713 }
1714 
1715 static int
1716 hvn_txpkt(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
1717 {
1718 	struct hvn_softc *sc = txr->txr_softc;
1719 	struct ifnet *ifp = SC2IFP(sc);
1720 	const struct hvn_tx_desc *tmp_txd;
1721 	int error;
1722 
1723 	/*
1724 	 * Make sure that this txd and any aggregated txds are not
1725 	 * freed before bpf_mtap.
1726 	 */
1727 	hvn_txd_hold(txd);
1728 
1729 	error = (*txr->txr_sendpkt)(txr, txd);
1730 	if (error == 0) {
1731 		hvn_bpf_mtap(txr, txd->txd_buf, BPF_D_OUT);
1732 		STAILQ_FOREACH(tmp_txd, &txd->txd_agg_list, txd_agg_entry)
1733 			hvn_bpf_mtap(txr, tmp_txd->txd_buf, BPF_D_OUT);
1734 
1735 		if_statadd(ifp, if_opackets, txr->txr_stat_pkts);
1736 		if_statadd(ifp, if_obytes, txr->txr_stat_size);
1737 		if (txr->txr_stat_mcasts != 0)
1738 			if_statadd(ifp, if_omcasts, txr->txr_stat_mcasts);
1739 		txr->txr_evpkts.ev_count += txr->txr_stat_pkts;
1740 		txr->txr_evsends.ev_count++;
1741 	}
1742 
1743 	hvn_txd_put(txr, txd);
1744 
1745 	if (__predict_false(error)) {
1746 		/*
1747 		 * Caller will perform further processing on the
1748 		 * associated mbuf, so don't free it in hvn_txd_put();
1749 		 * only unload it from the DMA map in hvn_txd_put(),
1750 		 * if it was loaded.
1751 		 */
1752 		txd->txd_buf = NULL;
1753 		hvn_txd_put(txr, txd);
1754 	}
1755 
1756 	/* Reset temporary stats now that this send is done. */
1757 	txr->txr_stat_pkts = 0;
1758 	txr->txr_stat_size = 0;
1759 	txr->txr_stat_mcasts = 0;
1760 
1761 	return error;
1762 }
1763 
1764 static void
1765 hvn_txeof(struct hvn_tx_ring *txr, uint64_t tid)
1766 {
1767 	struct hvn_softc *sc = txr->txr_softc;
1768 	struct hvn_tx_desc *txd;
1769 	uint32_t id = tid >> 32;
1770 
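	/*
	 * The upper 32 bits of the transaction id hold the
	 * HVN_NVS_CHIM_SIG-biased descriptor index (see below); a
	 * completion with a non-zero lower half does not refer to one
	 * of our send descriptors, so it is ignored here.
	 */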
1771 	if ((tid & 0xffffffffU) != 0)
1772 		return;
1773 
1774 	id -= HVN_NVS_CHIM_SIG;
1775 	if (id >= HVN_TX_DESC) {
1776 		device_printf(sc->sc_dev, "tx packet index too large: %u\n", id);
1777 		return;
1778 	}
1779 
1780 	txd = &txr->txr_desc[id];
1781 
1782 	if (txd->txd_buf == NULL)
1783 		device_printf(sc->sc_dev, "no mbuf @%u\n", id);
1784 
1785 	hvn_txd_put(txr, txd);
1786 }
1787 
1788 static int
1789 hvn_rx_ring_create(struct hvn_softc *sc, int ring_cnt)
1790 {
1791 	struct hvn_rx_ring *rxr;
1792 	int i;
1793 
1794 	if (sc->sc_proto <= HVN_NVS_PROTO_VERSION_2)
1795 		sc->sc_rx_size = 15 * 1024 * 1024;	/* 15MB */
1796 	else
1797 		sc->sc_rx_size = 16 * 1024 * 1024; 	/* 16MB */
1798 	sc->sc_rx_ring = hyperv_dma_alloc(sc->sc_dmat, &sc->sc_rx_dma,
1799 	    sc->sc_rx_size, PAGE_SIZE, PAGE_SIZE, sc->sc_rx_size / PAGE_SIZE);
1800 	if (sc->sc_rx_ring == NULL) {
1801 		DPRINTF("%s: failed to allocate Rx ring buffer\n",
1802 		    device_xname(sc->sc_dev));
1803 		return -1;
1804 	}
1805 
1806 	sc->sc_rxr = kmem_zalloc(sizeof(*rxr) * ring_cnt, KM_SLEEP);
1807 	sc->sc_nrxr_inuse = sc->sc_nrxr = ring_cnt;
1808 
1809 	for (i = 0; i < sc->sc_nrxr; i++) {
1810 		rxr = &sc->sc_rxr[i];
1811 		rxr->rxr_softc = sc;
1812 		if (i < sc->sc_ntxr) {
1813 			rxr->rxr_txr = &sc->sc_txr[i];
1814 			rxr->rxr_txr->txr_rxr = rxr;
1815 		}
1816 
1817 		mutex_init(&rxr->rxr_lock, MUTEX_DEFAULT, IPL_NET);
1818 		mutex_init(&rxr->rxr_onwork_lock, MUTEX_DEFAULT, IPL_NET);
1819 		cv_init(&rxr->rxr_onwork_cv, "waitonwk");
1820 
1821 		snprintf(rxr->rxr_name, sizeof(rxr->rxr_name),
1822 		    "%s-rx%d", device_xname(sc->sc_dev), i);
1823 		evcnt_attach_dynamic(&rxr->rxr_evpkts, EVCNT_TYPE_MISC,
1824 		    NULL, rxr->rxr_name, "packets received");
1825 		evcnt_attach_dynamic(&rxr->rxr_evcsum_ip, EVCNT_TYPE_MISC,
1826 		    NULL, rxr->rxr_name, "IP checksum");
1827 		evcnt_attach_dynamic(&rxr->rxr_evcsum_tcp, EVCNT_TYPE_MISC,
1828 		    NULL, rxr->rxr_name, "TCP checksum");
1829 		evcnt_attach_dynamic(&rxr->rxr_evcsum_udp, EVCNT_TYPE_MISC,
1830 		    NULL, rxr->rxr_name, "UDP checksum");
1831 		evcnt_attach_dynamic(&rxr->rxr_evvlanhwtagging, EVCNT_TYPE_MISC,
1832 		    NULL, rxr->rxr_name, "VLAN H/W tagging");
1833 		evcnt_attach_dynamic(&rxr->rxr_evintr, EVCNT_TYPE_INTR,
1834 		    NULL, rxr->rxr_name, "interrupt on ring");
1835 		evcnt_attach_dynamic(&rxr->rxr_evdefer, EVCNT_TYPE_MISC,
1836 		    NULL, rxr->rxr_name, "handled queue in workqueue");
1837 		evcnt_attach_dynamic(&rxr->rxr_evdeferreq, EVCNT_TYPE_MISC,
1838 		    NULL, rxr->rxr_name, "requested defer on ring");
1839 		evcnt_attach_dynamic(&rxr->rxr_evredeferreq, EVCNT_TYPE_MISC,
1840 		    NULL, rxr->rxr_name, "requested defer in workqueue");
1841 
1842 		rxr->rxr_nvsbuf = kmem_zalloc(HVN_NVS_BUFSIZE, KM_SLEEP);
1843 		if (rxr->rxr_nvsbuf == NULL) {
1844 			DPRINTF("%s: failed to allocate channel data buffer\n",
1845 			    device_xname(sc->sc_dev));
1846 			goto errout;
1847 		}
1848 
1849 		rxr->rxr_si = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
1850 		    hvn_nvs_softintr, rxr);
1851 		if (rxr->rxr_si == NULL) {
1852 			DPRINTF("%s: failed to establish rx softint\n",
1853 			    device_xname(sc->sc_dev));
1854 			goto errout;
1855 		}
1856 	}
1857 
1858 	return 0;
1859 
1860  errout:
1861 	hvn_rx_ring_destroy(sc);
1862 	return -1;
1863 }
1864 
1865 static int
1866 hvn_rx_ring_destroy(struct hvn_softc *sc)
1867 {
1868 	struct hvn_rx_ring *rxr;
1869 	int i;
1870 
1871 	if (sc->sc_rxr != NULL) {
1872 		for (i = 0; i < sc->sc_nrxr; i++) {
1873 			rxr = &sc->sc_rxr[i];
1874 
1875 			if (rxr->rxr_si != NULL) {
1876 				softint_disestablish(rxr->rxr_si);
1877 				rxr->rxr_si = NULL;
1878 			}
1879 
1880 			if (rxr->rxr_nvsbuf != NULL) {
1881 				kmem_free(rxr->rxr_nvsbuf, HVN_NVS_BUFSIZE);
1882 				rxr->rxr_nvsbuf = NULL;
1883 			}
1884 
1885 			evcnt_detach(&rxr->rxr_evpkts);
1886 			evcnt_detach(&rxr->rxr_evcsum_ip);
1887 			evcnt_detach(&rxr->rxr_evcsum_tcp);
1888 			evcnt_detach(&rxr->rxr_evcsum_udp);
1889 			evcnt_detach(&rxr->rxr_evvlanhwtagging);
1890 			evcnt_detach(&rxr->rxr_evintr);
1891 			evcnt_detach(&rxr->rxr_evdefer);
1892 			evcnt_detach(&rxr->rxr_evdeferreq);
1893 			evcnt_detach(&rxr->rxr_evredeferreq);
1894 
1895 			cv_destroy(&rxr->rxr_onwork_cv);
1896 			mutex_destroy(&rxr->rxr_onwork_lock);
1897 			mutex_destroy(&rxr->rxr_lock);
1898 		}
1899 		kmem_free(sc->sc_rxr, sizeof(*rxr) * sc->sc_nrxr);
1900 		sc->sc_rxr = NULL;
1901 		sc->sc_nrxr = 0;
1902 	}
1903 	if (sc->sc_rx_ring != NULL) {
1904 		hyperv_dma_free(sc->sc_dmat, &sc->sc_rx_dma);
1905 		sc->sc_rx_ring = NULL;
1906 	}
1907 
1908 	return 0;
1909 }
1910 
1911 static void
1912 hvn_fixup_rx_data(struct hvn_softc *sc)
1913 {
1914 	struct hvn_rx_ring *rxr;
1915 	int i;
1916 
1917 	if (sc->sc_caps & HVN_CAPS_UDPHASH) {
1918 		for (i = 0; i < sc->sc_nrxr; i++) {
1919 			rxr = &sc->sc_rxr[i];
1920 			rxr->rxr_flags |= HVN_RXR_FLAG_UDP_HASH;
1921 		}
1922 	}
1923 }
1924 
1925 static int
1926 hvn_tx_ring_create(struct hvn_softc *sc, int ring_cnt)
1927 {
1928 	struct hvn_tx_ring *txr;
1929 	struct hvn_tx_desc *txd;
1930 	bus_dma_segment_t *seg;
1931 	size_t msgsize;
1932 	int i, j;
1933 	paddr_t pa;
1934 
1935 	/*
1936 	 * Create TXBUF for chimney sending.
1937 	 *
1938 	 * NOTE: It is shared by all channels.
1939 	 */
1940 	sc->sc_chim = hyperv_dma_alloc(sc->sc_dmat, &sc->sc_chim_dma,
1941 	    HVN_CHIM_SIZE, PAGE_SIZE, 0, 1);
1942 	if (sc->sc_chim == NULL) {
1943 		DPRINTF("%s: failed to allocate chimney sending memory\n",
1944 		    device_xname(sc->sc_dev));
1945 		goto errout;
1946 	}
1947 
1948 	sc->sc_txr = kmem_zalloc(sizeof(*txr) * ring_cnt, KM_SLEEP);
1949 	sc->sc_ntxr_inuse = sc->sc_ntxr = ring_cnt;
1950 
1951 	msgsize = roundup(HVN_RNDIS_PKT_LEN, 128);
1952 
1953 	for (j = 0; j < ring_cnt; j++) {
1954 		txr = &sc->sc_txr[j];
1955 		txr->txr_softc = sc;
1956 		txr->txr_id = j;
1957 
1958 		mutex_init(&txr->txr_lock, MUTEX_DEFAULT, IPL_NET);
1959 		txr->txr_interq = pcq_create(HVN_TX_DESC, KM_SLEEP);
1960 
1961 		snprintf(txr->txr_name, sizeof(txr->txr_name),
1962 		    "%s-tx%d", device_xname(sc->sc_dev), j);
1963 		evcnt_attach_dynamic(&txr->txr_evpkts, EVCNT_TYPE_MISC,
1964 		    NULL, txr->txr_name, "packets transmitted");
1965 		evcnt_attach_dynamic(&txr->txr_evsends, EVCNT_TYPE_MISC,
1966 		    NULL, txr->txr_name, "sends");
1967 		evcnt_attach_dynamic(&txr->txr_evnodesc, EVCNT_TYPE_MISC,
1968 		    NULL, txr->txr_name, "descriptor shortage");
1969 		evcnt_attach_dynamic(&txr->txr_evdmafailed, EVCNT_TYPE_MISC,
1970 		    NULL, txr->txr_name, "DMA failure");
1971 		evcnt_attach_dynamic(&txr->txr_evdefrag, EVCNT_TYPE_MISC,
1972 		    NULL, txr->txr_name, "mbuf defragged");
1973 		evcnt_attach_dynamic(&txr->txr_evpcqdrop, EVCNT_TYPE_MISC,
1974 		    NULL, txr->txr_name, "dropped in pcq");
1975 		evcnt_attach_dynamic(&txr->txr_evtransmitdefer, EVCNT_TYPE_MISC,
1976 		    NULL, txr->txr_name, "deferred transmit");
1977 		evcnt_attach_dynamic(&txr->txr_evflushfailed, EVCNT_TYPE_MISC,
1978 		    NULL, txr->txr_name, "aggregation flush failure");
1979 		evcnt_attach_dynamic(&txr->txr_evchimneytried, EVCNT_TYPE_MISC,
1980 		    NULL, txr->txr_name, "chimney send tried");
1981 		evcnt_attach_dynamic(&txr->txr_evchimney, EVCNT_TYPE_MISC,
1982 		    NULL, txr->txr_name, "chimney send");
1983 		evcnt_attach_dynamic(&txr->txr_evvlanfixup, EVCNT_TYPE_MISC,
1984 		    NULL, txr->txr_name, "VLAN fixup");
1985 		evcnt_attach_dynamic(&txr->txr_evvlanhwtagging, EVCNT_TYPE_MISC,
1986 		    NULL, txr->txr_name, "VLAN H/W tagging");
1987 		evcnt_attach_dynamic(&txr->txr_evvlantap, EVCNT_TYPE_MISC,
1988 		    NULL, txr->txr_name, "VLAN bpf_mtap fixup");
1989 
1990 		txr->txr_si = softint_establish(SOFTINT_NET | SOFTINT_MPSAFE,
1991 		    hvn_deferred_transmit, txr);
1992 		if (txr->txr_si == NULL) {
1993 			aprint_error_dev(sc->sc_dev,
1994 			    "failed to establish softint for tx ring\n");
1995 			goto errout;
1996 		}
1997 
1998 		/* Allocate memory to store RNDIS messages */
1999 		txr->txr_msgs = hyperv_dma_alloc(sc->sc_dmat, &txr->txr_dma,
2000 		    msgsize * HVN_TX_DESC, PAGE_SIZE, 0, 1);
2001 		if (txr->txr_msgs == NULL) {
2002 			DPRINTF("%s: failed to allocate memory for RNDIS "
2003 			    "messages\n", device_xname(sc->sc_dev));
2004 			goto errout;
2005 		}
2006 
2007 		TAILQ_INIT(&txr->txr_list);
2008 		for (i = 0; i < HVN_TX_DESC; i++) {
2009 			txd = &txr->txr_desc[i];
2010 			txd->txd_chim_index = HVN_NVS_CHIM_IDX_INVALID;
2011 			txd->txd_chim_size = 0;
2012 			STAILQ_INIT(&txd->txd_agg_list);
2013 			if (bus_dmamap_create(sc->sc_dmat, HVN_TX_PKT_SIZE,
2014 			    HVN_TX_FRAGS, HVN_TX_FRAG_SIZE, PAGE_SIZE,
2015 			    BUS_DMA_WAITOK, &txd->txd_dmap)) {
2016 				DPRINTF("%s: failed to create map for TX "
2017 				    "descriptors\n", device_xname(sc->sc_dev));
2018 				goto errout;
2019 			}
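			/*
			 * Point this descriptor's GPA at its slot in the
			 * shared RNDIS message block.
			 */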
2020 			seg = &txr->txr_dma.map->dm_segs[0];
2021 			pa = seg->ds_addr + (msgsize * i);
2022 			txd->txd_gpa.gpa_page = atop(pa);
2023 			txd->txd_gpa.gpa_ofs = pa & PAGE_MASK;
2024 			txd->txd_gpa.gpa_len = msgsize;
2025 			txd->txd_req = (void *)(txr->txr_msgs + (msgsize * i));
2026 			txd->txd_id = i + HVN_NVS_CHIM_SIG;
2027 			TAILQ_INSERT_TAIL(&txr->txr_list, txd, txd_entry);
2028 		}
2029 		txr->txr_avail = HVN_TX_DESC;
2030 	}
2031 
2032 	return 0;
2033 
2034  errout:
2035 	hvn_tx_ring_destroy(sc);
2036 	return -1;
2037 }
2038 
2039 static void
2040 hvn_tx_ring_destroy(struct hvn_softc *sc)
2041 {
2042 	struct hvn_tx_ring *txr;
2043 	struct hvn_tx_desc *txd;
2044 	int i, j;
2045 
2046 	if (sc->sc_txr != NULL) {
2047 		for (j = 0; j < sc->sc_ntxr; j++) {
2048 			txr = &sc->sc_txr[j];
2049 
2050 			mutex_enter(&txr->txr_lock);
2051 			for (i = 0; i < HVN_TX_DESC; i++) {
2052 				txd = &txr->txr_desc[i];
2053 				hvn_txd_gc(txr, txd);
2054 			}
2055 			mutex_exit(&txr->txr_lock);
2056 			for (i = 0; i < HVN_TX_DESC; i++) {
2057 				txd = &txr->txr_desc[i];
2058 				if (txd->txd_dmap != NULL) {
2059 					bus_dmamap_destroy(sc->sc_dmat,
2060 					    txd->txd_dmap);
2061 					txd->txd_dmap = NULL;
2062 				}
2063 			}
2064 			if (txr->txr_msgs != NULL) {
2065 				hyperv_dma_free(sc->sc_dmat, &txr->txr_dma);
2066 				txr->txr_msgs = NULL;
2067 			}
2068 			if (txr->txr_si != NULL) {
2069 				softint_disestablish(txr->txr_si);
2070 				txr->txr_si = NULL;
2071 			}
2072 			if (txr->txr_interq != NULL) {
2073 				hvn_tx_ring_qflush(sc, txr);
2074 				pcq_destroy(txr->txr_interq);
2075 				txr->txr_interq = NULL;
2076 			}
2077 
2078 			evcnt_detach(&txr->txr_evpkts);
2079 			evcnt_detach(&txr->txr_evsends);
2080 			evcnt_detach(&txr->txr_evnodesc);
2081 			evcnt_detach(&txr->txr_evdmafailed);
2082 			evcnt_detach(&txr->txr_evdefrag);
2083 			evcnt_detach(&txr->txr_evpcqdrop);
2084 			evcnt_detach(&txr->txr_evtransmitdefer);
2085 			evcnt_detach(&txr->txr_evflushfailed);
2086 			evcnt_detach(&txr->txr_evchimneytried);
2087 			evcnt_detach(&txr->txr_evchimney);
2088 			evcnt_detach(&txr->txr_evvlanfixup);
2089 			evcnt_detach(&txr->txr_evvlanhwtagging);
2090 			evcnt_detach(&txr->txr_evvlantap);
2091 
2092 			mutex_destroy(&txr->txr_lock);
2093 		}
2094 
2095 		kmem_free(sc->sc_txr, sizeof(*txr) * sc->sc_ntxr);
2096 		sc->sc_txr = NULL;
2097 	}
2098 
2099 	if (sc->sc_chim != NULL) {
2100 		hyperv_dma_free(sc->sc_dmat, &sc->sc_chim_dma);
2101 		sc->sc_chim = NULL;
2102 	}
2103 }
2104 
2105 static void
2106 hvn_set_chim_size(struct hvn_softc *sc, int chim_size)
2107 {
2108 	struct hvn_tx_ring *txr;
2109 	int i;
2110 
2111 	for (i = 0; i < sc->sc_ntxr_inuse; i++) {
2112 		txr = &sc->sc_txr[i];
2113 		txr->txr_chim_size = chim_size;
2114 	}
2115 }
2116 
2117 #if LONG_BIT == 64
2118 #define ffsl(v)	ffs64(v)
2119 #elif LONG_BIT == 32
2120 #define ffsl(v)	ffs32(v)
2121 #else
2122 #error unsupported LONG_BIT
2123 #endif  /* LONG_BIT */
2124 
2125 static uint32_t
2126 hvn_chim_alloc(struct hvn_softc *sc)
2127 {
2128 	uint32_t chim_idx = HVN_NVS_CHIM_IDX_INVALID;
2129 	int i, idx;
2130 
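	/*
	 * Scan the bitmap one word at a time: ffsl() on the complement
	 * yields the 1-based index of the first free section, or 0 if
	 * the word is fully allocated.
	 */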
2131 	mutex_spin_enter(&sc->sc_chim_bmap_lock);
2132 	for (i = 0; i < sc->sc_chim_bmap_cnt; i++) {
2133 		idx = ffsl(~sc->sc_chim_bmap[i]);
2134 		if (idx == 0)
2135 			continue;
2136 
2137 		--idx;	/* ffsl is 1-based */
2138 		SET(sc->sc_chim_bmap[i], __BIT(idx));
2139 
2140 		chim_idx = i * LONG_BIT + idx;
2141 		break;
2142 	}
2143 	mutex_spin_exit(&sc->sc_chim_bmap_lock);
2144 
2145 	return chim_idx;
2146 }
2147 
2148 static void
2149 hvn_chim_free(struct hvn_softc *sc, uint32_t chim_idx)
2150 {
2151 	u_long mask;
2152 	uint32_t idx;
2153 
2154 	idx = chim_idx / LONG_BIT;
2155 	mask = __BIT(chim_idx % LONG_BIT);
2156 
2157 	mutex_spin_enter(&sc->sc_chim_bmap_lock);
2158 	CLR(sc->sc_chim_bmap[idx], mask);
2159 	mutex_spin_exit(&sc->sc_chim_bmap_lock);
2160 }
2161 
2162 static void
2163 hvn_fixup_tx_data(struct hvn_softc *sc)
2164 {
2165 	struct hvn_tx_ring *txr;
2166 	uint64_t caps_assist;
2167 	int csum_assist;
2168 	int i;
2169 
2170 	hvn_set_chim_size(sc, sc->sc_chim_szmax);
2171 	if (hvn_tx_chimney_size > 0 && hvn_tx_chimney_size < sc->sc_chim_szmax)
2172 		hvn_set_chim_size(sc, hvn_tx_chimney_size);
2173 
2174 	caps_assist = 0;
2175 	csum_assist = 0;
2176 	if (sc->sc_caps & HVN_CAPS_IPCS) {
2177 		caps_assist |= IFCAP_CSUM_IPv4_Tx;
2178 		caps_assist |= IFCAP_CSUM_IPv4_Rx;
2179 		csum_assist |= M_CSUM_IPv4;
2180 	}
2181 	if (sc->sc_caps & HVN_CAPS_TCP4CS) {
2182 		caps_assist |= IFCAP_CSUM_TCPv4_Tx;
2183 		caps_assist |= IFCAP_CSUM_TCPv4_Rx;
2184 		csum_assist |= M_CSUM_TCPv4;
2185 	}
2186 	if (sc->sc_caps & HVN_CAPS_TCP6CS) {
2187 		caps_assist |= IFCAP_CSUM_TCPv6_Tx;
2188 		csum_assist |= M_CSUM_TCPv6;
2189 	}
2190 	if (sc->sc_caps & HVN_CAPS_UDP4CS) {
2191 		caps_assist |= IFCAP_CSUM_UDPv4_Tx;
2192 		caps_assist |= IFCAP_CSUM_UDPv4_Rx;
2193 		csum_assist |= M_CSUM_UDPv4;
2194 	}
2195 	if (sc->sc_caps & HVN_CAPS_UDP6CS) {
2196 		caps_assist |= IFCAP_CSUM_UDPv6_Tx;
2197 		csum_assist |= M_CSUM_UDPv6;
2198 	}
2199 	for (i = 0; i < sc->sc_ntxr; i++) {
2200 		txr = &sc->sc_txr[i];
2201 		txr->txr_caps_assist = caps_assist;
2202 		txr->txr_csum_assist = csum_assist;
2203 	}
2204 
2205 	if (sc->sc_caps & HVN_CAPS_UDPHASH) {
2206 		for (i = 0; i < sc->sc_ntxr; i++) {
2207 			txr = &sc->sc_txr[i];
2208 			txr->txr_flags |= HVN_TXR_FLAG_UDP_HASH;
2209 		}
2210 	}
2211 }
2212 
2213 static int
2214 hvn_txd_peek(struct hvn_tx_ring *txr)
2215 {
2216 
2217 	KASSERT(mutex_owned(&txr->txr_lock));
2218 
2219 	return txr->txr_avail;
2220 }
2221 
2222 static struct hvn_tx_desc *
2223 hvn_txd_get(struct hvn_tx_ring *txr)
2224 {
2225 	struct hvn_tx_desc *txd;
2226 
2227 	KASSERT(mutex_owned(&txr->txr_lock));
2228 
2229 	txd = TAILQ_FIRST(&txr->txr_list);
2230 	KASSERT(txd != NULL);
2231 	TAILQ_REMOVE(&txr->txr_list, txd, txd_entry);
2232 	txr->txr_avail--;
2233 
2234 	txd->txd_refs = 1;
2235 
2236 	return txd;
2237 }
2238 
2239 static void
2240 hvn_txd_put(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
2241 {
2242 	struct hvn_softc *sc = txr->txr_softc;
2243 	struct hvn_tx_desc *tmp_txd;
2244 
2245 	KASSERT(mutex_owned(&txr->txr_lock));
2246 	KASSERTMSG(!ISSET(txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2247 	    "put an onagg txd %#x", txd->txd_flags);
2248 
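	/* Drop one reference; only the final put releases the descriptor. */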
2249 	KASSERTMSG(txd->txd_refs > 0, "invalid txd refs %d", txd->txd_refs);
2250 	if (atomic_dec_uint_nv(&txd->txd_refs) != 0)
2251 		return;
2252 
2253 	if (!STAILQ_EMPTY(&txd->txd_agg_list)) {
2254 		while ((tmp_txd = STAILQ_FIRST(&txd->txd_agg_list)) != NULL) {
2255 			KASSERTMSG(STAILQ_EMPTY(&tmp_txd->txd_agg_list),
2256 			    "recursive aggregation on aggregated txdesc");
2257 			KASSERTMSG(
2258 			    ISSET(tmp_txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2259 			    "not aggregated txdesc");
2260 			KASSERTMSG(
2261 			    tmp_txd->txd_chim_index == HVN_NVS_CHIM_IDX_INVALID,
2262 			    "aggregated txdesc consumes chimney sending "
2263 			    "buffer: idx %u", tmp_txd->txd_chim_index);
2264 			KASSERTMSG(tmp_txd->txd_chim_size == 0,
2265 			    "aggregated txdesc has non-zero chimney sending "
2266 			    "size: sz %u", tmp_txd->txd_chim_size);
2267 
2268 			STAILQ_REMOVE_HEAD(&txd->txd_agg_list, txd_agg_entry);
2269 			CLR(tmp_txd->txd_flags, HVN_TXD_FLAG_ONAGG);
2270 			hvn_txd_put(txr, tmp_txd);
2271 		}
2272 	}
2273 
2274 	if (txd->txd_chim_index != HVN_NVS_CHIM_IDX_INVALID) {
2275 		KASSERTMSG(!ISSET(txd->txd_flags, HVN_TXD_FLAG_DMAMAP),
2276 		    "chim txd uses dmamap");
2277 		hvn_chim_free(sc, txd->txd_chim_index);
2278 		txd->txd_chim_index = HVN_NVS_CHIM_IDX_INVALID;
2279 		txd->txd_chim_size = 0;
2280 	} else if (ISSET(txd->txd_flags, HVN_TXD_FLAG_DMAMAP)) {
2281 		bus_dmamap_sync(sc->sc_dmat, txd->txd_dmap,
2282 		    0, txd->txd_dmap->dm_mapsize,
2283 		    BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
2284 		bus_dmamap_unload(sc->sc_dmat, txd->txd_dmap);
2285 		CLR(txd->txd_flags, HVN_TXD_FLAG_DMAMAP);
2286 	}
2287 
2288 	if (txd->txd_buf != NULL) {
2289 		m_freem(txd->txd_buf);
2290 		txd->txd_buf = NULL;
2291 	}
2292 
2293 	TAILQ_INSERT_TAIL(&txr->txr_list, txd, txd_entry);
2294 	txr->txr_avail++;
2295 	txr->txr_oactive = 0;
2296 }
2297 
2298 static void
2299 hvn_txd_gc(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
2300 {
2301 
2302 	KASSERTMSG(txd->txd_refs == 0 || txd->txd_refs == 1,
2303 	    "invalid txd refs %d", txd->txd_refs);
2304 
2305 	/* Aggregated txds will be freed by their aggregating txd. */
2306 	if (txd->txd_refs > 0 && !ISSET(txd->txd_flags, HVN_TXD_FLAG_ONAGG))
2307 		hvn_txd_put(txr, txd);
2308 }
2309 
2310 static void
2311 hvn_txd_hold(struct hvn_tx_desc *txd)
2312 {
2313 
2314 	/* A hold requires an existing reference; a 0->1 transition would resurrect a freed txd. */
2315 	KASSERTMSG(txd->txd_refs > 0, "invalid txd refs %d", txd->txd_refs);
2316 
2317 	atomic_inc_uint(&txd->txd_refs);
2318 }
2319 
2320 static void
2321 hvn_txd_agg(struct hvn_tx_desc *agg_txd, struct hvn_tx_desc *txd)
2322 {
2323 
2324 	KASSERTMSG(!ISSET(agg_txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2325 	    "recursive aggregation on aggregating txdesc");
2326 	KASSERTMSG(!ISSET(txd->txd_flags, HVN_TXD_FLAG_ONAGG),
2327 	    "already aggregated");
2328 	KASSERTMSG(STAILQ_EMPTY(&txd->txd_agg_list),
2329 	    "recursive aggregation on to-be-aggregated txdesc");
2330 
2331 	SET(txd->txd_flags, HVN_TXD_FLAG_ONAGG);
2332 	STAILQ_INSERT_TAIL(&agg_txd->txd_agg_list, txd, txd_agg_entry);
2333 }
2334 
2335 static int
2336 hvn_tx_ring_pending(struct hvn_tx_ring *txr)
2337 {
2338 	int pending = 0;
2339 
2340 	mutex_enter(&txr->txr_lock);
2341 	if (hvn_txd_peek(txr) != HVN_TX_DESC)
2342 		pending = 1;
2343 	mutex_exit(&txr->txr_lock);
2344 
2345 	return pending;
2346 }
2347 
2348 static void
2349 hvn_tx_ring_qflush(struct hvn_softc *sc, struct hvn_tx_ring *txr)
2350 {
2351 	struct mbuf *m;
2352 
2353 	while ((m = pcq_get(txr->txr_interq)) != NULL)
2354 		m_freem(m);
2355 }
2356 
2357 static int
2358 hvn_get_lladdr(struct hvn_softc *sc, uint8_t *enaddr)
2359 {
2360 	size_t addrlen = ETHER_ADDR_LEN;
2361 	int rv;
2362 
2363 	rv = hvn_rndis_query(sc, OID_802_3_PERMANENT_ADDRESS, enaddr, &addrlen);
2364 	if (rv == 0 && addrlen != ETHER_ADDR_LEN)
2365 		rv = -1;
2366 	return rv;
2367 }
2368 
2369 static void
2370 hvn_update_link_status(struct hvn_softc *sc)
2371 {
2372 	struct ifnet *ifp = SC2IFP(sc);
2373 	uint32_t state, old_link_state;
2374 	size_t len = sizeof(state);
2375 	int rv;
2376 
2377 	rv = hvn_rndis_query(sc, OID_GEN_MEDIA_CONNECT_STATUS, &state, &len);
2378 	if (rv != 0 || len != sizeof(state))
2379 		return;
2380 
2381 	old_link_state = sc->sc_link_state;
2382 	sc->sc_link_state = (state == NDIS_MEDIA_STATE_CONNECTED) ?
2383 	    LINK_STATE_UP : LINK_STATE_DOWN;
2384 	if (old_link_state != sc->sc_link_state) {
2385 		if_link_state_change(ifp, sc->sc_link_state);
2386 	}
2387 }
2388 
2389 static int
2390 hvn_get_mtu(struct hvn_softc *sc, uint32_t *mtu)
2391 {
2392 	size_t mtusz = sizeof(*mtu);
2393 	int rv;
2394 
2395 	rv = hvn_rndis_query(sc, OID_GEN_MAXIMUM_FRAME_SIZE, mtu, &mtusz);
2396 	if (rv == 0 && mtusz != sizeof(*mtu))
2397 		rv = -1;
2398 	return rv;
2399 }
2400 
2401 static int
2402 hvn_channel_attach(struct hvn_softc *sc, struct vmbus_channel *chan)
2403 {
2404 	struct hvn_rx_ring *rxr;
2405 	struct hvn_tx_ring *txr;
2406 	int idx;
2407 
2408 	idx = chan->ch_subidx;
2409 	if (idx < 0 || idx >= sc->sc_nrxr_inuse) {
2410 		DPRINTF("%s: invalid sub-channel %u\n",
2411 		    device_xname(sc->sc_dev), idx);
2412 		return -1;
2413 	}
2414 
2415 	rxr = &sc->sc_rxr[idx];
2416 	rxr->rxr_chan = chan;
2417 
2418 	if (idx < sc->sc_ntxr_inuse) {
2419 		txr = &sc->sc_txr[idx];
2420 		txr->txr_chan = chan;
2421 	}
2422 
2423 	/* Bind this channel to a proper CPU. */
2424 	vmbus_channel_cpu_set(chan, HVN_RING_IDX2CPU(sc, idx));
2425 
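	/* Disable batched reading on this channel. */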
2426 	chan->ch_flags &= ~CHF_BATCHED;
2427 
2428 	/* Associate our interrupt handler with the channel */
2429 	if (vmbus_channel_open(chan,
2430 	    HVN_RING_BUFSIZE - sizeof(struct vmbus_bufring), NULL, 0,
2431 	    hvn_nvs_intr, rxr)) {
2432 		DPRINTF("%s: failed to open channel\n",
2433 		    device_xname(sc->sc_dev));
2434 		return -1;
2435 	}
2436 
2437 	return 0;
2438 }
2439 
2440 static void
2441 hvn_channel_detach(struct hvn_softc *sc, struct vmbus_channel *chan)
2442 {
2443 
2444 	vmbus_channel_close_direct(chan);
2445 }
2446 
2447 static void
2448 hvn_channel_detach_all(struct hvn_softc *sc)
2449 {
2450 	struct vmbus_channel **subchans;
2451 	int i, subchan_cnt = sc->sc_nrxr_inuse - 1;
2452 
2453 	if (subchan_cnt > 0) {
2454 		/* Detach the sub-channels. */
2455 		subchans = vmbus_subchannel_get(sc->sc_prichan, subchan_cnt);
2456 		for (i = 0; i < subchan_cnt; i++)
2457 			hvn_channel_detach(sc, subchans[i]);
2458 		vmbus_subchannel_rel(subchans, subchan_cnt);
2459 	}
2460 
2461 	/*
2462 	 * Detach the primary channel, _after_ all sub-channels
2463 	 * are detached.
2464 	 */
2465 	hvn_channel_detach(sc, sc->sc_prichan);
2466 
2467 	/* Wait for sub-channels to be destroyed, if any. */
2468 	vmbus_subchannel_drain(sc->sc_prichan);
2469 }
2470 
2471 static int
2472 hvn_subchannel_attach(struct hvn_softc *sc)
2473 {
2474 	struct vmbus_channel **subchans;
2475 	int subchan_cnt = sc->sc_nrxr_inuse - 1;
2476 	int i, error = 0;
2477 
2478 	KASSERTMSG(subchan_cnt > 0, "no sub-channels");
2479 
2480 	/* Attach the sub-channels. */
2481 	subchans = vmbus_subchannel_get(sc->sc_prichan, subchan_cnt);
2482 	for (i = 0; i < subchan_cnt; ++i) {
2483 		int error1;
2484 
2485 		error1 = hvn_channel_attach(sc, subchans[i]);
2486 		if (error1) {
2487 			error = error1;
2488 			/* Move on; all channels will be detached later. */
2489 		}
2490 	}
2491 	vmbus_subchannel_rel(subchans, subchan_cnt);
2492 
2493 	if (error) {
2494 		aprint_error_dev(sc->sc_dev,
2495 		    "sub-channels attach failed: %d\n", error);
2496 		return error;
2497 	}
2498 
2499 	aprint_debug_dev(sc->sc_dev, "%d sub-channels attached\n",
2500 	    subchan_cnt);
2501 	return 0;
2502 }
2503 
2504 static int
2505 hvn_synth_alloc_subchannels(struct hvn_softc *sc, int *nsubch)
2506 {
2507 	struct vmbus_channel **subchans;
2508 	int error, nchan, rxr_cnt;
2509 
2510 	nchan = *nsubch + 1;
2511 	if (nchan < 2) {
2512 		/* Multiple RX/TX rings are not requested. */
2513 		*nsubch = 0;
2514 		return 0;
2515 	}
2516 
2517 	/*
2518 	 * Query RSS capabilities, e.g. # of RX rings, and # of indirect
2519 	 * table entries.
2520 	 */
2521 	if (hvn_get_rsscaps(sc, &rxr_cnt)) {
2522 		/* No RSS. */
2523 		*nsubch = 0;
2524 		return 0;
2525 	}
2526 
2527 	aprint_debug_dev(sc->sc_dev, "RX rings offered %u, requested %d\n",
2528 	    rxr_cnt, nchan);
2529 
2530 	if (nchan > rxr_cnt)
2531 		nchan = rxr_cnt;
2532 	if (nchan == 1) {
2533 		aprint_debug_dev(sc->sc_dev,
2534 		    "only 1 channel is supported, no vRSS\n");
2535 		*nsubch = 0;
2536 		return 0;
2537 	}
2538 
2539 	*nsubch = nchan - 1;
2540 	error = hvn_nvs_alloc_subchannels(sc, nsubch);
2541 	if (error || *nsubch == 0) {
2542 		/* Failed to allocate sub-channels. */
2543 		*nsubch = 0;
2544 		return 0;
2545 	}
2546 
2547 	/*
2548 	 * Wait for all sub-channels to become ready before moving on.
2549 	 */
2550 	subchans = vmbus_subchannel_get(sc->sc_prichan, *nsubch);
2551 	vmbus_subchannel_rel(subchans, *nsubch);
2552 	return 0;
2553 }
2554 
2555 static int
2556 hvn_synth_attachable(const struct hvn_softc *sc)
2557 {
2558 #if 0
2559 	const struct hvn_rx_ring *rxr;
2560 	int i;
2561 
2562 	for (i = 0; i < sc->sc_nrxr; i++) {
2563 		rxr = &sc->sc_rxr[i];
2564 		if (rxr->rxr_flags)
2565 			return 0;
2566 	}
2567 #endif
2568 	return 1;
2569 }
2570 
2571 /*
2572  * Make sure that the RX filter is zero after the successful
2573  * RNDIS initialization.
2574  *
2575  * NOTE:
2576  * Under certain conditions on certain versions of Hyper-V,
2577  * the RNDIS rxfilter is _not_ zero on the hypervisor side
2578  * after the successful RNDIS initialization, which breaks
2579  * the assumption of any following code (well, it breaks the
2580  * RNDIS API contract actually).  Clear the RNDIS rxfilter
2581  * explicitly, drain packets sneaking through, and drain the
2582  * interrupt taskqueues scheduled due to the stealth packets.
2583  */
2584 static void
2585 hvn_init_fixat(struct hvn_softc *sc, int nchan)
2586 {
2587 
2588 	hvn_disable_rx(sc);
2589 	hvn_drain_rxtx(sc, nchan);
2590 }
2591 
2592 static void
2593 hvn_set_txagg(struct hvn_softc *sc)
2594 {
2595 	struct hvn_tx_ring *txr;
2596 	uint32_t size, pkts;
2597 	int i;
2598 
2599 	/*
2600 	 * Setup aggregation size.
2601 	 */
2602 	if (sc->sc_agg_size < 0)
2603 		size = UINT32_MAX;
2604 	else
2605 		size = sc->sc_agg_size;
2606 
2607 	if (size > sc->sc_rndis_agg_size)
2608 		size = sc->sc_rndis_agg_size;
2609 
2610 	/* NOTE: We only aggregate packets using chimney sending buffers. */
2611 	if (size > (uint32_t)sc->sc_chim_szmax)
2612 		size = sc->sc_chim_szmax;
2613 
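	/* Aggregation only pays off if at least two minimum-size packets fit. */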
2614 	if (size <= 2 * HVN_PKTSIZE_MIN(sc->sc_rndis_agg_align)) {
2615 		/* Disable */
2616 		size = 0;
2617 		pkts = 0;
2618 		goto done;
2619 	}
2620 
2621 	/* NOTE: Type of the per TX ring setting is 'int'. */
2622 	if (size > INT_MAX)
2623 		size = INT_MAX;
2624 
2625 	/*
2626 	 * Setup aggregation packet count.
2627 	 */
2628 	if (sc->sc_agg_pkts < 0)
2629 		pkts = UINT32_MAX;
2630 	else
2631 		pkts = sc->sc_agg_pkts;
2632 
2633 	if (pkts > sc->sc_rndis_agg_pkts)
2634 		pkts = sc->sc_rndis_agg_pkts;
2635 
2636 	if (pkts <= 1) {
2637 		/* Disable */
2638 		size = 0;
2639 		pkts = 0;
2640 		goto done;
2641 	}
2642 
2643 	/* NOTE: Type of the per TX ring setting is 'short'. */
2644 	if (pkts > SHRT_MAX)
2645 		pkts = SHRT_MAX;
2646 
2647 done:
2648 	/* NOTE: Type of the per TX ring setting is 'short'. */
2649 	if (sc->sc_rndis_agg_align > SHRT_MAX) {
2650 		/* Disable */
2651 		size = 0;
2652 		pkts = 0;
2653 	}
2654 
2655 	aprint_verbose_dev(sc->sc_dev,
2656 	    "TX aggregate size %u, pkts %u, align %u\n",
2657 	    size, pkts, sc->sc_rndis_agg_align);
2658 
2659 	for (i = 0; i < sc->sc_ntxr_inuse; ++i) {
2660 		txr = &sc->sc_txr[i];
2661 
2662 		mutex_enter(&txr->txr_lock);
2663 		txr->txr_agg_szmax = size;
2664 		txr->txr_agg_pktmax = pkts;
2665 		txr->txr_agg_align = sc->sc_rndis_agg_align;
2666 		mutex_exit(&txr->txr_lock);
2667 	}
2668 }
2669 
2670 static int
2671 hvn_synth_attach(struct hvn_softc *sc, int mtu)
2672 {
2673 	uint8_t rss_key[RSS_KEYSIZE];
2674 	uint32_t old_caps;
2675 	int nchan = 1, nsubch;
2676 	int i, error;
2677 
2678 	if (!hvn_synth_attachable(sc))
2679 		return ENXIO;
2680 
2681 	/* Save capabilities for later verification. */
2682 	old_caps = sc->sc_caps;
2683 	sc->sc_caps = 0;
2684 
2685 	/* Clear RSS stuffs. */
2686 	sc->sc_rss_ind_size = 0;
2687 	sc->sc_rss_hash = 0;
2688 	sc->sc_rss_hcap = 0;
2689 
2690 	/*
2691 	 * Attach the primary channel _before_ attaching NVS and RNDIS.
2692 	 */
2693 	error = hvn_channel_attach(sc, sc->sc_prichan);
2694 	if (error) {
2695 		aprint_error_dev(sc->sc_dev,
2696 		    "failed to attach primary channel\n");
2697 		goto failed;
2698 	}
2699 
2700 	/*
2701 	 * Attach NVS.
2702 	 */
2703 	error = hvn_nvs_attach(sc, mtu);
2704 	if (error) {
2705 		aprint_error_dev(sc->sc_dev, "failed to init NVSP\n");
2706 		goto detach_channel;
2707 	}
2708 
2709 	/*
2710 	 * Attach RNDIS _after_ NVS is attached.
2711 	 */
2712 	error = hvn_rndis_attach(sc, mtu);
2713 	if (error) {
2714 		aprint_error_dev(sc->sc_dev, "failed to init RNDIS\n");
2715 		goto detach_nvs;
2716 	}
2717 
2718 	error = hvn_set_capabilities(sc, mtu);
2719 	if (error) {
2720 		aprint_error_dev(sc->sc_dev, "failed to setup offloading\n");
2721 		goto detach_rndis;
2722 	}
2723 
2724 	if ((sc->sc_flags & HVN_SCF_ATTACHED) && old_caps != sc->sc_caps) {
2725 		device_printf(sc->sc_dev, "caps mismatch "
2726 		    "old 0x%08x, new 0x%08x\n", old_caps, sc->sc_caps);
2727 		error = ENXIO;
2728 		goto detach_rndis;
2729 	}
2730 
2731 	/*
2732 	 * Allocate sub-channels for multi-TX/RX rings.
2733 	 *
2734 	 * NOTE:
2735 	 * The # of RX rings that can be used is equivalent to the # of
2736 	 * channels to be requested.
2737 	 */
2738 	nsubch = sc->sc_nrxr - 1;
2739 	error = hvn_synth_alloc_subchannels(sc, &nsubch);
2740 	if (error) {
2741 		aprint_error_dev(sc->sc_dev,
2742 		    "failed to allocate sub channels\n");
2743 		goto detach_synth;
2744 	}
2745 
2746 	/*
2747 	 * Set the # of TX/RX rings that could be used according to
2748 	 * the # of channels that NVS offered.
2749 	 */
2750 	nchan = nsubch + 1;
2751 	hvn_set_ring_inuse(sc, nchan);
2752 
2753 	if (nchan > 1) {
2754 		/*
2755 		 * Attach the sub-channels.
2756 		 *
2757 		 * NOTE: hvn_set_ring_inuse() _must_ have been called.
2758 		 */
2759 		error = hvn_subchannel_attach(sc);
2760 		if (error) {
2761 			aprint_error_dev(sc->sc_dev,
2762 			    "failed to attach sub channels\n");
2763 			goto detach_synth;
2764 		}
2765 
2766 		/*
2767 		 * Configure RSS key and indirect table _after_ all sub-channels
2768 		 * are attached.
2769 		 */
2770 		if (!(sc->sc_flags & HVN_SCF_HAS_RSSKEY)) {
2771 			/* Set the default RSS key. */
2772 			CTASSERT(sizeof(sc->sc_rss.rss_key) == sizeof(rss_key));
2773 			rss_getkey(rss_key);
2774 			memcpy(&sc->sc_rss.rss_key, rss_key,
2775 			    sizeof(sc->sc_rss.rss_key));
2776 			sc->sc_flags |= HVN_SCF_HAS_RSSKEY;
2777 		}
2778 
2779 		if (!(sc->sc_flags & HVN_SCF_HAS_RSSIND)) {
2780 			/* Setup RSS indirect table in round-robin fashion. */
2781 			for (i = 0; i < NDIS_HASH_INDCNT; i++) {
2782 				sc->sc_rss.rss_ind[i] = i % nchan;
2783 			}
2784 			sc->sc_flags |= HVN_SCF_HAS_RSSIND;
2785 		} else {
2786 			/*
2787 			 * # of usable channels may have changed, so we have to
2788 			 * make sure that all entries in the RSS indirect table
2789 			 * are valid.
2790 			 *
2791 			 * NOTE: hvn_set_ring_inuse() _must_ have been called.
2792 			 */
2793 			hvn_fixup_rss_ind(sc);
2794 		}
2795 
2796 		sc->sc_rss_hash = sc->sc_rss_hcap;
2797 		error = hvn_set_rss(sc, NDIS_RSS_FLAG_NONE);
2798 		if (error) {
2799 			aprint_error_dev(sc->sc_dev, "failed to setup RSS\n");
2800 			goto detach_synth;
2801 		}
2802 	}
2803 
2804 	/*
2805 	 * Fixup transmission aggregation setup.
2806 	 */
2807 	hvn_set_txagg(sc);
2808 	hvn_init_fixat(sc, nchan);
2809 	return 0;
2810 
2811 detach_synth:
2812 	hvn_init_fixat(sc, nchan);
2813 	hvn_synth_detach(sc);
2814 	return error;
2815 
2816 detach_rndis:
2817 	hvn_init_fixat(sc, nchan);
2818 	hvn_rndis_detach(sc);
2819 detach_nvs:
2820 	hvn_nvs_detach(sc);
2821 detach_channel:
2822 	hvn_channel_detach(sc, sc->sc_prichan);
2823 failed:
2824 	/* Restore old capabilities. */
2825 	sc->sc_caps = old_caps;
2826 	return error;
2827 }
2828 
2829 static void
2830 hvn_synth_detach(struct hvn_softc *sc)
2831 {
2832 
2833 	/* Detach the RNDIS first. */
2834 	hvn_rndis_detach(sc);
2835 
2836 	/* Detach NVS. */
2837 	hvn_nvs_detach(sc);
2838 
2839 	/* Detach all of the channels. */
2840 	hvn_channel_detach_all(sc);
2841 
2842 	if (sc->sc_prichan->ch_sc->sc_proto >= VMBUS_VERSION_WIN10 &&
2843 	    sc->sc_rx_hndl) {
2844 		/*
2845 		 * Host is post-Win2016, disconnect RXBUF from primary channel
2846 		 * here.
2847 		 */
2848 		vmbus_handle_free(sc->sc_prichan, sc->sc_rx_hndl);
2849 		sc->sc_rx_hndl = 0;
2850 	}
2851 
2852 	if (sc->sc_prichan->ch_sc->sc_proto >= VMBUS_VERSION_WIN10 &&
2853 	    sc->sc_chim_hndl) {
2854 		/*
2855 		 * Host is post-Win2016, disconnect chimney sending buffer
2856 		 * from primary channel here.
2857 		 */
2858 		vmbus_handle_free(sc->sc_prichan, sc->sc_chim_hndl);
2859 		sc->sc_chim_hndl = 0;
2860 	}
2861 }
2862 
2863 static void
2864 hvn_set_ring_inuse(struct hvn_softc *sc, int ring_cnt)
2865 {
2866 
2867 	if (sc->sc_ntxr > ring_cnt)
2868 		sc->sc_ntxr_inuse = ring_cnt;
2869 	else
2870 		sc->sc_ntxr_inuse = sc->sc_ntxr;
2871 	sc->sc_nrxr_inuse = ring_cnt;
2872 }
2873 
2874 static void
2875 hvn_channel_drain(struct hvn_softc *sc, struct vmbus_channel *chan)
2876 {
2877 	struct hvn_rx_ring *rxr;
2878 	int i, s;
2879 
2880 	for (rxr = NULL, i = 0; i < sc->sc_nrxr_inuse; i++) {
2881 		rxr = &sc->sc_rxr[i];
2882 		if (rxr->rxr_chan == chan)
2883 			break;
2884 	}
2885 	KASSERT(i < sc->sc_nrxr_inuse);
2886 
2887 	/*
2888 	 * NOTE:
2889 	 * The TX bufring will not be drained by the hypervisor
2890 	 * if the primary channel is revoked.
2891 	 */
2892 	while (!vmbus_channel_rx_empty(chan) ||
2893 	    (!vmbus_channel_is_revoked(sc->sc_prichan) &&
2894 	     !vmbus_channel_tx_empty(chan))) {
2895 		DELAY(20);
2896 		s = splnet();
2897 		hvn_nvs_intr1(rxr, sc->sc_tx_process_limit,
2898 		    sc->sc_rx_process_limit);
2899 		splx(s);
2900 	}
2901 
2902 	mutex_enter(&rxr->rxr_onwork_lock);
2903 	while (rxr->rxr_onlist || rxr->rxr_onproc)
2904 		cv_wait(&rxr->rxr_onwork_cv, &rxr->rxr_onwork_lock);
2905 	mutex_exit(&rxr->rxr_onwork_lock);
2906 }
2907 
2908 static void
2909 hvn_disable_rx(struct hvn_softc *sc)
2910 {
2911 
2912 	/*
2913 	 * Disable RX by clearing RX filter forcefully.
2914 	 */
2915 	(void)hvn_rndis_close(sc);	/* ignore error */
2916 
2917 	/*
2918 	 * Give RNDIS enough time to flush all pending data packets.
2919 	 */
2920 	DELAY(200);
2921 }
2922 
2923 static void
2924 hvn_drain_rxtx(struct hvn_softc *sc, int nchan)
2925 {
2926 	struct vmbus_channel **subchans = NULL;
2927 	int i, nsubch;
2928 
2929 	/*
2930 	 * Drain RX/TX bufrings and interrupts.
2931 	 */
2932 	nsubch = nchan - 1;
2933 	if (nsubch > 0)
2934 		subchans = vmbus_subchannel_get(sc->sc_prichan, nsubch);
2935 
2936 	if (subchans != NULL) {
2937 		for (i = 0; i < nsubch; ++i)
2938 			hvn_channel_drain(sc, subchans[i]);
2939 	}
2940 	hvn_channel_drain(sc, sc->sc_prichan);
2941 
2942 	if (subchans != NULL)
2943 		vmbus_subchannel_rel(subchans, nsubch);
2944 }
2945 
2946 static void
2947 hvn_suspend_data(struct hvn_softc *sc)
2948 {
2949 	struct hvn_tx_ring *txr;
2950 	int i, s;
2951 
2952 	/*
2953 	 * Suspend TX.
2954 	 */
2955 	for (i = 0; i < sc->sc_ntxr_inuse; i++) {
2956 		txr = &sc->sc_txr[i];
2957 
2958 		mutex_enter(&txr->txr_lock);
2959 		txr->txr_suspended = 1;
2960 		mutex_exit(&txr->txr_lock);
2961 		/* No one is able to send more packets now. */
2962 
2963 		/*
2964 		 * Wait for all pending sends to finish.
2965 		 *
2966 		 * NOTE:
2967 		 * We will _not_ receive all pending send-done, if the
2968 		 * primary channel is revoked.
2969 		 */
2970 		while (hvn_tx_ring_pending(txr) &&
2971 		    !vmbus_channel_is_revoked(sc->sc_prichan)) {
2972 			DELAY(20);
2973 			s = splnet();
2974 			hvn_nvs_intr1(txr->txr_rxr, sc->sc_tx_process_limit,
2975 			    sc->sc_rx_process_limit);
2976 			splx(s);
2977 		}
2978 	}
2979 
2980 	/*
2981 	 * Disable RX.
2982 	 */
2983 	hvn_disable_rx(sc);
2984 
2985 	/*
2986 	 * Drain RX/TX.
2987 	 */
2988 	hvn_drain_rxtx(sc, sc->sc_nrxr_inuse);
2989 }
2990 
2991 static void
2992 hvn_suspend_mgmt(struct hvn_softc *sc)
2993 {
2994 
2995 	sc->sc_link_suspend = true;
2996 	callout_halt(&sc->sc_link_tmout, NULL);
2997 
2998 	/* Drain link state task */
2999 	mutex_enter(&sc->sc_link_lock);
3000 	for (;;) {
3001 		if (!sc->sc_link_onproc)
3002 			break;
3003 		mutex_exit(&sc->sc_link_lock);
3004 		DELAY(20);
3005 		mutex_enter(&sc->sc_link_lock);
3006 	}
3007 	mutex_exit(&sc->sc_link_lock);
3008 }
3009 
3010 static void
3011 hvn_suspend(struct hvn_softc *sc)
3012 {
3013 	struct ifnet *ifp = SC2IFP(sc);
3014 
3015 	if (ifp->if_flags & IFF_RUNNING)
3016 		hvn_suspend_data(sc);
3017 	hvn_suspend_mgmt(sc);
3018 }
3019 
3020 static void
3021 hvn_resume_tx(struct hvn_softc *sc, int ring_cnt)
3022 {
3023 	struct hvn_tx_ring *txr;
3024 	int i;
3025 
3026 	for (i = 0; i < ring_cnt; i++) {
3027 		txr = &sc->sc_txr[i];
3028 		mutex_enter(&txr->txr_lock);
3029 		txr->txr_suspended = 0;
3030 		mutex_exit(&txr->txr_lock);
3031 	}
3032 }
3033 
3034 static void
3035 hvn_resume_data(struct hvn_softc *sc)
3036 {
3037 	struct ifnet *ifp = SC2IFP(sc);
3038 	struct hvn_tx_ring *txr;
3039 	int i;
3040 
3041 	/*
3042 	 * Re-enable RX.
3043 	 */
3044 	hvn_rndis_open(sc);
3045 
3046 	/*
3047 	 * Make sure to clear suspend status on "all" TX rings,
3048 	 * since sc_ntxr_inuse can be changed after hvn_suspend_data().
3049 	 */
3050 	hvn_resume_tx(sc, sc->sc_ntxr);
3051 
3052 	/*
3053 	 * Flush unused mbufs, since sc_ntxr_inuse may have been reduced.
3054 	 */
3055 	for (i = sc->sc_ntxr_inuse; i < sc->sc_ntxr; i++)
3056 		hvn_tx_ring_qflush(sc, &sc->sc_txr[i]);
3057 
3058 	/*
3059 	 * Kick start TX.
3060 	 */
3061 	for (i = 0; i < sc->sc_ntxr_inuse; i++) {
3062 		txr = &sc->sc_txr[i];
3063 		mutex_enter(&txr->txr_lock);
3064 		txr->txr_oactive = 0;
3065 
3066 		/* ALTQ */
3067 		if (txr->txr_id == 0)
3068 			if_schedule_deferred_start(ifp);
3069 		softint_schedule(txr->txr_si);
3070 		mutex_exit(&txr->txr_lock);
3071 	}
3072 }
3073 
3074 static void
3075 hvn_resume_mgmt(struct hvn_softc *sc)
3076 {
3077 
3078 	sc->sc_link_suspend = false;
3079 	hvn_link_event(sc, HVN_LINK_EV_RESUME_NETWORK);
3080 }
3081 
3082 static void
3083 hvn_resume(struct hvn_softc *sc)
3084 {
3085 	struct ifnet *ifp = SC2IFP(sc);
3086 
3087 	if (ifp->if_flags & IFF_RUNNING)
3088 		hvn_resume_data(sc);
3089 	hvn_resume_mgmt(sc);
3090 }
3091 
3092 static int
3093 hvn_nvs_init(struct hvn_softc *sc)
3094 {
3095 
3096 	mutex_init(&sc->sc_nvsrsp_lock, MUTEX_DEFAULT, IPL_NET);
3097 	cv_init(&sc->sc_nvsrsp_cv, "nvsrspcv");
3098 
3099 	return 0;
3100 }
3101 
3102 static void
3103 hvn_nvs_destroy(struct hvn_softc *sc)
3104 {
3105 
3106 	mutex_destroy(&sc->sc_nvsrsp_lock);
3107 	cv_destroy(&sc->sc_nvsrsp_cv);
3108 }
3109 
3110 static int
3111 hvn_nvs_doinit(struct hvn_softc *sc, uint32_t proto)
3112 {
3113 	struct hvn_nvs_init cmd;
3114 	struct hvn_nvs_init_resp *rsp;
3115 	uint64_t tid;
3116 	int error;
3117 
3118 	memset(&cmd, 0, sizeof(cmd));
3119 	cmd.nvs_type = HVN_NVS_TYPE_INIT;
3120 	cmd.nvs_ver_min = cmd.nvs_ver_max = proto;
3121 
3122 	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3123 	mutex_enter(&sc->sc_nvsrsp_lock);
3124 	error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0);
3125 	if (error == 0) {
3126 		rsp = (struct hvn_nvs_init_resp *)&sc->sc_nvsrsp;
3127 		if (rsp->nvs_status != HVN_NVS_STATUS_OK)
3128 			error = EINVAL;
3129 	}
3130 	mutex_exit(&sc->sc_nvsrsp_lock);
3131 
3132 	return error;
3133 }
3134 
3135 static int
3136 hvn_nvs_conf_ndis(struct hvn_softc *sc, int mtu)
3137 {
3138 	struct hvn_nvs_ndis_conf cmd;
3139 	uint64_t tid;
3140 	int error;
3141 
3142 	memset(&cmd, 0, sizeof(cmd));
3143 	cmd.nvs_type = HVN_NVS_TYPE_NDIS_CONF;
3144 	cmd.nvs_mtu = mtu + ETHER_HDR_LEN;
3145 	cmd.nvs_caps = HVN_NVS_NDIS_CONF_VLAN;
3146 
3147 	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3148 	mutex_enter(&sc->sc_nvsrsp_lock);
3149 	/* NOTE: No response. */
3150 	error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0);
3151 	mutex_exit(&sc->sc_nvsrsp_lock);
3152 
3153 	if (error == 0)
3154 		sc->sc_caps |= HVN_CAPS_MTU | HVN_CAPS_VLAN;
3155 	return error;
3156 }
3157 
3158 static int
3159 hvn_nvs_init_ndis(struct hvn_softc *sc)
3160 {
3161 	struct hvn_nvs_ndis_init cmd;
3162 	uint64_t tid;
3163 	int error;
3164 
3165 	memset(&cmd, 0, sizeof(cmd));
3166 	cmd.nvs_type = HVN_NVS_TYPE_NDIS_INIT;
3167 	cmd.nvs_ndis_major = (sc->sc_ndisver & 0xffff0000) >> 16;
3168 	cmd.nvs_ndis_minor = sc->sc_ndisver & 0x0000ffff;
3169 
3170 	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3171 	mutex_enter(&sc->sc_nvsrsp_lock);
3172 	/* NOTE: No response. */
3173 	error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0);
3174 	mutex_exit(&sc->sc_nvsrsp_lock);
3175 
3176 	return error;
3177 }
3178 
3179 static int
3180 hvn_nvs_attach(struct hvn_softc *sc, int mtu)
3181 {
3182 	static const uint32_t protos[] = {
3183 		HVN_NVS_PROTO_VERSION_5,
3184 		HVN_NVS_PROTO_VERSION_4,
3185 		HVN_NVS_PROTO_VERSION_2,
3186 		HVN_NVS_PROTO_VERSION_1
3187 	};
3188 	int i;
3189 
3190 	if (hyperv_ver_major >= 10)
3191 		sc->sc_caps |= HVN_CAPS_UDPHASH;
3192 
3193 	/*
3194 	 * Initialize NVS.
3195 	 */
3196 	if (sc->sc_flags & HVN_SCF_ATTACHED) {
3197 		/*
3198 		 * NVS version and NDIS version MUST NOT be changed.
3199 		 */
3200 		DPRINTF("%s: reinit NVS version %#x, NDIS version %u.%u\n",
3201 		    device_xname(sc->sc_dev), sc->sc_proto,
3202 		    (sc->sc_ndisver >> 16), sc->sc_ndisver & 0xffff);
3203 
3204 		if (hvn_nvs_doinit(sc, sc->sc_proto)) {
3205 			DPRINTF("%s: failed to reinit NVSP version %#x\n",
3206 			    device_xname(sc->sc_dev), sc->sc_proto);
3207 			return -1;
3208 		}
3209 	} else {
3210 		/*
3211 		 * Find the supported NVS version and set NDIS version
3212 		 * accordingly.
3213 		 */
3214 		for (i = 0; i < __arraycount(protos); i++) {
3215 			if (hvn_nvs_doinit(sc, protos[i]) == 0)
3216 				break;
3217 		}
3218 		if (i == __arraycount(protos)) {
3219 			DPRINTF("%s: failed to negotiate NVSP version\n",
3220 			    device_xname(sc->sc_dev));
3221 			return -1;
3222 		}
3223 
3224 		sc->sc_proto = protos[i];
3225 		if (sc->sc_proto <= HVN_NVS_PROTO_VERSION_4)
3226 			sc->sc_ndisver = NDIS_VERSION_6_1;
3227 		else
3228 			sc->sc_ndisver = NDIS_VERSION_6_30;
3229 
3230 		DPRINTF("%s: NVS version %#x, NDIS version %u.%u\n",
3231 		    device_xname(sc->sc_dev), sc->sc_proto,
3232 		    (sc->sc_ndisver >> 16), sc->sc_ndisver & 0xffff);
3233 	}
3234 
3235 	if (sc->sc_proto >= HVN_NVS_PROTO_VERSION_5)
3236 		sc->sc_caps |= HVN_CAPS_HASHVAL;
3237 
3238 	if (sc->sc_proto >= HVN_NVS_PROTO_VERSION_2) {
3239 		/*
3240 		 * Configure NDIS before initializing it.
3241 		 */
3242 		if (hvn_nvs_conf_ndis(sc, mtu))
3243 			return -1;
3244 	}
3245 
3246 	/*
3247 	 * Initialize NDIS.
3248 	 */
3249 	if (hvn_nvs_init_ndis(sc))
3250 		return -1;
3251 
3252 	/*
3253 	 * Connect RXBUF.
3254 	 */
3255 	if (hvn_nvs_connect_rxbuf(sc))
3256 		return -1;
3257 
3258 	/*
3259 	 * Connect chimney sending buffer.
3260 	 */
3261 	if (hvn_nvs_connect_chim(sc))
3262 		return -1;
3263 
3264 	return 0;
3265 }
3266 
3267 static int
3268 hvn_nvs_connect_rxbuf(struct hvn_softc *sc)
3269 {
3270 	struct hvn_nvs_rxbuf_conn cmd;
3271 	struct hvn_nvs_rxbuf_conn_resp *rsp;
3272 	uint64_t tid;
3273 
3274 	if (vmbus_handle_alloc(sc->sc_prichan, &sc->sc_rx_dma, sc->sc_rx_size,
3275 	    &sc->sc_rx_hndl)) {
3276 		DPRINTF("%s: failed to obtain a PA handle\n",
3277 		    device_xname(sc->sc_dev));
3278 		return -1;
3279 	}
3280 
3281 	memset(&cmd, 0, sizeof(cmd));
3282 	cmd.nvs_type = HVN_NVS_TYPE_RXBUF_CONN;
3283 	cmd.nvs_gpadl = sc->sc_rx_hndl;
3284 	cmd.nvs_sig = HVN_NVS_RXBUF_SIG;
3285 
3286 	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3287 	mutex_enter(&sc->sc_nvsrsp_lock);
3288 	if (hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0))
3289 		goto errout;
3290 
3291 	rsp = (struct hvn_nvs_rxbuf_conn_resp *)&sc->sc_nvsrsp;
3292 	if (rsp->nvs_status != HVN_NVS_STATUS_OK) {
3293 		DPRINTF("%s: failed to set up the Rx ring\n",
3294 		    device_xname(sc->sc_dev));
3295 		goto errout;
3296 	}
3297 
3298 	SET(sc->sc_flags, HVN_SCF_RXBUF_CONNECTED);
3299 
3300 	if (rsp->nvs_nsect > 1) {
3301 		DPRINTF("%s: invalid number of Rx ring sections: %u\n",
3302 		    device_xname(sc->sc_dev), rsp->nvs_nsect);
3303 		goto errout;
3304 	}
3305 	mutex_exit(&sc->sc_nvsrsp_lock);
3306 
3307 	return 0;
3308 
3309  errout:
3310 	mutex_exit(&sc->sc_nvsrsp_lock);
3311 	hvn_nvs_disconnect_rxbuf(sc);
3312 	return -1;
3313 }
3314 
3315 static int
3316 hvn_nvs_disconnect_rxbuf(struct hvn_softc *sc)
3317 {
3318 	struct hvn_nvs_rxbuf_disconn cmd;
3319 	uint64_t tid;
3320 	int s, error;
3321 
3322 	if (ISSET(sc->sc_flags, HVN_SCF_RXBUF_CONNECTED)) {
3323 		memset(&cmd, 0, sizeof(cmd));
3324 		cmd.nvs_type = HVN_NVS_TYPE_RXBUF_DISCONN;
3325 		cmd.nvs_sig = HVN_NVS_RXBUF_SIG;
3326 
3327 		tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3328 		mutex_enter(&sc->sc_nvsrsp_lock);
3329 		error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid,
3330 		    HVN_NVS_CMD_NORESP);
3331 		if (error) {
3332 			device_printf(sc->sc_dev,
3333 			    "failed to send rxbuf disconn: %d\n", error);
3334 		}
3335 		CLR(sc->sc_flags, HVN_SCF_RXBUF_CONNECTED);
3336 		mutex_exit(&sc->sc_nvsrsp_lock);
3337 
3338 		/*
3339 		 * Wait for the hypervisor to receive this NVS request.
3340 		 *
3341 		 * NOTE:
3342 		 * The TX bufring will not be drained by the hypervisor
3343 		 * if the primary channel is revoked.
3344 		 */
3345 		while (!vmbus_channel_tx_empty(sc->sc_prichan) &&
3346 		    !vmbus_channel_is_revoked(sc->sc_prichan)) {
3347 			DELAY(20);
3348 			s = splnet();
3349 			hvn_nvs_intr1(&sc->sc_rxr[0], sc->sc_tx_process_limit,
3350 			    sc->sc_rx_process_limit);
3351 			splx(s);
3352 		}
3353 		/*
3354 		 * Linger long enough for NVS to disconnect RXBUF.
3355 		 */
3356 		DELAY(200);
3357 	}
3358 
3359 	if (sc->sc_prichan->ch_sc->sc_proto < VMBUS_VERSION_WIN10 &&
3360 	    sc->sc_rx_hndl) {
3361 		/*
3362 		 * Disconnect RXBUF from primary channel.
3363 		 */
3364 		vmbus_handle_free(sc->sc_prichan, sc->sc_rx_hndl);
3365 		sc->sc_rx_hndl = 0;
3366 	}
3367 
3368 	return 0;
3369 }
3370 
3371 static int
3372 hvn_nvs_connect_chim(struct hvn_softc *sc)
3373 {
3374 	struct hvn_nvs_chim_conn cmd;
3375 	const struct hvn_nvs_chim_conn_resp *rsp;
3376 	uint64_t tid;
3377 
3378 	mutex_init(&sc->sc_chim_bmap_lock, MUTEX_DEFAULT, IPL_NET);
3379 
3380 	/*
3381 	 * Connect chimney sending buffer GPADL to the primary channel.
3382 	 *
3383 	 * NOTE:
3384 	 * Only primary channel has chimney sending buffer connected to it.
3385 	 * Sub-channels just share this chimney sending buffer.
3386 	 */
3387 	if (vmbus_handle_alloc(sc->sc_prichan, &sc->sc_chim_dma, HVN_CHIM_SIZE,
3388 	    &sc->sc_chim_hndl)) {
3389 		DPRINTF("%s: failed to obtain a PA handle for chimney\n",
3390 		    device_xname(sc->sc_dev));
3391 		return -1;
3392 	}
3393 
3394 	memset(&cmd, 0, sizeof(cmd));
3395 	cmd.nvs_type = HVN_NVS_TYPE_CHIM_CONN;
3396 	cmd.nvs_gpadl = sc->sc_chim_hndl;
3397 	cmd.nvs_sig = HVN_NVS_CHIM_SIG;
3398 
3399 	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3400 	mutex_enter(&sc->sc_nvsrsp_lock);
3401 	if (hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0))
3402 		goto errout;
3403 
3404 	rsp = (struct hvn_nvs_chim_conn_resp *)&sc->sc_nvsrsp;
3405 	if (rsp->nvs_status != HVN_NVS_STATUS_OK) {
3406 		DPRINTF("%s: failed to set up chimney sending buffer\n",
3407 		    device_xname(sc->sc_dev));
3408 		goto errout;
3409 	}
3410 
3411 	if (rsp->nvs_sectsz == 0 ||
3412 	    (rsp->nvs_sectsz % sizeof(uint32_t)) != 0) {
3413 		/*
3414 		 * Can't use chimney sending buffer; done!
3415 		 */
3416 		if (rsp->nvs_sectsz == 0) {
3417 			device_printf(sc->sc_dev,
3418 			    "zero chimney sending buffer section size\n");
3419 		} else {
3420 			device_printf(sc->sc_dev,
3421 			    "misaligned chimney sending buffers,"
3422 			    " section size: %d\n", rsp->nvs_sectsz);
3423 		}
3424 		sc->sc_chim_szmax = 0;
3425 		sc->sc_chim_cnt = 0;
3426 	} else {
3427 		sc->sc_chim_szmax = rsp->nvs_sectsz;
3428 		sc->sc_chim_cnt = HVN_CHIM_SIZE / sc->sc_chim_szmax;
3429 	}
3430 
3431 	if (sc->sc_chim_szmax > 0) {
3432 		if ((HVN_CHIM_SIZE % sc->sc_chim_szmax) != 0) {
3433 			device_printf(sc->sc_dev,
3434 			    "chimney sending sections are not properly "
3435 			    "aligned\n");
3436 		}
3437 		if ((sc->sc_chim_cnt % LONG_BIT) != 0) {
3438 			device_printf(sc->sc_dev,
3439 			    "discard %d chimney sending sections\n",
3440 			    sc->sc_chim_cnt % LONG_BIT);
3441 		}
3442 
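		/*
		 * Track only whole bitmap words; the remainder sections
		 * reported above are never used.
		 */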
3443 		sc->sc_chim_bmap_cnt = sc->sc_chim_cnt / LONG_BIT;
3444 		sc->sc_chim_bmap = kmem_zalloc(sc->sc_chim_bmap_cnt *
3445 		    sizeof(u_long), KM_SLEEP);
3446 	}
3447 
3448 	/* Done! */
3449 	SET(sc->sc_flags, HVN_SCF_CHIM_CONNECTED);
3450 
3451 	aprint_verbose_dev(sc->sc_dev, "chimney sending buffer %d/%d\n",
3452 	    sc->sc_chim_szmax, sc->sc_chim_cnt);
3453 
3454 	mutex_exit(&sc->sc_nvsrsp_lock);
3455 
3456 	return 0;
3457 
3458 errout:
3459 	mutex_exit(&sc->sc_nvsrsp_lock);
3460 	hvn_nvs_disconnect_chim(sc);
3461 	return -1;
3462 }
3463 
3464 static int
3465 hvn_nvs_disconnect_chim(struct hvn_softc *sc)
3466 {
3467 	struct hvn_nvs_chim_disconn cmd;
3468 	uint64_t tid;
3469 	int s, error;
3470 
3471 	if (ISSET(sc->sc_flags, HVN_SCF_CHIM_CONNECTED)) {
3472 		memset(&cmd, 0, sizeof(cmd));
3473 		cmd.nvs_type = HVN_NVS_TYPE_CHIM_DISCONN;
3474 		cmd.nvs_sig = HVN_NVS_CHIM_SIG;
3475 
3476 		tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3477 		mutex_enter(&sc->sc_nvsrsp_lock);
3478 		error = hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid,
3479 		    HVN_NVS_CMD_NORESP);
3480 		if (error) {
3481 			device_printf(sc->sc_dev,
3482 			    "failed to send chim disconn: %d\n", error);
3483 		}
3484 		CLR(sc->sc_flags, HVN_SCF_CHIM_CONNECTED);
3485 		mutex_exit(&sc->sc_nvsrsp_lock);
3486 
3487 		/*
3488 		 * Wait for the hypervisor to receive this NVS request.
3489 		 *
3490 		 * NOTE:
3491 		 * The TX bufring will not be drained by the hypervisor
3492 		 * if the primary channel is revoked.
3493 		 */
3494 		while (!vmbus_channel_tx_empty(sc->sc_prichan) &&
3495 		    !vmbus_channel_is_revoked(sc->sc_prichan)) {
3496 			DELAY(20);
3497 			s = splnet();
3498 			hvn_nvs_intr1(&sc->sc_rxr[0], sc->sc_tx_process_limit,
3499 			    sc->sc_rx_process_limit);
3500 			splx(s);
3501 		}
3502 		/*
3503 		 * Linger long enough for NVS to disconnect chimney
3504 		 * sending buffer.
3505 		 */
3506 		DELAY(200);
3507 	}
3508 
3509 	if (sc->sc_prichan->ch_sc->sc_proto < VMBUS_VERSION_WIN10 &&
3510 	    sc->sc_chim_hndl) {
3511 		/*
3512 		 * Disconnect chimney sending buffer from primary channel.
3513 		 */
3514 		vmbus_handle_free(sc->sc_prichan, sc->sc_chim_hndl);
3515 		sc->sc_chim_hndl = 0;
3516 	}
3517 
3518 	if (sc->sc_chim_bmap != NULL) {
3519 		kmem_free(sc->sc_chim_bmap, sc->sc_chim_bmap_cnt * sizeof(u_long));
3520 		sc->sc_chim_bmap = NULL;
3521 		sc->sc_chim_bmap_cnt = 0;
3522 	}
3523 
3524 	mutex_destroy(&sc->sc_chim_bmap_lock);
3525 
3526 	return 0;
3527 }
3528 
3529 #define HVN_HANDLE_RING_DOTX	__BIT(0)
3530 
3531 static int
3532 hvn_handle_ring(struct hvn_rx_ring *rxr, int txlimit, int rxlimit)
3533 {
3534 	struct hvn_softc *sc = rxr->rxr_softc;
3535 	struct vmbus_chanpkt_hdr *cph;
3536 	const struct hvn_nvs_hdr *nvs;
3537 	uint64_t rid;
3538 	uint32_t rlen;
3539 	int n, tx = 0, rx = 0;
3540 	int result = 0;
3541 	int rv;
3542 
3543 	mutex_enter(&rxr->rxr_lock);
3544 	for (;;) {
3545 		rv = vmbus_channel_recv(rxr->rxr_chan, rxr->rxr_nvsbuf,
3546 		    HVN_NVS_BUFSIZE, &rlen, &rid, 1);
3547 		if (rv != 0 || rlen == 0) {
3548 			if (rv != EAGAIN)
3549 				device_printf(sc->sc_dev,
3550 				    "failed to receive an NVSP packet\n");
3551 			break;
3552 		}
3553 		cph = (struct vmbus_chanpkt_hdr *)rxr->rxr_nvsbuf;
3554 		nvs = (const struct hvn_nvs_hdr *)VMBUS_CHANPKT_CONST_DATA(cph);
3555 
3556 		if (cph->cph_type == VMBUS_CHANPKT_TYPE_COMP) {
3557 			switch (nvs->nvs_type) {
3558 			case HVN_NVS_TYPE_INIT_RESP:
3559 			case HVN_NVS_TYPE_RXBUF_CONNRESP:
3560 			case HVN_NVS_TYPE_CHIM_CONNRESP:
3561 			case HVN_NVS_TYPE_SUBCH_RESP:
3562 				mutex_enter(&sc->sc_nvsrsp_lock);
3563 				/* copy the response back */
3564 				memcpy(&sc->sc_nvsrsp, nvs, HVN_NVS_MSGSIZE);
3565 				sc->sc_nvsdone = 1;
3566 				cv_signal(&sc->sc_nvsrsp_cv);
3567 				mutex_exit(&sc->sc_nvsrsp_lock);
3568 				break;
3569 			case HVN_NVS_TYPE_RNDIS_ACK:
3570 				if (rxr->rxr_txr == NULL)
3571 					break;
3572 
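				/*
				 * Send-done: reclaim the descriptor and
				 * let transmission resume.
				 */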
3573 				result |= HVN_HANDLE_RING_DOTX;
3574 				mutex_enter(&rxr->rxr_txr->txr_lock);
3575 				hvn_txeof(rxr->rxr_txr, cph->cph_tid);
3576 				mutex_exit(&rxr->rxr_txr->txr_lock);
3577 				if (txlimit > 0 && ++tx >= txlimit)
3578 					goto out;
3579 				break;
3580 			default:
3581 				device_printf(sc->sc_dev,
3582 				    "unhandled NVSP packet type %u "
3583 				    "on completion\n", nvs->nvs_type);
3584 				break;
3585 			}
3586 		} else if (cph->cph_type == VMBUS_CHANPKT_TYPE_RXBUF) {
3587 			switch (nvs->nvs_type) {
3588 			case HVN_NVS_TYPE_RNDIS:
3589 				n = hvn_rndis_input(rxr, cph->cph_tid, cph);
3590 				if (rxlimit > 0) {
3591 					if (n < 0)
3592 						goto out;
3593 					rx += n;
3594 					if (rx >= rxlimit)
3595 						goto out;
3596 				}
3597 				break;
3598 			default:
3599 				device_printf(sc->sc_dev,
3600 				    "unhandled NVSP packet type %u "
3601 				    "on receive\n", nvs->nvs_type);
3602 				break;
3603 			}
3604 		} else if (cph->cph_type == VMBUS_CHANPKT_TYPE_INBAND) {
3605 			switch (nvs->nvs_type) {
3606 			case HVN_NVS_TYPE_TXTBL_NOTE:
3607 				/* Useless; ignore */
3608 				break;
3609 			default:
3610 				device_printf(sc->sc_dev,
3611 				    "got notify, nvs type %u\n", nvs->nvs_type);
3612 				break;
3613 			}
3614 		} else
3615 			device_printf(sc->sc_dev,
3616 			    "unknown NVSP packet type %u\n", cph->cph_type);
3617 	}
3618 out:
3619 	mutex_exit(&rxr->rxr_lock);
3620 
3621 	return result;
3622 }
3623 
3624 static void
3625 hvn_nvs_intr1(struct hvn_rx_ring *rxr, int txlimit, int rxlimit)
3626 {
3627 	struct hvn_softc *sc = rxr->rxr_softc;
3628 	struct ifnet *ifp = SC2IFP(sc);
3629 	struct hvn_tx_ring *txr = rxr->rxr_txr;
3630 	int result;
3631 
3632 	rxr->rxr_workqueue = sc->sc_txrx_workqueue;
3633 
3634 	result = hvn_handle_ring(rxr, txlimit, rxlimit);
3635 
3636 	if ((result & HVN_HANDLE_RING_DOTX) && txr != NULL) {
3637 		mutex_enter(&txr->txr_lock);
3638 		/* ALTQ */
3639 		if (txr->txr_id == 0) {
3640 			if_schedule_deferred_start(ifp);
3641 		}
3642 		softint_schedule(txr->txr_si);
3643 		mutex_exit(&txr->txr_lock);
3644 	}
3645 }
3646 
3647 static void
3648 hvn_schedule_handle_ring(struct hvn_softc *sc, struct hvn_rx_ring *rxr,
3649     bool intr)
3650 {
3651 
3652 	KASSERT(mutex_owned(&rxr->rxr_onwork_lock));
3653 
3654 	if (rxr->rxr_workqueue) {
3655 		if (!rxr->rxr_onlist) {
3656 			rxr->rxr_onlist = true;
3657 			if (intr)
3658 				rxr->rxr_evdeferreq.ev_count++;
3659 			else
3660 				rxr->rxr_evredeferreq.ev_count++;
3661 			workqueue_enqueue(sc->sc_wq, &rxr->rxr_wk, NULL);
3662 		}
3663 	} else {
3664 		rxr->rxr_onlist = true;
3665 		if (intr)
3666 			rxr->rxr_evdeferreq.ev_count++;
3667 		else
3668 			rxr->rxr_evredeferreq.ev_count++;
3669 		softint_schedule(rxr->rxr_si);
3670 	}
3671 }
3672 
3673 static void
3674 hvn_handle_ring_common(struct hvn_rx_ring *rxr)
3675 {
3676 	struct hvn_softc *sc = rxr->rxr_softc;
3677 	int txlimit = sc->sc_tx_process_limit;
3678 	int rxlimit = sc->sc_rx_process_limit;
3679 
3680 	rxr->rxr_evdefer.ev_count++;
3681 
3682 	mutex_enter(&rxr->rxr_onwork_lock);
3683 	rxr->rxr_onproc = true;
3684 	rxr->rxr_onlist = false;
3685 	mutex_exit(&rxr->rxr_onwork_lock);
3686 
3687 	hvn_nvs_intr1(rxr, txlimit, rxlimit);
3688 
3689 	mutex_enter(&rxr->rxr_onwork_lock);
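	/*
	 * More work arrived while the channel was paused;
	 * pause it again and reschedule ourselves.
	 */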
3690 	if (vmbus_channel_unpause(rxr->rxr_chan)) {
3691 		vmbus_channel_pause(rxr->rxr_chan);
3692 		hvn_schedule_handle_ring(sc, rxr, false);
3693 	}
3694 	rxr->rxr_onproc = false;
3695 	cv_broadcast(&rxr->rxr_onwork_cv);
3696 	mutex_exit(&rxr->rxr_onwork_lock);
3697 }
3698 
3699 static void
3700 hvn_handle_ring_work(struct work *wk, void *arg)
3701 {
3702 	struct hvn_rx_ring *rxr = container_of(wk, struct hvn_rx_ring, rxr_wk);
3703 
3704 	hvn_handle_ring_common(rxr);
3705 }
3706 
3707 static void
3708 hvn_nvs_softintr(void *arg)
3709 {
3710 	struct hvn_rx_ring *rxr = arg;
3711 
3712 	hvn_handle_ring_common(rxr);
3713 }
3714 
3715 static void
3716 hvn_nvs_intr(void *arg)
3717 {
3718 	struct hvn_rx_ring *rxr = arg;
3719 	struct hvn_softc *sc = rxr->rxr_softc;
3720 	int txlimit = cold ? 0 : sc->sc_tx_intr_process_limit;
3721 	int rxlimit = cold ? 0 : sc->sc_rx_intr_process_limit;
3722 
3723 	rxr->rxr_evintr.ev_count++;
3724 
3725 	KASSERT(!rxr->rxr_onproc);
3726 	KASSERT(!rxr->rxr_onlist);
3727 
3728 	vmbus_channel_pause(rxr->rxr_chan);
3729 
3730 	hvn_nvs_intr1(rxr, txlimit, rxlimit);
3731 
3732 	if (vmbus_channel_unpause(rxr->rxr_chan) && !cold) {
3733 		vmbus_channel_pause(rxr->rxr_chan);
3734 		mutex_enter(&rxr->rxr_onwork_lock);
3735 		hvn_schedule_handle_ring(sc, rxr, true);
3736 		mutex_exit(&rxr->rxr_onwork_lock);
3737 	}
3738 }
3739 
3740 static int
3741 hvn_nvs_cmd(struct hvn_softc *sc, void *cmd, size_t cmdsize, uint64_t tid,
3742     u_int flags)
3743 {
3744 	struct hvn_rx_ring *rxr = &sc->sc_rxr[0];	/* primary channel */
3745 	struct hvn_nvs_hdr *hdr = cmd;
3746 	int tries = 10;
3747 	int rv, s;
3748 
3749 	KASSERT(mutex_owned(&sc->sc_nvsrsp_lock));
3750 
3751 	sc->sc_nvsdone = 0;
3752 
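	/*
	 * Send the command, retrying for up to ~10ms while the
	 * channel's TX bufring is full (EAGAIN).
	 */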
3753 	do {
3754 		rv = vmbus_channel_send(rxr->rxr_chan, cmd, cmdsize,
3755 		    tid, VMBUS_CHANPKT_TYPE_INBAND,
3756 		    ISSET(flags, HVN_NVS_CMD_NORESP) ? 0 :
3757 		      VMBUS_CHANPKT_FLAG_RC);
3758 		if (rv == EAGAIN) {
3759 			DELAY(1000);
3760 		} else if (rv) {
3761 			DPRINTF("%s: NVSP operation %u send error %d\n",
3762 			    device_xname(sc->sc_dev), hdr->nvs_type, rv);
3763 			return rv;
3764 		}
3765 	} while (rv != 0 && --tries > 0);
3766 
3767 	if (tries == 0 && rv != 0) {
3768 		device_printf(sc->sc_dev,
3769 		    "NVSP operation %u send error %d\n", hdr->nvs_type, rv);
3770 		return rv;
3771 	}
3772 
3773 	if (ISSET(flags, HVN_NVS_CMD_NORESP))
3774 		return 0;
3775 
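	/*
	 * Poll for the response ourselves: hvn_nvs_intr1() copies it
	 * into sc_nvsrsp and sets sc_nvsdone.
	 */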
3776 	while (!sc->sc_nvsdone && !ISSET(sc->sc_flags, HVN_SCF_REVOKED)) {
3777 		mutex_exit(&sc->sc_nvsrsp_lock);
3778 		DELAY(1000);
3779 		s = splnet();
3780 		hvn_nvs_intr1(rxr, 0, 0);
3781 		splx(s);
3782 		mutex_enter(&sc->sc_nvsrsp_lock);
3783 	}
3784 
3785 	return 0;
3786 }
3787 
3788 static int
3789 hvn_nvs_ack(struct hvn_rx_ring *rxr, uint64_t tid)
3790 {
3791 	struct hvn_softc *sc __unused = rxr->rxr_softc;
3792 	struct hvn_nvs_rndis_ack cmd;
3793 	int tries = 5;
3794 	int rv;
3795 
3796 	cmd.nvs_type = HVN_NVS_TYPE_RNDIS_ACK;
3797 	cmd.nvs_status = HVN_NVS_STATUS_OK;
3798 	do {
3799 		rv = vmbus_channel_send(rxr->rxr_chan, &cmd, sizeof(cmd),
3800 		    tid, VMBUS_CHANPKT_TYPE_COMP, 0);
3801 		if (rv == EAGAIN)
3802 			DELAY(10);
3803 		else if (rv) {
3804 			DPRINTF("%s: NVSP acknowledgement error %d\n",
3805 			    device_xname(sc->sc_dev), rv);
3806 			return rv;
3807 		}
3808 	} while (rv != 0 && --tries > 0);
3809 	return rv;
3810 }
3811 
3812 static void
3813 hvn_nvs_detach(struct hvn_softc *sc)
3814 {
3815 
3816 	hvn_nvs_disconnect_rxbuf(sc);
3817 	hvn_nvs_disconnect_chim(sc);
3818 }
3819 
3820 static int
3821 hvn_nvs_alloc_subchannels(struct hvn_softc *sc, int *nsubchp)
3822 {
3823 	struct hvn_nvs_subch_req cmd;
3824 	struct hvn_nvs_subch_resp *rsp;
3825 	uint64_t tid;
3826 	int nsubch, nsubch_req;
3827 
3828 	nsubch_req = *nsubchp;
3829 	KASSERTMSG(nsubch_req > 0, "invalid # of sub-channels %d", nsubch_req);
3830 
3831 	memset(&cmd, 0, sizeof(cmd));
3832 	cmd.nvs_type = HVN_NVS_TYPE_SUBCH_REQ;
3833 	cmd.nvs_op = HVN_NVS_SUBCH_OP_ALLOC;
3834 	cmd.nvs_nsubch = nsubch_req;
3835 
3836 	tid = atomic_inc_uint_nv(&sc->sc_nvstid);
3837 	mutex_enter(&sc->sc_nvsrsp_lock);
3838 	if (hvn_nvs_cmd(sc, &cmd, sizeof(cmd), tid, 0)) {
3839 		mutex_exit(&sc->sc_nvsrsp_lock);
3840 		return EIO;
3841 	}
3842 
3843 	rsp = (struct hvn_nvs_subch_resp *)&sc->sc_nvsrsp;
3844 	if (rsp->nvs_status != HVN_NVS_STATUS_OK) {
3845 		mutex_exit(&sc->sc_nvsrsp_lock);
3846 		DPRINTF("%s: failed to alloc sub-channels\n",
3847 		    device_xname(sc->sc_dev));
3848 		return EIO;
3849 	}
3850 
3851 	nsubch = rsp->nvs_nsubch;
3852 	if (nsubch > nsubch_req) {
3853 		aprint_debug_dev(sc->sc_dev,
3854 		    "%u subchans are allocated, requested %d\n",
3855 		    nsubch, nsubch_req);
3856 		nsubch = nsubch_req;
3857 	}
3858 	mutex_exit(&sc->sc_nvsrsp_lock);
3859 
3860 	*nsubchp = nsubch;
3861 
3862 	return 0;
3863 }
3864 
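/*
 * RNDIS control requests move through three queues: the free queue
 * (sc_cntl_fq), the submission queue (sc_cntl_sq) while a response is
 * outstanding, and the completion queue (sc_cntl_cq) once a response
 * has been matched by request id.  hvn_rollback_cmd() pulls a request
 * back off the submission queue when no response will ever arrive.
 */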
3865 static inline struct rndis_cmd *
3866 hvn_alloc_cmd(struct hvn_softc *sc)
3867 {
3868 	struct rndis_cmd *rc;
3869 
3870 	mutex_enter(&sc->sc_cntl_fqlck);
3871 	while ((rc = TAILQ_FIRST(&sc->sc_cntl_fq)) == NULL)
3872 		cv_wait(&sc->sc_cntl_fqcv, &sc->sc_cntl_fqlck);
3873 	TAILQ_REMOVE(&sc->sc_cntl_fq, rc, rc_entry);
3874 	mutex_exit(&sc->sc_cntl_fqlck);
3875 	return rc;
3876 }
3877 
3878 static inline void
3879 hvn_submit_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3880 {
3881 
3882 	mutex_enter(&sc->sc_cntl_sqlck);
3883 	TAILQ_INSERT_TAIL(&sc->sc_cntl_sq, rc, rc_entry);
3884 	mutex_exit(&sc->sc_cntl_sqlck);
3885 }
3886 
3887 static inline struct rndis_cmd *
3888 hvn_complete_cmd(struct hvn_softc *sc, uint32_t id)
3889 {
3890 	struct rndis_cmd *rc;
3891 
3892 	mutex_enter(&sc->sc_cntl_sqlck);
3893 	TAILQ_FOREACH(rc, &sc->sc_cntl_sq, rc_entry) {
3894 		if (rc->rc_id == id) {
3895 			TAILQ_REMOVE(&sc->sc_cntl_sq, rc, rc_entry);
3896 			break;
3897 		}
3898 	}
3899 	mutex_exit(&sc->sc_cntl_sqlck);
3900 	if (rc != NULL) {
3901 		mutex_enter(&sc->sc_cntl_cqlck);
3902 		TAILQ_INSERT_TAIL(&sc->sc_cntl_cq, rc, rc_entry);
3903 		mutex_exit(&sc->sc_cntl_cqlck);
3904 	}
3905 	return rc;
3906 }
3907 
3908 static inline void
3909 hvn_release_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3910 {
3911 
3912 	mutex_enter(&sc->sc_cntl_cqlck);
3913 	TAILQ_REMOVE(&sc->sc_cntl_cq, rc, rc_entry);
3914 	mutex_exit(&sc->sc_cntl_cqlck);
3915 }
3916 
3917 static inline int
3918 hvn_rollback_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3919 {
3920 	struct rndis_cmd *rn;
3921 
3922 	mutex_enter(&sc->sc_cntl_sqlck);
3923 	TAILQ_FOREACH(rn, &sc->sc_cntl_sq, rc_entry) {
3924 		if (rn == rc) {
3925 			TAILQ_REMOVE(&sc->sc_cntl_sq, rc, rc_entry);
3926 			mutex_exit(&sc->sc_cntl_sqlck);
3927 			return 0;
3928 		}
3929 	}
3930 	mutex_exit(&sc->sc_cntl_sqlck);
3931 	return -1;
3932 }
3933 
3934 static inline void
3935 hvn_free_cmd(struct hvn_softc *sc, struct rndis_cmd *rc)
3936 {
3937 
3938 	memset(rc->rc_req, 0, sizeof(struct rndis_packet_msg));
3939 	memset(&rc->rc_cmp, 0, sizeof(rc->rc_cmp));
3940 	memset(&rc->rc_msg, 0, sizeof(rc->rc_msg));
3941 	mutex_enter(&sc->sc_cntl_fqlck);
3942 	TAILQ_INSERT_TAIL(&sc->sc_cntl_fq, rc, rc_entry);
3943 	cv_signal(&sc->sc_cntl_fqcv);
3944 	mutex_exit(&sc->sc_cntl_fqlck);
3945 }
3946 
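/*
 * Allocate the fixed pool of RNDIS control requests.  Each request is
 * backed by one DMA-loaded page; its page frame number (rc_gpa) is
 * what later goes into the scatter/gather element handed to the host.
 */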
3947 static int
3948 hvn_rndis_init(struct hvn_softc *sc)
3949 {
3950 	struct rndis_cmd *rc;
3951 	int i;
3952 
3953 	/* RNDIS control message queues */
3954 	TAILQ_INIT(&sc->sc_cntl_sq);
3955 	TAILQ_INIT(&sc->sc_cntl_cq);
3956 	TAILQ_INIT(&sc->sc_cntl_fq);
3957 	mutex_init(&sc->sc_cntl_sqlck, MUTEX_DEFAULT, IPL_NET);
3958 	mutex_init(&sc->sc_cntl_cqlck, MUTEX_DEFAULT, IPL_NET);
3959 	mutex_init(&sc->sc_cntl_fqlck, MUTEX_DEFAULT, IPL_NET);
3960 	cv_init(&sc->sc_cntl_fqcv, "nvsalloc");
3961 
3962 	for (i = 0; i < HVN_RNDIS_CTLREQS; i++) {
3963 		rc = &sc->sc_cntl_msgs[i];
3964 		if (bus_dmamap_create(sc->sc_dmat, PAGE_SIZE, 1, PAGE_SIZE, 0,
3965 		    BUS_DMA_WAITOK, &rc->rc_dmap)) {
3966 			DPRINTF("%s: failed to create RNDIS command map\n",
3967 			    device_xname(sc->sc_dev));
3968 			goto errout;
3969 		}
3970 		if (bus_dmamem_alloc(sc->sc_dmat, PAGE_SIZE, PAGE_SIZE,
3971 		    0, &rc->rc_segs, 1, &rc->rc_nsegs, BUS_DMA_WAITOK)) {
3972 			DPRINTF("%s: failed to allocate RNDIS command\n",
3973 			    device_xname(sc->sc_dev));
3974 			bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
3975 			goto errout;
3976 		}
3977 		if (bus_dmamem_map(sc->sc_dmat, &rc->rc_segs, rc->rc_nsegs,
3978 		    PAGE_SIZE, (void **)&rc->rc_req, BUS_DMA_WAITOK)) {
3979 			DPRINTF("%s: failed to allocate RNDIS command\n",
3980 			    device_xname(sc->sc_dev));
3981 			bus_dmamem_free(sc->sc_dmat, &rc->rc_segs,
3982 			    rc->rc_nsegs);
3983 			bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
3984 			goto errout;
3985 		}
3986 		memset(rc->rc_req, 0, PAGE_SIZE);
3987 		if (bus_dmamap_load(sc->sc_dmat, rc->rc_dmap, rc->rc_req,
3988 		    PAGE_SIZE, NULL, BUS_DMA_WAITOK)) {
3989 			DPRINTF("%s: failed to load RNDIS command map\n",
3990 			    device_xname(sc->sc_dev));
3991 			bus_dmamem_unmap(sc->sc_dmat, rc->rc_req, PAGE_SIZE);
3992 			rc->rc_req = NULL;
3993 			bus_dmamem_free(sc->sc_dmat, &rc->rc_segs,
3994 			    rc->rc_nsegs);
3995 			bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
3996 			goto errout;
3997 		}
3998 		rc->rc_gpa = atop(rc->rc_dmap->dm_segs[0].ds_addr);
3999 		mutex_init(&rc->rc_lock, MUTEX_DEFAULT, IPL_NET);
4000 		cv_init(&rc->rc_cv, "rndiscmd");
4001 		TAILQ_INSERT_TAIL(&sc->sc_cntl_fq, rc, rc_entry);
4002 	}
4003 
4004 	/* Initialize RNDIS Data command */
4005 	memset(&sc->sc_data_msg, 0, sizeof(sc->sc_data_msg));
4006 	sc->sc_data_msg.nvs_type = HVN_NVS_TYPE_RNDIS;
4007 	sc->sc_data_msg.nvs_rndis_mtype = HVN_NVS_RNDIS_MTYPE_DATA;
4008 	sc->sc_data_msg.nvs_chim_idx = HVN_NVS_CHIM_IDX_INVALID;
4009 
4010 	return 0;
4011 
4012 errout:
4013 	hvn_rndis_destroy(sc);
4014 	return -1;
4015 }
4016 
4017 static void
4018 hvn_rndis_destroy(struct hvn_softc *sc)
4019 {
4020 	struct rndis_cmd *rc;
4021 	int i;
4022 
4023 	for (i = 0; i < HVN_RNDIS_CTLREQS; i++) {
4024 		rc = &sc->sc_cntl_msgs[i];
4025 		if (rc->rc_req == NULL)
4026 			continue;
4027 
4028 		TAILQ_REMOVE(&sc->sc_cntl_fq, rc, rc_entry);
4029 		bus_dmamap_unload(sc->sc_dmat, rc->rc_dmap);
4030 		bus_dmamem_unmap(sc->sc_dmat, rc->rc_req, PAGE_SIZE);
4031 		rc->rc_req = NULL;
4032 		bus_dmamem_free(sc->sc_dmat, &rc->rc_segs, rc->rc_nsegs);
4033 		bus_dmamap_destroy(sc->sc_dmat, rc->rc_dmap);
4034 		mutex_destroy(&rc->rc_lock);
4035 		cv_destroy(&rc->rc_cv);
4036 	}
4037 
4038 	mutex_destroy(&sc->sc_cntl_sqlck);
4039 	mutex_destroy(&sc->sc_cntl_cqlck);
4040 	mutex_destroy(&sc->sc_cntl_fqlck);
4041 	cv_destroy(&sc->sc_cntl_fqcv);
4042 }
4043 
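/*
 * REMOTE_NDIS_INITIALIZE_MSG handshake.  The completion carries the
 * host's packet aggregation limits (size, count, alignment), which are
 * recorded for the transmit path; an alignment below 4 bytes is raised,
 * since the RNDIS packet message encapsulation assumes at least that.
 */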
4044 static int
4045 hvn_rndis_attach(struct hvn_softc *sc, int mtu)
4046 {
4047 	struct rndis_init_req *req;
4048 	struct rndis_init_comp *cmp;
4049 	struct rndis_cmd *rc;
4050 	int rv;
4051 
4052 	rc = hvn_alloc_cmd(sc);
4053 
4054 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4055 	    BUS_DMASYNC_PREREAD);
4056 
4057 	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
4058 
4059 	req = rc->rc_req;
4060 	req->rm_type = REMOTE_NDIS_INITIALIZE_MSG;
4061 	req->rm_len = sizeof(*req);
4062 	req->rm_rid = rc->rc_id;
4063 	req->rm_ver_major = RNDIS_VERSION_MAJOR;
4064 	req->rm_ver_minor = RNDIS_VERSION_MINOR;
4065 	req->rm_max_xfersz = HVN_RNDIS_XFER_SIZE;
4066 
4067 	rc->rc_cmplen = sizeof(*cmp);
4068 
4069 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4070 	    BUS_DMASYNC_PREWRITE);
4071 
4072 	if ((rv = hvn_rndis_cmd(sc, rc, 0)) != 0) {
4073 		DPRINTF("%s: INITIALIZE_MSG failed, error %d\n",
4074 		    device_xname(sc->sc_dev), rv);
4075 		hvn_free_cmd(sc, rc);
4076 		return -1;
4077 	}
4078 	cmp = (struct rndis_init_comp *)&rc->rc_cmp;
4079 	if (cmp->rm_status != RNDIS_STATUS_SUCCESS) {
4080 		DPRINTF("%s: failed to init RNDIS, error %#x\n",
4081 		    device_xname(sc->sc_dev), cmp->rm_status);
4082 		hvn_free_cmd(sc, rc);
4083 		return -1;
4084 	}
4085 
4086 	sc->sc_rndis_agg_size = cmp->rm_pktmaxsz;
4087 	sc->sc_rndis_agg_pkts = cmp->rm_pktmaxcnt;
4088 	sc->sc_rndis_agg_align = __BIT(cmp->rm_align);
4089 
4090 	if (sc->sc_rndis_agg_align < sizeof(uint32_t)) {
4091 		/*
4092 		 * The RNDIS packet message encap assumes that the RNDIS
4093 		 * packet message is at least 4 bytes aligned.  Fix up the
4094 		 * alignment here, if the remote side sets the alignment
4095 		 * too low.
4096 		 */
4097 		aprint_verbose_dev(sc->sc_dev,
4098 		    "fixup RNDIS aggpkt align: %u -> %zu\n",
4099 		    sc->sc_rndis_agg_align, sizeof(uint32_t));
4100 		sc->sc_rndis_agg_align = sizeof(uint32_t);
4101 	}
4102 
4103 	aprint_verbose_dev(sc->sc_dev,
4104 	    "RNDIS ver %u.%u, aggpkt size %u, aggpkt cnt %u, aggpkt align %u\n",
4105 	    cmp->rm_ver_major, cmp->rm_ver_minor, sc->sc_rndis_agg_size,
4106 	    sc->sc_rndis_agg_pkts, sc->sc_rndis_agg_align);
4107 
4108 	hvn_free_cmd(sc, rc);
4109 
4110 	return 0;
4111 }
4112 
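/*
 * Query OID_GEN_RECEIVE_SCALE_CAPABILITIES (NDIS 6.20+) and derive the
 * usable RX ring count, the RSS indirect table size and the supported
 * hash function/type bits for later RSS configuration.
 */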
4113 static int
4114 hvn_get_rsscaps(struct hvn_softc *sc, int *nrxr)
4115 {
4116 	struct ndis_rss_caps in, caps;
4117 	size_t caps_len;
4118 	int error, rxr_cnt, indsz, hash_fnidx;
4119 	uint32_t hash_func = 0, hash_types = 0;
4120 
4121 	*nrxr = 0;
4122 
4123 	if (sc->sc_ndisver < NDIS_VERSION_6_20)
4124 		return EOPNOTSUPP;
4125 
4126 	memset(&in, 0, sizeof(in));
4127 	in.ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_CAPS;
4128 	in.ndis_hdr.ndis_rev = NDIS_RSS_CAPS_REV_2;
4129 	in.ndis_hdr.ndis_size = NDIS_RSS_CAPS_SIZE;
4130 
4131 	caps_len = NDIS_RSS_CAPS_SIZE;
4132 	error = hvn_rndis_query2(sc, OID_GEN_RECEIVE_SCALE_CAPABILITIES,
4133 	    &in, NDIS_RSS_CAPS_SIZE, &caps, &caps_len, NDIS_RSS_CAPS_SIZE_6_0);
4134 	if (error)
4135 		return error;
4136 
4137 	/*
4138 	 * Preliminary verification.
4139 	 */
4140 	if (caps.ndis_hdr.ndis_type != NDIS_OBJTYPE_RSS_CAPS) {
4141 		DPRINTF("%s: invalid NDIS objtype 0x%02x\n",
4142 		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_type);
4143 		return EINVAL;
4144 	}
4145 	if (caps.ndis_hdr.ndis_rev < NDIS_RSS_CAPS_REV_1) {
4146 		DPRINTF("%s: invalid NDIS objrev 0x%02x\n",
4147 		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_rev);
4148 		return EINVAL;
4149 	}
4150 	if (caps.ndis_hdr.ndis_size > caps_len) {
4151 		DPRINTF("%s: invalid NDIS objsize %u, data size %zu\n",
4152 		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_size,
4153 		    caps_len);
4154 		return EINVAL;
4155 	} else if (caps.ndis_hdr.ndis_size < NDIS_RSS_CAPS_SIZE_6_0) {
4156 		DPRINTF("%s: invalid NDIS objsize %u\n",
4157 		    device_xname(sc->sc_dev), caps.ndis_hdr.ndis_size);
4158 		return EINVAL;
4159 	}
4160 
4161 	/*
4162 	 * Save information for later RSS configuration.
4163 	 */
4164 	if (caps.ndis_nrxr == 0) {
4165 		DPRINTF("%s: 0 RX rings!?\n", device_xname(sc->sc_dev));
4166 		return EINVAL;
4167 	}
4168 	rxr_cnt = caps.ndis_nrxr;
4169 	aprint_debug_dev(sc->sc_dev, "%u Rx rings\n", rxr_cnt);
4170 
4171 	if (caps.ndis_hdr.ndis_size == NDIS_RSS_CAPS_SIZE &&
4172 	    caps.ndis_hdr.ndis_rev >= NDIS_RSS_CAPS_REV_2) {
4173 		if (caps.ndis_nind > NDIS_HASH_INDCNT) {
4174 			DPRINTF("%s: too many RSS indirect table entries %u\n",
4175 			    device_xname(sc->sc_dev), caps.ndis_nind);
4176 			return EOPNOTSUPP;
4177 		}
4178 		if (!powerof2(caps.ndis_nind)) {
4179 			DPRINTF("%s: RSS indirect table size is not power-of-2:"
4180 			    " %u\n", device_xname(sc->sc_dev), caps.ndis_nind);
4181 			return EOPNOTSUPP;
4182 		}
4183 
4184 		indsz = caps.ndis_nind;
4185 	} else {
4186 		indsz = NDIS_HASH_INDCNT;
4187 	}
4188 	if (rxr_cnt > indsz) {
4189 		aprint_debug_dev(sc->sc_dev,
4190 		    "# of RX rings (%u) > RSS indirect table size %u\n",
4191 		    rxr_cnt, indsz);
4192 		rxr_cnt = indsz;
4193 	}
4194 
4195 	/*
4196 	 * NOTE:
4197 	 * Toeplitz is at the lowest bit, and it is prefered; so ffs(),
4198 	 * Toeplitz is at the lowest bit, and it is preferred; so ffs(),
4199 	 */
4200 	hash_fnidx = ffs(caps.ndis_caps & NDIS_RSS_CAP_HASHFUNC_MASK);
4201 	if (hash_fnidx == 0) {
4202 		DPRINTF("%s: no hash functions, caps 0x%08x\n",
4203 		    device_xname(sc->sc_dev), caps.ndis_caps);
4204 		return EOPNOTSUPP;
4205 	}
4206 	hash_func = 1 << (hash_fnidx - 1);	/* ffs is 1-based */
4207 
4208 	if (caps.ndis_caps & NDIS_RSS_CAP_IPV4)
4209 		hash_types |= NDIS_HASH_IPV4 | NDIS_HASH_TCP_IPV4;
4210 	if (caps.ndis_caps & NDIS_RSS_CAP_IPV6)
4211 		hash_types |= NDIS_HASH_IPV6 | NDIS_HASH_TCP_IPV6;
4212 	if (caps.ndis_caps & NDIS_RSS_CAP_IPV6_EX)
4213 		hash_types |= NDIS_HASH_IPV6_EX | NDIS_HASH_TCP_IPV6_EX;
4214 	if (hash_types == 0) {
4215 		DPRINTF("%s: no hash types, caps 0x%08x\n",
4216 		    device_xname(sc->sc_dev), caps.ndis_caps);
4217 		return EOPNOTSUPP;
4218 	}
4219 	aprint_debug_dev(sc->sc_dev, "RSS caps %#x\n", caps.ndis_caps);
4220 
4221 	sc->sc_rss_ind_size = indsz;
4222 	sc->sc_rss_hcap = hash_func | hash_types;
4223 	if (sc->sc_caps & HVN_CAPS_UDPHASH) {
4224 		/* UDP 4-tuple hash is unconditionally enabled. */
4225 		sc->sc_rss_hcap |= NDIS_HASH_UDP_IPV4_X;
4226 	}
4227 	*nrxr = rxr_cnt;
4228 
4229 	return 0;
4230 }
4231 
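/*
 * Push the current RSS parameters (hash function/types, Toeplitz key
 * and indirect table, laid out as struct ndis_rssprm_toeplitz) to the
 * host via OID_GEN_RECEIVE_SCALE_PARAMETERS.
 */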
4232 static int
4233 hvn_set_rss(struct hvn_softc *sc, uint16_t flags)
4234 {
4235 	struct ndis_rssprm_toeplitz *rss = &sc->sc_rss;
4236 	struct ndis_rss_params *params = &rss->rss_params;
4237 	int len;
4238 
4239 	/*
4240 	 * Only NDIS 6.20+ is supported:
4241 	 * We only support 4-byte elements in the indirect table, which have
4242 	 * been adopted since NDIS 6.20.
4243 	 */
4244 	if (sc->sc_ndisver < NDIS_VERSION_6_20)
4245 		return 0;
4246 
4247 	/* XXX only one hash function can be specified, though; popcnt? */
4248 	KASSERTMSG((sc->sc_rss_hash & NDIS_HASH_FUNCTION_MASK),
4249 	    "no hash func %08x", sc->sc_rss_hash);
4250 	KASSERTMSG((sc->sc_rss_hash & NDIS_HASH_STD),
4251 	    "no standard hash types %08x", sc->sc_rss_hash);
4252 	KASSERTMSG(sc->sc_rss_ind_size > 0, "no indirect table size");
4253 
4254 	aprint_debug_dev(sc->sc_dev, "RSS indirect table size %d, hash %#x\n",
4255 	    sc->sc_rss_ind_size, sc->sc_rss_hash);
4256 
4257 	len = NDIS_RSSPRM_TOEPLITZ_SIZE(sc->sc_rss_ind_size);
4258 
4259 	memset(params, 0, sizeof(*params));
4260 	params->ndis_hdr.ndis_type = NDIS_OBJTYPE_RSS_PARAMS;
4261 	params->ndis_hdr.ndis_rev = NDIS_RSS_PARAMS_REV_2;
4262 	params->ndis_hdr.ndis_size = len;
4263 	params->ndis_flags = flags;
4264 	params->ndis_hash =
4265 	    sc->sc_rss_hash & (NDIS_HASH_FUNCTION_MASK | NDIS_HASH_STD);
4266 	params->ndis_indsize = sizeof(rss->rss_ind[0]) * sc->sc_rss_ind_size;
4267 	params->ndis_indoffset =
4268 	    offsetof(struct ndis_rssprm_toeplitz, rss_ind[0]);
4269 	params->ndis_keysize = sizeof(rss->rss_key);
4270 	params->ndis_keyoffset =
4271 	    offsetof(struct ndis_rssprm_toeplitz, rss_key[0]);
4272 
4273 	return hvn_rndis_set(sc, OID_GEN_RECEIVE_SCALE_PARAMETERS, rss, len);
4274 }
4275 
4276 static void
4277 hvn_fixup_rss_ind(struct hvn_softc *sc)
4278 {
4279 	struct ndis_rssprm_toeplitz *rss = &sc->sc_rss;
4280 	int i, nchan;
4281 
4282 	nchan = sc->sc_nrxr_inuse;
4283 	KASSERTMSG(nchan > 1, "invalid # of channels %d", nchan);
4284 
4285 	/*
4286 	 * Check indirect table to make sure that all channels in it
4287 	 * can be used.
4288 	 */
4289 	for (i = 0; i < NDIS_HASH_INDCNT; i++) {
4290 		if (rss->rss_ind[i] >= nchan) {
4291 			DPRINTF("%s: RSS indirect table %d fixup: %u -> %d\n",
4292 			    device_xname(sc->sc_dev), i, rss->rss_ind[i],
4293 			    nchan - 1);
4294 			rss->rss_ind[i] = nchan - 1;
4295 		}
4296 	}
4297 }
4298 
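/*
 * Query OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES.  The request header
 * revision/size must match the negotiated NDIS version, and the size
 * of the response in turn gates which fields may be trusted.
 */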
4299 static int
4300 hvn_get_hwcaps(struct hvn_softc *sc, struct ndis_offload *caps)
4301 {
4302 	struct ndis_offload in;
4303 	size_t caps_len, len;
4304 	int error;
4305 
4306 	memset(&in, 0, sizeof(in));
4307 	in.ndis_hdr.ndis_type = NDIS_OBJTYPE_OFFLOAD;
4308 	if (sc->sc_ndisver >= NDIS_VERSION_6_30) {
4309 		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_3;
4310 		len = in.ndis_hdr.ndis_size = NDIS_OFFLOAD_SIZE;
4311 	} else if (sc->sc_ndisver >= NDIS_VERSION_6_1) {
4312 		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_2;
4313 		len = in.ndis_hdr.ndis_size = NDIS_OFFLOAD_SIZE_6_1;
4314 	} else {
4315 		in.ndis_hdr.ndis_rev = NDIS_OFFLOAD_REV_1;
4316 		len = in.ndis_hdr.ndis_size = NDIS_OFFLOAD_SIZE_6_0;
4317 	}
4318 
4319 	caps_len = NDIS_OFFLOAD_SIZE;
4320 	error = hvn_rndis_query2(sc, OID_TCP_OFFLOAD_HARDWARE_CAPABILITIES,
4321 	    &in, len, caps, &caps_len, NDIS_OFFLOAD_SIZE_6_0);
4322 	if (error)
4323 		return error;
4324 
4325 	/*
4326 	 * Preliminary verification.
4327 	 */
4328 	if (caps->ndis_hdr.ndis_type != NDIS_OBJTYPE_OFFLOAD) {
4329 		DPRINTF("%s: invalid NDIS objtype 0x%02x\n",
4330 		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_type);
4331 		return EINVAL;
4332 	}
4333 	if (caps->ndis_hdr.ndis_rev < NDIS_OFFLOAD_REV_1) {
4334 		DPRINTF("%s: invalid NDIS objrev 0x%02x\n",
4335 		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_rev);
4336 		return EINVAL;
4337 	}
4338 	if (caps->ndis_hdr.ndis_size > caps_len) {
4339 		DPRINTF("%s: invalid NDIS objsize %u, data size %zu\n",
4340 		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_size,
4341 		    caps_len);
4342 		return EINVAL;
4343 	} else if (caps->ndis_hdr.ndis_size < NDIS_OFFLOAD_SIZE_6_0) {
4344 		DPRINTF("%s: invalid NDIS objsize %u\n",
4345 		    device_xname(sc->sc_dev), caps->ndis_hdr.ndis_size);
4346 		return EINVAL;
4347 	}
4348 
4349 	/*
4350 	 * NOTE:
4351 	 * caps->ndis_hdr.ndis_size MUST be checked before accessing
4352 	 * NDIS 6.1+ specific fields.
4353 	 */
4354 	aprint_debug_dev(sc->sc_dev, "hwcaps rev %u\n",
4355 	    caps->ndis_hdr.ndis_rev);
4356 
4357 	aprint_debug_dev(sc->sc_dev, "hwcaps csum: "
4358 	    "ip4 tx 0x%x/0x%x rx 0x%x/0x%x, "
4359 	    "ip6 tx 0x%x/0x%x rx 0x%x/0x%x\n",
4360 	    caps->ndis_csum.ndis_ip4_txcsum, caps->ndis_csum.ndis_ip4_txenc,
4361 	    caps->ndis_csum.ndis_ip4_rxcsum, caps->ndis_csum.ndis_ip4_rxenc,
4362 	    caps->ndis_csum.ndis_ip6_txcsum, caps->ndis_csum.ndis_ip6_txenc,
4363 	    caps->ndis_csum.ndis_ip6_rxcsum, caps->ndis_csum.ndis_ip6_rxenc);
4364 	aprint_debug_dev(sc->sc_dev, "hwcaps lsov2: "
4365 	    "ip4 maxsz %u minsg %u encap 0x%x, "
4366 	    "ip6 maxsz %u minsg %u encap 0x%x opts 0x%x\n",
4367 	    caps->ndis_lsov2.ndis_ip4_maxsz, caps->ndis_lsov2.ndis_ip4_minsg,
4368 	    caps->ndis_lsov2.ndis_ip4_encap, caps->ndis_lsov2.ndis_ip6_maxsz,
4369 	    caps->ndis_lsov2.ndis_ip6_minsg, caps->ndis_lsov2.ndis_ip6_encap,
4370 	    caps->ndis_lsov2.ndis_ip6_opts);
4371 
4372 	return 0;
4373 }
4374 
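/*
 * Translate the host's offload capabilities into an NDIS_OBJTYPE_DEFAULT
 * parameter set: enable TSOv4/v6 only when the encapsulation and the
 * size/segment limits are usable for the given MTU, and enable TX/RX
 * checksum offload per protocol as advertised.
 */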
4375 static int
4376 hvn_set_capabilities(struct hvn_softc *sc, int mtu)
4377 {
4378 	struct ndis_offload hwcaps;
4379 	struct ndis_offload_params params;
4380 	size_t len;
4381 	uint32_t caps = 0;
4382 	int error, tso_maxsz, tso_minsg;
4383 
4384 	error = hvn_get_hwcaps(sc, &hwcaps);
4385 	if (error) {
4386 		DPRINTF("%s: failed to query hwcaps\n",
4387 		    device_xname(sc->sc_dev));
4388 		return error;
4389 	}
4390 
4391 	/* NOTE: 0 means "no change" */
4392 	memset(&params, 0, sizeof(params));
4393 
4394 	params.ndis_hdr.ndis_type = NDIS_OBJTYPE_DEFAULT;
4395 	if (sc->sc_ndisver < NDIS_VERSION_6_30) {
4396 		params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_2;
4397 		len = params.ndis_hdr.ndis_size = NDIS_OFFLOAD_PARAMS_SIZE_6_1;
4398 	} else {
4399 		params.ndis_hdr.ndis_rev = NDIS_OFFLOAD_PARAMS_REV_3;
4400 		len = params.ndis_hdr.ndis_size = NDIS_OFFLOAD_PARAMS_SIZE;
4401 	}
4402 
4403 	/*
4404 	 * TSO4/TSO6 setup.
4405 	 */
4406 	tso_maxsz = IP_MAXPACKET;
4407 	tso_minsg = 2;
4408 	if (hwcaps.ndis_lsov2.ndis_ip4_encap & NDIS_OFFLOAD_ENCAP_8023) {
4409 		caps |= HVN_CAPS_TSO4;
4410 		params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_ON;
4411 
4412 		if (hwcaps.ndis_lsov2.ndis_ip4_maxsz < tso_maxsz)
4413 			tso_maxsz = hwcaps.ndis_lsov2.ndis_ip4_maxsz;
4414 		if (hwcaps.ndis_lsov2.ndis_ip4_minsg > tso_minsg)
4415 			tso_minsg = hwcaps.ndis_lsov2.ndis_ip4_minsg;
4416 	}
4417 	if ((hwcaps.ndis_lsov2.ndis_ip6_encap & NDIS_OFFLOAD_ENCAP_8023) &&
4418 	    (hwcaps.ndis_lsov2.ndis_ip6_opts & HVN_NDIS_LSOV2_CAP_IP6) ==
4419 	    HVN_NDIS_LSOV2_CAP_IP6) {
4420 		caps |= HVN_CAPS_TSO6;
4421 		params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_ON;
4422 
4423 		if (hwcaps.ndis_lsov2.ndis_ip6_maxsz < tso_maxsz)
4424 			tso_maxsz = hwcaps.ndis_lsov2.ndis_ip6_maxsz;
4425 		if (hwcaps.ndis_lsov2.ndis_ip6_minsg > tso_minsg)
4426 			tso_minsg = hwcaps.ndis_lsov2.ndis_ip6_minsg;
4427 	}
4428 	sc->sc_tso_szmax = 0;
4429 	sc->sc_tso_sgmin = 0;
4430 	if (caps & (HVN_CAPS_TSO4 | HVN_CAPS_TSO6)) {
4431 		KASSERTMSG(tso_maxsz <= IP_MAXPACKET,
4432 		    "invalid NDIS TSO maxsz %d", tso_maxsz);
4433 		KASSERTMSG(tso_minsg >= 2,
4434 		    "invalid NDIS TSO minsg %d", tso_minsg);
4435 		if (tso_maxsz < tso_minsg * mtu) {
4436 			DPRINTF("%s: invalid NDIS TSO config: "
4437 			    "maxsz %d, minsg %d, mtu %d; "
4438 			    "disable TSO4 and TSO6\n", device_xname(sc->sc_dev),
4439 			    tso_maxsz, tso_minsg, mtu);
4440 			caps &= ~(HVN_CAPS_TSO4 | HVN_CAPS_TSO6);
4441 			params.ndis_lsov2_ip4 = NDIS_OFFLOAD_LSOV2_OFF;
4442 			params.ndis_lsov2_ip6 = NDIS_OFFLOAD_LSOV2_OFF;
4443 		} else {
4444 			sc->sc_tso_szmax = tso_maxsz;
4445 			sc->sc_tso_sgmin = tso_minsg;
4446 			aprint_debug_dev(sc->sc_dev,
4447 			    "NDIS TSO szmax %d sgmin %d\n",
4448 			    sc->sc_tso_szmax, sc->sc_tso_sgmin);
4449 		}
4450 	}
4451 
4452 	/* IPv4 checksum */
4453 	if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HVN_NDIS_TXCSUM_CAP_IP4) ==
4454 	    HVN_NDIS_TXCSUM_CAP_IP4) {
4455 		caps |= HVN_CAPS_IPCS;
4456 		params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TX;
4457 	}
4458 	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_IP4) {
4459 		if (params.ndis_ip4csum == NDIS_OFFLOAD_PARAM_TX)
4460 			params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_TXRX;
4461 		else
4462 			params.ndis_ip4csum = NDIS_OFFLOAD_PARAM_RX;
4463 	}
4464 
4465 	/* TCP4 checksum */
4466 	if ((hwcaps.ndis_csum.ndis_ip4_txcsum & HVN_NDIS_TXCSUM_CAP_TCP4) ==
4467 	    HVN_NDIS_TXCSUM_CAP_TCP4) {
4468 		caps |= HVN_CAPS_TCP4CS;
4469 		params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TX;
4470 	}
4471 	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_TCP4) {
4472 		if (params.ndis_tcp4csum == NDIS_OFFLOAD_PARAM_TX)
4473 			params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_TXRX;
4474 		else
4475 			params.ndis_tcp4csum = NDIS_OFFLOAD_PARAM_RX;
4476 	}
4477 
4478 	/* UDP4 checksum */
4479 	if (hwcaps.ndis_csum.ndis_ip4_txcsum & NDIS_TXCSUM_CAP_UDP4) {
4480 		caps |= HVN_CAPS_UDP4CS;
4481 		params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TX;
4482 	}
4483 	if (hwcaps.ndis_csum.ndis_ip4_rxcsum & NDIS_RXCSUM_CAP_UDP4) {
4484 		if (params.ndis_udp4csum == NDIS_OFFLOAD_PARAM_TX)
4485 			params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_TXRX;
4486 		else
4487 			params.ndis_udp4csum = NDIS_OFFLOAD_PARAM_RX;
4488 	}
4489 
4490 	/* TCP6 checksum */
4491 	if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HVN_NDIS_TXCSUM_CAP_TCP6) ==
4492 	    HVN_NDIS_TXCSUM_CAP_TCP6) {
4493 		caps |= HVN_CAPS_TCP6CS;
4494 		params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TX;
4495 	}
4496 	if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_TCP6) {
4497 		if (params.ndis_tcp6csum == NDIS_OFFLOAD_PARAM_TX)
4498 			params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_TXRX;
4499 		else
4500 			params.ndis_tcp6csum = NDIS_OFFLOAD_PARAM_RX;
4501 	}
4502 
4503 	/* UDP6 checksum */
4504 	if ((hwcaps.ndis_csum.ndis_ip6_txcsum & HVN_NDIS_TXCSUM_CAP_UDP6) ==
4505 	    HVN_NDIS_TXCSUM_CAP_UDP6) {
4506 		caps |= HVN_CAPS_UDP6CS;
4507 		params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TX;
4508 	}
4509 	if (hwcaps.ndis_csum.ndis_ip6_rxcsum & NDIS_RXCSUM_CAP_UDP6) {
4510 		if (params.ndis_udp6csum == NDIS_OFFLOAD_PARAM_TX)
4511 			params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_TXRX;
4512 		else
4513 			params.ndis_udp6csum = NDIS_OFFLOAD_PARAM_RX;
4514 	}
4515 
4516 	aprint_debug_dev(sc->sc_dev, "offload csum: "
4517 	    "ip4 %u, tcp4 %u, udp4 %u, tcp6 %u, udp6 %u\n",
4518 	    params.ndis_ip4csum, params.ndis_tcp4csum, params.ndis_udp4csum,
4519 	    params.ndis_tcp6csum, params.ndis_udp6csum);
4520 	aprint_debug_dev(sc->sc_dev, "offload lsov2: ip4 %u, ip6 %u\n",
4521 	    params.ndis_lsov2_ip4, params.ndis_lsov2_ip6);
4522 
4523 	error = hvn_rndis_set(sc, OID_TCP_OFFLOAD_PARAMETERS, &params, len);
4524 	if (error) {
4525 		DPRINTF("%s: offload config failed: %d\n",
4526 		    device_xname(sc->sc_dev), error);
4527 		return error;
4528 	}
4529 
4530 	aprint_debug_dev(sc->sc_dev, "offload config done\n");
4531 	sc->sc_caps |= caps;
4532 
4533 	return 0;
4534 }
4535 
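/*
 * Send an RNDIS control request on the primary channel.  The request
 * page is passed by GPA in a single scatter/gather element; unless
 * HVN_RNDIS_CMD_NORESP is set or the channel has been revoked, the
 * completion is awaited by polling hvn_nvs_intr1() until rc_done.
 */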
4536 static int
4537 hvn_rndis_cmd(struct hvn_softc *sc, struct rndis_cmd *rc, u_int flags)
4538 {
4539 	struct hvn_rx_ring *rxr = &sc->sc_rxr[0];	/* primary channel */
4540 	struct hvn_nvs_rndis *msg = &rc->rc_msg;
4541 	struct rndis_msghdr *hdr = rc->rc_req;
4542 	struct vmbus_gpa sgl[1];
4543 	int tries = 10;
4544 	int rv, s;
4545 
4546 	msg->nvs_type = HVN_NVS_TYPE_RNDIS;
4547 	msg->nvs_rndis_mtype = HVN_NVS_RNDIS_MTYPE_CTRL;
4548 	msg->nvs_chim_idx = HVN_NVS_CHIM_IDX_INVALID;
4549 
4550 	sgl[0].gpa_page = rc->rc_gpa;
4551 	sgl[0].gpa_len = hdr->rm_len;
4552 	sgl[0].gpa_ofs = 0;
4553 
4554 	rc->rc_done = 0;
4555 
4556 	mutex_enter(&rc->rc_lock);
4557 
4558 	hvn_submit_cmd(sc, rc);
4559 
4560 	do {
4561 		rv = vmbus_channel_send_sgl(rxr->rxr_chan, sgl, 1, &rc->rc_msg,
4562 		    sizeof(*msg), rc->rc_id);
4563 		if (rv == EAGAIN) {
4564 			DELAY(1000);
4565 		} else if (rv) {
4566 			mutex_exit(&rc->rc_lock);
4567 			DPRINTF("%s: RNDIS operation %u send error %d\n",
4568 			    device_xname(sc->sc_dev), hdr->rm_type, rv);
4569 			hvn_rollback_cmd(sc, rc);
4570 			return rv;
4571 		}
4572 	} while (rv != 0 && --tries > 0);
4573 
4574 	if (tries == 0 && rv != 0) {
4575 		mutex_exit(&rc->rc_lock);
4576 		device_printf(sc->sc_dev,
4577 		    "RNDIS operation %u send error %d\n", hdr->rm_type, rv);
4578 		hvn_rollback_cmd(sc, rc);
4579 		return rv;
4580 	}
4581 	if (vmbus_channel_is_revoked(rxr->rxr_chan) ||
4582 	    ISSET(flags, HVN_RNDIS_CMD_NORESP)) {
4583 		/* No response */
4584 		mutex_exit(&rc->rc_lock);
4585 		if (hvn_rollback_cmd(sc, rc))
4586 			hvn_release_cmd(sc, rc);
4587 		return 0;
4588 	}
4589 
4590 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4591 	    BUS_DMASYNC_POSTWRITE);
4592 
4593 	while (!rc->rc_done && !ISSET(sc->sc_flags, HVN_SCF_REVOKED)) {
4594 		mutex_exit(&rc->rc_lock);
4595 		DELAY(1000);
4596 		s = splnet();
4597 		hvn_nvs_intr1(rxr, 0, 0);
4598 		splx(s);
4599 		mutex_enter(&rc->rc_lock);
4600 	}
4601 	mutex_exit(&rc->rc_lock);
4602 
4603 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4604 	    BUS_DMASYNC_POSTREAD);
4605 
4606 	if (!rc->rc_done) {
4607 		rv = EINTR;
4608 		if (hvn_rollback_cmd(sc, rc)) {
4609 			hvn_release_cmd(sc, rc);
4610 			rv = 0;
4611 		}
4612 		return rv;
4613 	}
4614 
4615 	hvn_release_cmd(sc, rc);
4616 	return 0;
4617 }
4618 
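/*
 * Demultiplex one inbound RNDIS transaction: each GPA range in the
 * packet's range list addresses a message within the receive buffer,
 * which is either a data packet (handed to hvn_rxeof()), a control
 * completion or a status indication.  The transaction id is always
 * acked back to the host afterwards.
 */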
4619 static int
4620 hvn_rndis_input(struct hvn_rx_ring *rxr, uint64_t tid, void *arg)
4621 {
4622 	struct hvn_softc *sc = rxr->rxr_softc;
4623 	struct vmbus_chanpkt_prplist *cp = arg;
4624 	uint32_t off, len, type;
4625 	int i, rv, rx = 0;
4626 	bool qfull = false;
4627 
4628 	if (sc->sc_rx_ring == NULL) {
4629 		DPRINTF("%s: invalid rx ring\n", device_xname(sc->sc_dev));
4630 		return 0;
4631 	}
4632 
4633 	for (i = 0; i < cp->cp_range_cnt; i++) {
4634 		off = cp->cp_range[i].gpa_ofs;
4635 		len = cp->cp_range[i].gpa_len;
4636 
4637 		KASSERT(off + len <= sc->sc_rx_size);
4638 		KASSERT(len >= RNDIS_HEADER_OFFSET + 4);
4639 
4640 		memcpy(&type, sc->sc_rx_ring + off, sizeof(type));
4641 		switch (type) {
4642 		/* data message */
4643 		case REMOTE_NDIS_PACKET_MSG:
4644 			rv = hvn_rxeof(rxr, sc->sc_rx_ring + off, len);
4645 			if (rv == 1)
4646 				rx++;
4647 			else if (rv == -1)	/* The receive queue is full. */
4648 				qfull = true;
4649 			break;
4650 		/* completion messages */
4651 		case REMOTE_NDIS_INITIALIZE_CMPLT:
4652 		case REMOTE_NDIS_QUERY_CMPLT:
4653 		case REMOTE_NDIS_SET_CMPLT:
4654 		case REMOTE_NDIS_RESET_CMPLT:
4655 		case REMOTE_NDIS_KEEPALIVE_CMPLT:
4656 			hvn_rndis_complete(sc, sc->sc_rx_ring + off, len);
4657 			break;
4658 		/* notification message */
4659 		case REMOTE_NDIS_INDICATE_STATUS_MSG:
4660 			hvn_rndis_status(sc, sc->sc_rx_ring + off, len);
4661 			break;
4662 		default:
4663 			device_printf(sc->sc_dev,
4664 			    "unhandled RNDIS message type %u\n", type);
4665 			break;
4666 		}
4667 	}
4668 
4669 	hvn_nvs_ack(rxr, tid);
4670 
4671 	if (qfull)
4672 		return -1;
4673 	return rx;
4674 }
4675 
4676 static inline struct mbuf *
4677 hvn_devget(struct hvn_softc *sc, void *buf, uint32_t len)
4678 {
4679 	struct ifnet *ifp = SC2IFP(sc);
4680 	struct mbuf *m;
4681 	size_t size = len + ETHER_ALIGN + ETHER_VLAN_ENCAP_LEN;
4682 
4683 	MGETHDR(m, M_NOWAIT, MT_DATA);
4684 	if (m == NULL)
4685 		return NULL;
4686 
4687 	if (size > MHLEN) {
4688 		if (size <= MCLBYTES)
4689 			MCLGET(m, M_NOWAIT);
4690 		else
4691 			MEXTMALLOC(m, size, M_NOWAIT);
4692 		if ((m->m_flags & M_EXT) == 0) {
4693 			m_freem(m);
4694 			return NULL;
4695 		}
4696 	}
4697 
4698 	m->m_len = m->m_pkthdr.len = size;
4699 	m_adj(m, ETHER_ALIGN + ETHER_VLAN_ENCAP_LEN);
4700 	m_copyback(m, 0, len, buf);
4701 	m_set_rcvif(m, ifp);
4702 	return m;
4703 }
4704 
4705 #define HVN_RXINFO_CSUM		__BIT(NDIS_PKTINFO_TYPE_CSUM)
4706 #define HVN_RXINFO_VLAN		__BIT(NDIS_PKTINFO_TYPE_VLAN)
4707 #define HVN_RXINFO_HASHVAL	__BIT(HVN_NDIS_PKTINFO_TYPE_HASHVAL)
4708 #define HVN_RXINFO_HASHINFO	__BIT(HVN_NDIS_PKTINFO_TYPE_HASHINF)
4709 #define HVN_RXINFO_ALL		(HVN_RXINFO_CSUM | \
4710 				 HVN_RXINFO_VLAN | \
4711 				 HVN_RXINFO_HASHVAL | \
4712 				 HVN_RXINFO_HASHINFO)
4713 
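/*
 * Parse one RNDIS data packet: validate the data region, copy it into
 * an mbuf, then walk the per-packet info (pktinfo) records for
 * checksum, VLAN and RSS hash metadata before enqueueing the packet
 * onto the percpu input queue.
 */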
4714 static int
4715 hvn_rxeof(struct hvn_rx_ring *rxr, uint8_t *buf, uint32_t len)
4716 {
4717 	struct hvn_softc *sc = rxr->rxr_softc;
4718 	struct ifnet *ifp = SC2IFP(sc);
4719 	struct rndis_packet_msg *pkt;
4720 	struct rndis_pktinfo *pi;
4721 	struct mbuf *m;
4722 	uint32_t mask, csum, vlan, hashval, hashinfo;
4723 
4724 	if (!(ifp->if_flags & IFF_RUNNING))
4725 		return 0;
4726 
4727 	if (len < sizeof(*pkt)) {
4728 		device_printf(sc->sc_dev, "data packet too short: %u\n",
4729 		    len);
4730 		return 0;
4731 	}
4732 
4733 	pkt = (struct rndis_packet_msg *)buf;
4734 	if (pkt->rm_dataoffset + pkt->rm_datalen > len) {
4735 		device_printf(sc->sc_dev,
4736 		    "data packet out of bounds: %u@%u\n", pkt->rm_datalen,
4737 		    pkt->rm_dataoffset);
4738 		return 0;
4739 	}
4740 
4741 	if ((m = hvn_devget(sc, buf + RNDIS_HEADER_OFFSET + pkt->rm_dataoffset,
4742 	    pkt->rm_datalen)) == NULL) {
4743 		if_statinc(ifp, if_ierrors);
4744 		return 0;
4745 	}
4746 
4747 	if (pkt->rm_pktinfooffset + pkt->rm_pktinfolen > len) {
4748 		device_printf(sc->sc_dev,
4749 		    "pktinfo is out of bounds: %u@%u vs %u\n",
4750 		    pkt->rm_pktinfolen, pkt->rm_pktinfooffset, len);
4751 		goto done;
4752 	}
4753 
4754 	mask = csum = hashval = hashinfo = 0;
4755 	vlan = 0xffffffff;
4756 	pi = (struct rndis_pktinfo *)(buf + RNDIS_HEADER_OFFSET +
4757 	    pkt->rm_pktinfooffset);
4758 	while (pkt->rm_pktinfolen > 0) {
4759 		if (pi->rm_size > pkt->rm_pktinfolen) {
4760 			device_printf(sc->sc_dev,
4761 			    "invalid pktinfo size: %u/%u\n", pi->rm_size,
4762 			    pkt->rm_pktinfolen);
4763 			break;
4764 		}
4765 
4766 		switch (pi->rm_type) {
4767 		case NDIS_PKTINFO_TYPE_CSUM:
4768 			memcpy(&csum, pi->rm_data, sizeof(csum));
4769 			SET(mask, HVN_RXINFO_CSUM);
4770 			break;
4771 		case NDIS_PKTINFO_TYPE_VLAN:
4772 			memcpy(&vlan, pi->rm_data, sizeof(vlan));
4773 			SET(mask, HVN_RXINFO_VLAN);
4774 			break;
4775 		case HVN_NDIS_PKTINFO_TYPE_HASHVAL:
4776 			memcpy(&hashval, pi->rm_data, sizeof(hashval));
4777 			SET(mask, HVN_RXINFO_HASHVAL);
4778 			break;
4779 		case HVN_NDIS_PKTINFO_TYPE_HASHINF:
4780 			memcpy(&hashinfo, pi->rm_data, sizeof(hashinfo));
4781 			SET(mask, HVN_RXINFO_HASHINFO);
4782 			break;
4783 		default:
4784 			DPRINTF("%s: unhandled pktinfo type %u\n",
4785 			    device_xname(sc->sc_dev), pi->rm_type);
4786 			goto next;
4787 		}
4788 
4789 		if (mask == HVN_RXINFO_ALL) {
4790 			/* All found; done */
4791 			break;
4792 		}
4793  next:
4794 		pkt->rm_pktinfolen -= pi->rm_size;
4795 		pi = (struct rndis_pktinfo *)((char *)pi + pi->rm_size);
4796 	}
4797 
4798 	/*
4799 	 * Final fixup.
4800 	 * - If there is no hash value, invalidate the hash info.
4801 	 */
4802 	if (!ISSET(mask, HVN_RXINFO_HASHVAL))
4803 		hashinfo = 0;
4804 
4805 	if (csum != 0) {
4806 		if (ISSET(csum, NDIS_RXCSUM_INFO_IPCS_OK) &&
4807 			ISSET(ifp->if_csum_flags_rx, M_CSUM_IPv4)) {
4808 			SET(m->m_pkthdr.csum_flags, M_CSUM_IPv4);
4809 			rxr->rxr_evcsum_ip.ev_count++;
4810 		}
4811 		if (ISSET(csum, NDIS_RXCSUM_INFO_TCPCS_OK) &&
4812 			ISSET(ifp->if_csum_flags_rx, M_CSUM_TCPv4)) {
4813 			SET(m->m_pkthdr.csum_flags, M_CSUM_TCPv4);
4814 			rxr->rxr_evcsum_tcp.ev_count++;
4815 		}
4816 		if (ISSET(csum, NDIS_RXCSUM_INFO_UDPCS_OK) &&
4817 			ISSET(ifp->if_csum_flags_rx, M_CSUM_UDPv4)) {
4818 			SET(m->m_pkthdr.csum_flags, M_CSUM_UDPv4);
4819 			rxr->rxr_evcsum_udp.ev_count++;
4820 		}
4821 	}
4822 
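	/*
	 * Rebuild the 802.1Q TCI from the NDIS VLAN info: VID in
	 * bits 11-0, CFI in bit 12, PCP in bits 15-13.
	 */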
4823 	if (vlan != 0xffffffff) {
4824 		uint16_t t = NDIS_VLAN_INFO_ID(vlan);
4825 		t |= NDIS_VLAN_INFO_PRI(vlan) << EVL_PRIO_BITS;
4826 		t |= NDIS_VLAN_INFO_CFI(vlan) << EVL_CFI_BITS;
4827 
4828 		if (ISSET(sc->sc_ec.ec_capenable, ETHERCAP_VLAN_HWTAGGING)) {
4829 			vlan_set_tag(m, t);
4830 			rxr->rxr_evvlanhwtagging.ev_count++;
4831 		} else {
4832 			struct ether_header eh;
4833 			struct ether_vlan_header *evl;
4834 
4835 			KDASSERT(m->m_pkthdr.len >= sizeof(eh));
4836 			m_copydata(m, 0, sizeof(eh), &eh);
4837 			M_PREPEND(m, ETHER_VLAN_ENCAP_LEN, M_NOWAIT);
4838 			KDASSERT(m != NULL);
4839 
4840 			evl = mtod(m, struct ether_vlan_header *);
4841 			memcpy(evl->evl_dhost, eh.ether_dhost,
4842 			    ETHER_ADDR_LEN * 2);
4843 			evl->evl_encap_proto = htons(ETHERTYPE_VLAN);
4844 			evl->evl_tag = htons(t);
4845 			evl->evl_proto = eh.ether_type;
4846 		}
4847 	}
4848 
4849 	/* XXX RSS hash is not supported. */
4850 
4851  done:
4852 	rxr->rxr_evpkts.ev_count++;
4853 	if_percpuq_enqueue(sc->sc_ipq, m);
4854 	/* XXX Unable to detect that the receive queue is full. */
4855 	return 1;
4856 }
4857 
4858 static void
4859 hvn_rndis_complete(struct hvn_softc *sc, uint8_t *buf, uint32_t len)
4860 {
4861 	struct rndis_cmd *rc;
4862 	uint32_t id;
4863 
4864 	memcpy(&id, buf + RNDIS_HEADER_OFFSET, sizeof(id));
4865 	if ((rc = hvn_complete_cmd(sc, id)) != NULL) {
4866 		mutex_enter(&rc->rc_lock);
4867 		if (len < rc->rc_cmplen)
4868 			device_printf(sc->sc_dev,
4869 			    "RNDIS response %u too short: %u\n", id, len);
4870 		else
4871 			memcpy(&rc->rc_cmp, buf, rc->rc_cmplen);
4872 		if (len > rc->rc_cmplen &&
4873 		    len - rc->rc_cmplen > HVN_RNDIS_BUFSIZE)
4874 			device_printf(sc->sc_dev,
4875 			    "RNDIS response %u too large: %u\n", id, len);
4876 		else if (len > rc->rc_cmplen)
4877 			memcpy(&rc->rc_cmpbuf, buf + rc->rc_cmplen,
4878 			    len - rc->rc_cmplen);
4879 		rc->rc_done = 1;
4880 		cv_signal(&rc->rc_cv);
4881 		mutex_exit(&rc->rc_lock);
4882 	} else {
4883 		DPRINTF("%s: failed to complete RNDIS request id %u\n",
4884 		    device_xname(sc->sc_dev), id);
4885 	}
4886 }
4887 
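/*
 * Two transmit paths: hvn_rndis_output_sgl() hands the packet to the
 * host by GPA list, while hvn_rndis_output_chim() sends a packet that
 * was pre-copied into a chimney (send buffer) slot, identified by its
 * index and size.
 */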
4888 static int
4889 hvn_rndis_output_sgl(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
4890 {
4891 	struct hvn_softc *sc = txr->txr_softc;
4892 	uint64_t rid = (uint64_t)txd->txd_id << 32;
4893 	int rv;
4894 
4895 	rv = vmbus_channel_send_sgl(txr->txr_chan, txd->txd_sgl, txd->txd_nsge,
4896 	    &sc->sc_data_msg, sizeof(sc->sc_data_msg), rid);
4897 	if (rv) {
4898 		DPRINTF("%s: RNDIS data send error %d\n",
4899 		    device_xname(sc->sc_dev), rv);
4900 		return rv;
4901 	}
4902 	return 0;
4903 }
4904 
4905 static int
4906 hvn_rndis_output_chim(struct hvn_tx_ring *txr, struct hvn_tx_desc *txd)
4907 {
	struct hvn_softc *sc __unused = txr->txr_softc;
4908 	struct hvn_nvs_rndis rndis;
4909 	uint64_t rid = (uint64_t)txd->txd_id << 32;
4910 	int rv;
4911 
4912 	memset(&rndis, 0, sizeof(rndis));
4913 	rndis.nvs_type = HVN_NVS_TYPE_RNDIS;
4914 	rndis.nvs_rndis_mtype = HVN_NVS_RNDIS_MTYPE_DATA;
4915 	rndis.nvs_chim_idx = txd->txd_chim_index;
4916 	rndis.nvs_chim_sz = txd->txd_chim_size;
4917 
4918 	rv = vmbus_channel_send(txr->txr_chan, &rndis, sizeof(rndis),
4919 	    rid, VMBUS_CHANPKT_TYPE_INBAND, VMBUS_CHANPKT_FLAG_RC);
4920 	if (rv) {
4921 		DPRINTF("%s: RNDIS chimney data send error %d: idx %u, sz %u\n",
4922 		    device_xname(sc->sc_dev), rv, rndis.nvs_chim_idx,
4923 		    rndis.nvs_chim_sz);
4924 		return rv;
4925 	}
4926 	return 0;
4927 }
4928 
4929 static void
4930 hvn_rndis_status(struct hvn_softc *sc, uint8_t *buf, uint32_t len)
4931 {
4932 	uint32_t status;
4933 
4934 	memcpy(&status, buf + RNDIS_HEADER_OFFSET, sizeof(status));
4935 	switch (status) {
4936 	case RNDIS_STATUS_MEDIA_CONNECT:
4937 	case RNDIS_STATUS_MEDIA_DISCONNECT:
4938 		hvn_link_event(sc, HVN_LINK_EV_STATE_CHANGE);
4939 		break;
4940 	case RNDIS_STATUS_NETWORK_CHANGE:
4941 		hvn_link_event(sc, HVN_LINK_EV_NETWORK_CHANGE);
4942 		break;
4943 	/* Ignore these */
4944 	case RNDIS_STATUS_OFFLOAD_CURRENT_CONFIG:
4945 	case RNDIS_STATUS_LINK_SPEED_CHANGE:
4946 		return;
4947 	default:
4948 		DPRINTF("%s: unhandled status %#x\n", device_xname(sc->sc_dev),
4949 		    status);
4950 		return;
4951 	}
4952 }
4953 
4954 static int
4955 hvn_rndis_query(struct hvn_softc *sc, uint32_t oid, void *res, size_t *length)
4956 {
4957 
4958 	return hvn_rndis_query2(sc, oid, NULL, 0, res, length, 0);
4959 }
4960 
4961 static int
4962 hvn_rndis_query2(struct hvn_softc *sc, uint32_t oid, const void *idata,
4963     size_t idlen, void *odata, size_t *odlen, size_t min_odlen)
4964 {
4965 	struct rndis_cmd *rc;
4966 	struct rndis_query_req *req;
4967 	struct rndis_query_comp *cmp;
4968 	size_t olength = *odlen;
4969 	int rv;
4970 
4971 	rc = hvn_alloc_cmd(sc);
4972 
4973 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4974 	    BUS_DMASYNC_PREREAD);
4975 
4976 	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
4977 
4978 	req = rc->rc_req;
4979 	req->rm_type = REMOTE_NDIS_QUERY_MSG;
4980 	req->rm_len = sizeof(*req) + idlen;
4981 	req->rm_rid = rc->rc_id;
4982 	req->rm_oid = oid;
4983 	req->rm_infobufoffset = sizeof(*req) - RNDIS_HEADER_OFFSET;
4984 	if (idlen > 0) {
4985 		KASSERT(sizeof(*req) + idlen <= PAGE_SIZE);
4986 		req->rm_infobuflen = idlen;
4987 		memcpy(req + 1, idata, idlen);
4988 	}
4989 
4990 	rc->rc_cmplen = sizeof(*cmp);
4991 
4992 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
4993 	    BUS_DMASYNC_PREWRITE);
4994 
4995 	if ((rv = hvn_rndis_cmd(sc, rc, 0)) != 0) {
4996 		DPRINTF("%s: QUERY_MSG failed, error %d\n",
4997 		    device_xname(sc->sc_dev), rv);
4998 		hvn_free_cmd(sc, rc);
4999 		return rv;
5000 	}
5001 
5002 	cmp = (struct rndis_query_comp *)&rc->rc_cmp;
5003 	switch (cmp->rm_status) {
5004 	case RNDIS_STATUS_SUCCESS:
5005 		if (cmp->rm_infobuflen > olength ||
5006 		    (min_odlen > 0 && cmp->rm_infobuflen < min_odlen)) {
5007 			rv = EINVAL;
5008 			break;
5009 		}
5010 		memcpy(odata, rc->rc_cmpbuf, cmp->rm_infobuflen);
5011 		*odlen = cmp->rm_infobuflen;
5012 		break;
5013 	default:
5014 		*odlen = 0;
5015 		rv = EIO;
5016 		break;
5017 	}
5018 
5019 	hvn_free_cmd(sc, rc);
5020 	return rv;
5021 }
5022 
5023 static int
5024 hvn_rndis_set(struct hvn_softc *sc, uint32_t oid, void *data, size_t length)
5025 {
5026 	struct rndis_cmd *rc;
5027 	struct rndis_set_req *req;
5028 	struct rndis_set_comp *cmp;
5029 	int rv;
5030 
5031 	rc = hvn_alloc_cmd(sc);
5032 
5033 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5034 	    BUS_DMASYNC_PREREAD);
5035 
5036 	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
5037 
5038 	req = rc->rc_req;
5039 	req->rm_type = REMOTE_NDIS_SET_MSG;
5040 	req->rm_len = sizeof(*req) + length;
5041 	req->rm_rid = rc->rc_id;
5042 	req->rm_oid = oid;
5043 	req->rm_infobufoffset = sizeof(*req) - RNDIS_HEADER_OFFSET;
5044 
5045 	rc->rc_cmplen = sizeof(*cmp);
5046 
5047 	if (length > 0) {
5048 		KASSERT(sizeof(*req) + length < PAGE_SIZE);
5049 		req->rm_infobuflen = length;
5050 		memcpy(req + 1, data, length);
5051 	}
5052 
5053 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5054 	    BUS_DMASYNC_PREWRITE);
5055 
5056 	if ((rv = hvn_rndis_cmd(sc, rc, 0)) != 0) {
5057 		DPRINTF("%s: SET_MSG failed, error %d\n",
5058 		    device_xname(sc->sc_dev), rv);
5059 		hvn_free_cmd(sc, rc);
5060 		return rv;
5061 	}
5062 
5063 	cmp = (struct rndis_set_comp *)&rc->rc_cmp;
5064 	if (cmp->rm_status != RNDIS_STATUS_SUCCESS)
5065 		rv = EIO;
5066 
5067 	hvn_free_cmd(sc, rc);
5068 	return rv;
5069 }
5070 
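/*
 * Program the RNDIS packet filter from the interface state.  Individual
 * multicast entries are not programmed (see the TODO below); any
 * multicast membership degrades to ALL_MULTICAST.
 */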
5071 static int
5072 hvn_rndis_open(struct hvn_softc *sc)
5073 {
5074 	struct ifnet *ifp = SC2IFP(sc);
5075 	uint32_t filter;
5076 	int rv;
5077 
5078 	if (ifp->if_flags & IFF_PROMISC) {
5079 		filter = RNDIS_PACKET_TYPE_PROMISCUOUS;
5080 	} else {
5081 		filter = RNDIS_PACKET_TYPE_DIRECTED;
5082 		if (ifp->if_flags & IFF_BROADCAST)
5083 			filter |= RNDIS_PACKET_TYPE_BROADCAST;
5084 		if (ifp->if_flags & IFF_ALLMULTI)
5085 			filter |= RNDIS_PACKET_TYPE_ALL_MULTICAST;
5086 		else {
5087 			struct ethercom *ec = &sc->sc_ec;
5088 			struct ether_multi *enm;
5089 			struct ether_multistep step;
5090 
5091 			ETHER_LOCK(ec);
5092 			ETHER_FIRST_MULTI(step, ec, enm);
5093 			/* TODO: support multicast list */
5094 			if (enm != NULL)
5095 				filter |= RNDIS_PACKET_TYPE_ALL_MULTICAST;
5096 			ETHER_UNLOCK(ec);
5097 		}
5098 	}
5099 
5100 	rv = hvn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
5101 	    &filter, sizeof(filter));
5102 	if (rv) {
5103 		DPRINTF("%s: failed to set RNDIS filter to %#x\n",
5104 		    device_xname(sc->sc_dev), filter);
5105 	}
5106 	return rv;
5107 }
5108 
5109 static int
5110 hvn_rndis_close(struct hvn_softc *sc)
5111 {
5112 	uint32_t filter = 0;
5113 	int rv;
5114 
5115 	rv = hvn_rndis_set(sc, OID_GEN_CURRENT_PACKET_FILTER,
5116 	    &filter, sizeof(filter));
5117 	if (rv) {
5118 		DPRINTF("%s: failed to clear RNDIS filter\n",
5119 		    device_xname(sc->sc_dev));
5120 	}
5121 	return rv;
5122 }
5123 
5124 static void
5125 hvn_rndis_detach(struct hvn_softc *sc)
5126 {
5127 	struct rndis_cmd *rc;
5128 	struct rndis_halt_req *req;
5129 	int rv;
5130 
5131 	rc = hvn_alloc_cmd(sc);
5132 
5133 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5134 	    BUS_DMASYNC_PREREAD);
5135 
5136 	rc->rc_id = atomic_inc_uint_nv(&sc->sc_rndisrid);
5137 
5138 	req = rc->rc_req;
5139 	req->rm_type = REMOTE_NDIS_HALT_MSG;
5140 	req->rm_len = sizeof(*req);
5141 	req->rm_rid = rc->rc_id;
5142 
5143 	bus_dmamap_sync(sc->sc_dmat, rc->rc_dmap, 0, PAGE_SIZE,
5144 	    BUS_DMASYNC_PREWRITE);
5145 
5146 	/* No RNDIS completion; rely on NVS message send completion */
5147 	if ((rv = hvn_rndis_cmd(sc, rc, HVN_RNDIS_CMD_NORESP)) != 0) {
5148 		DPRINTF("%s: HALT_MSG failed, error %d\n",
5149 		    device_xname(sc->sc_dev), rv);
5150 	}
5151 	hvn_free_cmd(sc, rc);
5152 }
5153 
5154 static void
5155 hvn_init_sysctls(struct hvn_softc *sc)
5156 {
5157 	struct sysctllog **log;
5158 	const struct sysctlnode *rnode, *cnode, *rxnode, *txnode;
5159 	const char *dvname;
5160 	int error;
5161 
5162 	log = &sc->sc_sysctllog;
5163 	dvname = device_xname(sc->sc_dev);
5164 
5165 	error = sysctl_createv(log, 0, NULL, &rnode,
5166 	    0, CTLTYPE_NODE, dvname,
5167 	    SYSCTL_DESCR("hvn information and settings"),
5168 	    NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL);
5169 	if (error)
5170 		goto err;
5171 
5172 	error = sysctl_createv(log, 0, &rnode, &cnode,
5173 	    CTLFLAG_READWRITE, CTLTYPE_BOOL, "txrx_workqueue",
5174 	    SYSCTL_DESCR("Use workqueue for packet processing"),
5175 	    NULL, 0, &sc->sc_txrx_workqueue, 0, CTL_CREATE, CTL_EOL);
5176 	if (error)
5177 		goto out;
5178 
5179 	error = sysctl_createv(log, 0, &rnode, &rxnode,
5180 	    0, CTLTYPE_NODE, "rx",
5181 	    SYSCTL_DESCR("hvn information and settings for Rx"),
5182 	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
5183 	if (error)
5184 		goto out;
5185 
5186 	error = sysctl_createv(log, 0, &rxnode, NULL,
5187 	    CTLFLAG_READWRITE, CTLTYPE_INT, "intr_process_limit",
5188 	    SYSCTL_DESCR("max number of Rx packets"
5189 	      " to process for interrupt processing"),
5190 	    NULL, 0, &sc->sc_rx_intr_process_limit, 0, CTL_CREATE, CTL_EOL);
5191 	if (error)
5192 		goto out;
5193 
5194 	error = sysctl_createv(log, 0, &rxnode, NULL,
5195 	    CTLFLAG_READWRITE, CTLTYPE_INT, "process_limit",
5196 	    SYSCTL_DESCR("max number of Rx packets"
5197 	      " to process for deferred processing"),
5198 	    NULL, 0, &sc->sc_rx_process_limit, 0, CTL_CREATE, CTL_EOL);
5199 	if (error)
5200 		goto out;
5201 
5202 	error = sysctl_createv(log, 0, &rnode, &txnode,
5203 	    0, CTLTYPE_NODE, "tx",
5204 	    SYSCTL_DESCR("hvn information and settings for Tx"),
5205 	    NULL, 0, NULL, 0, CTL_CREATE, CTL_EOL);
5206 	if (error)
5207 		goto out;
5208 
5209 	error = sysctl_createv(log, 0, &txnode, NULL,
5210 	    CTLFLAG_READWRITE, CTLTYPE_INT, "intr_process_limit",
5211 	    SYSCTL_DESCR("max number of Tx packets"
5212 	      " to process for interrupt processing"),
5213 	    NULL, 0, &sc->sc_tx_intr_process_limit, 0, CTL_CREATE, CTL_EOL);
5214 	if (error)
5215 		goto out;
5216 
5217 	error = sysctl_createv(log, 0, &txnode, NULL,
5218 	    CTLFLAG_READWRITE, CTLTYPE_INT, "process_limit",
5219 	    SYSCTL_DESCR("max number of Tx packets"
5220 	      " to process for deferred processing"),
5221 	    NULL, 0, &sc->sc_tx_process_limit, 0, CTL_CREATE, CTL_EOL);
5222 	if (error)
5223 		goto out;
5224 
5225 	return;
5226 
5227 out:
5228 	sysctl_teardown(log);
5229 	sc->sc_sysctllog = NULL;
5230 err:
5231 	aprint_error_dev(sc->sc_dev, "sysctl_createv failed (err = %d)\n",
5232 	    error);
5233 }
5234 
5235 SYSCTL_SETUP(sysctl_hw_hvn_setup, "sysctl hw.hvn setup")
5236 {
5237 	const struct sysctlnode *rnode;
5238 	const struct sysctlnode *cnode;
5239 	int error;
5240 
5241 	error = sysctl_createv(clog, 0, NULL, &rnode,
5242 	    CTLFLAG_PERMANENT, CTLTYPE_NODE, "hvn",
5243 	    SYSCTL_DESCR("hvn global controls"),
5244 	    NULL, 0, NULL, 0, CTL_HW, CTL_CREATE, CTL_EOL);
5245 	if (error)
5246 		goto fail;
5247 
5248 	error = sysctl_createv(clog, 0, &rnode, &cnode,
5249 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5250 	    "udp_csum_fixup_mtu",
5251 	    SYSCTL_DESCR("UDP checksum offloading fixup MTU"),
5252 	    NULL, 0, &hvn_udpcs_fixup_mtu, sizeof(hvn_udpcs_fixup_mtu),
5253 	    CTL_CREATE, CTL_EOL);
5254 	if (error)
5255 		goto fail;
5256 
5257 	error = sysctl_createv(clog, 0, &rnode, &cnode,
5258 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5259 	    "chimney_size",
5260 	    SYSCTL_DESCR("Chimney send packet size limit"),
5261 	    NULL, 0, &hvn_tx_chimney_size, sizeof(hvn_tx_chimney_size),
5262 	    CTL_CREATE, CTL_EOL);
5263 	if (error)
5264 		goto fail;
5265 
5266 	error = sysctl_createv(clog, 0, &rnode, &cnode,
5267 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5268 	    "channel_count",
5269 	    SYSCTL_DESCR("# of channels to use"),
5270 	    NULL, 0, &hvn_channel_cnt, sizeof(hvn_channel_cnt),
5271 	    CTL_CREATE, CTL_EOL);
5272 	if (error)
5273 		goto fail;
5274 
5275 	error = sysctl_createv(clog, 0, &rnode, &cnode,
5276 	    CTLFLAG_PERMANENT|CTLFLAG_READWRITE, CTLTYPE_INT,
5277 	    "tx_ring_count",
5278 	    SYSCTL_DESCR("# of transmit rings to use"),
5279 	    NULL, 0, &hvn_tx_ring_cnt, sizeof(hvn_tx_ring_cnt),
5280 	    CTL_CREATE, CTL_EOL);
5281 	if (error)
5282 		goto fail;
5283 
5284 	return;
5285 
5286 fail:
5287 	aprint_error("%s: sysctl_createv failed (err = %d)\n", __func__, error);
5288 }
5289