/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016-2018 Microsoft Corporation
 * Copyright(c) 2013-2016 Brocade Communications Systems, Inc.
 * All rights reserved.
 */

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <dirent.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <sys/ioctl.h>

#include <rte_ethdev.h>
#include <rte_memcpy.h>
#include <rte_string_fns.h>
#include <rte_memzone.h>
#include <rte_devargs.h>
#include <rte_malloc.h>
#include <rte_kvargs.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <ethdev_driver.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <dev_driver.h>
#include <bus_driver.h>
#include <bus_vmbus_driver.h>
#include <rte_alarm.h>

#include "hn_logs.h"
#include "hn_var.h"
#include "hn_rndis.h"
#include "hn_nvs.h"
#include "ndis.h"

#define HN_TX_OFFLOAD_CAPS (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \
			    RTE_ETH_TX_OFFLOAD_TCP_CKSUM  | \
			    RTE_ETH_TX_OFFLOAD_UDP_CKSUM  | \
			    RTE_ETH_TX_OFFLOAD_TCP_TSO    | \
			    RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \
			    RTE_ETH_TX_OFFLOAD_VLAN_INSERT)

#define HN_RX_OFFLOAD_CAPS (RTE_ETH_RX_OFFLOAD_CHECKSUM | \
			    RTE_ETH_RX_OFFLOAD_VLAN_STRIP | \
			    RTE_ETH_RX_OFFLOAD_RSS_HASH)

#define NETVSC_ARG_LATENCY "latency"
#define NETVSC_ARG_RXBREAK "rx_copybreak"
#define NETVSC_ARG_TXBREAK "tx_copybreak"
#define NETVSC_ARG_RX_EXTMBUF_ENABLE "rx_extmbuf_enable"
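
/* Runtime device arguments, parsed in hn_set_parameter() below:
 *   latency           - vmbus channel latency, in microseconds
 *                       (stored internally in nanoseconds)
 *   rx_copybreak      - Rx copy-break threshold, in bytes
 *   tx_copybreak      - Tx copy-break threshold, in bytes
 *   rx_extmbuf_enable - nonzero enables external mbufs on receive
 * An illustrative devargs string (example values only) would be:
 *   "latency=50,rx_copybreak=256,tx_copybreak=512,rx_extmbuf_enable=0"
 */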

/* The max number of retries when hot adding a VF device */
#define NETVSC_MAX_HOTADD_RETRY 10

struct hn_xstats_name_off {
	char name[RTE_ETH_XSTATS_NAME_SIZE];
	unsigned int offset;
};

static const struct hn_xstats_name_off hn_stat_strings[] = {
	{ "good_packets",           offsetof(struct hn_stats, packets) },
	{ "good_bytes",             offsetof(struct hn_stats, bytes) },
	{ "errors",                 offsetof(struct hn_stats, errors) },
	{ "ring full",              offsetof(struct hn_stats, ring_full) },
	{ "channel full",           offsetof(struct hn_stats, channel_full) },
	{ "multicast_packets",      offsetof(struct hn_stats, multicast) },
	{ "broadcast_packets",      offsetof(struct hn_stats, broadcast) },
	{ "undersize_packets",      offsetof(struct hn_stats, size_bins[0]) },
	{ "size_64_packets",        offsetof(struct hn_stats, size_bins[1]) },
	{ "size_65_127_packets",    offsetof(struct hn_stats, size_bins[2]) },
	{ "size_128_255_packets",   offsetof(struct hn_stats, size_bins[3]) },
	{ "size_256_511_packets",   offsetof(struct hn_stats, size_bins[4]) },
	{ "size_512_1023_packets",  offsetof(struct hn_stats, size_bins[5]) },
	{ "size_1024_1518_packets", offsetof(struct hn_stats, size_bins[6]) },
	{ "size_1519_max_packets",  offsetof(struct hn_stats, size_bins[7]) },
};

/* The default RSS key.
 * This value is the same as MLX5 so that flows will be
 * received on the same path for both VF and synthetic NIC.
 */
static const uint8_t rss_default_key[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x2c, 0xc6, 0x81, 0xd1, 0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19, 0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9, 0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7, 0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3, 0xfc, 0x1f, 0xdc, 0x2a,
};

static struct rte_eth_dev *
eth_dev_vmbus_allocate(struct rte_vmbus_device *dev, size_t private_data_size)
{
	struct rte_eth_dev *eth_dev;
	const char *name;

	if (!dev)
		return NULL;

	name = dev->device.name;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		eth_dev = rte_eth_dev_allocate(name);
		if (!eth_dev) {
			PMD_DRV_LOG(NOTICE, "can not allocate rte ethdev");
			return NULL;
		}

		if (private_data_size) {
			eth_dev->data->dev_private =
				rte_zmalloc_socket(name, private_data_size,
						   RTE_CACHE_LINE_SIZE, dev->device.numa_node);
			if (!eth_dev->data->dev_private) {
				PMD_DRV_LOG(NOTICE, "can not allocate driver data");
				rte_eth_dev_release_port(eth_dev);
				return NULL;
			}
		}
	} else {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (!eth_dev) {
			PMD_DRV_LOG(NOTICE, "can not attach secondary");
			return NULL;
		}
	}

	eth_dev->device = &dev->device;

	/* interrupt is simulated */
	rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_EXT);
	eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC;
	eth_dev->intr_handle = dev->intr_handle;

	return eth_dev;
}

static void
eth_dev_vmbus_release(struct rte_eth_dev *eth_dev)
{
	/* free ether device */
	rte_eth_dev_release_port(eth_dev);

	eth_dev->device = NULL;
	eth_dev->intr_handle = NULL;
}

static int hn_set_parameter(const char *key, const char *value, void *opaque)
{
	struct hn_data *hv = opaque;
	char *endp = NULL;
	unsigned long v;

	v = strtoul(value, &endp, 0);
	if (*value == '\0' || *endp != '\0') {
		PMD_DRV_LOG(ERR, "invalid parameter %s=%s", key, value);
		return -EINVAL;
	}

	if (!strcmp(key, NETVSC_ARG_LATENCY)) {
		/* usec to nsec */
		hv->latency = v * 1000;
		PMD_DRV_LOG(DEBUG, "set latency %lu usec", v);
	} else if (!strcmp(key, NETVSC_ARG_RXBREAK)) {
		hv->rx_copybreak = v;
		PMD_DRV_LOG(DEBUG, "rx copy break set to %u",
			    hv->rx_copybreak);
	} else if (!strcmp(key, NETVSC_ARG_TXBREAK)) {
		hv->tx_copybreak = v;
		PMD_DRV_LOG(DEBUG, "tx copy break set to %u",
			    hv->tx_copybreak);
	} else if (!strcmp(key, NETVSC_ARG_RX_EXTMBUF_ENABLE)) {
		hv->rx_extmbuf_enable = v;
		PMD_DRV_LOG(DEBUG, "rx extmbuf enable set to %u",
			    hv->rx_extmbuf_enable);
	}

	return 0;
}

/* Parse device arguments */
static int hn_parse_args(const struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	struct rte_devargs *devargs = dev->device->devargs;
	static const char * const valid_keys[] = {
		NETVSC_ARG_LATENCY,
		NETVSC_ARG_RXBREAK,
		NETVSC_ARG_TXBREAK,
		NETVSC_ARG_RX_EXTMBUF_ENABLE,
		NULL
	};
	struct rte_kvargs *kvlist;
	int ret;

	if (!devargs)
		return 0;

	PMD_INIT_LOG(DEBUG, "device args %s %s",
		     devargs->name, devargs->args);

	kvlist = rte_kvargs_parse(devargs->args, valid_keys);
	if (!kvlist) {
		PMD_DRV_LOG(ERR, "invalid parameters");
		return -EINVAL;
	}

	ret = rte_kvargs_process(kvlist, NULL, hn_set_parameter, hv);
	rte_kvargs_free(kvlist);

	return ret;
}

/* Update link status.
 * Note: the DPDK definition of "wait_to_complete"
 *   means block this call until link is up,
 *   which is not worth supporting.
 */
int
hn_dev_link_update(struct rte_eth_dev *dev,
		   int wait_to_complete __rte_unused)
{
	struct hn_data *hv = dev->data->dev_private;
	struct rte_eth_link link, old;
	int error;

	old = dev->data->dev_link;

	error = hn_rndis_get_linkstatus(hv);
	if (error)
		return error;

	hn_rndis_get_linkspeed(hv);

	link = (struct rte_eth_link) {
		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
		.link_autoneg = RTE_ETH_LINK_FIXED,
		.link_speed = hv->link_speed / 10000,
	};

	if (hv->link_status == NDIS_MEDIA_STATE_CONNECTED)
		link.link_status = RTE_ETH_LINK_UP;
	else
		link.link_status = RTE_ETH_LINK_DOWN;

	if (old.link_status == link.link_status)
		return 0;

	PMD_INIT_LOG(DEBUG, "Port %d is %s", dev->data->port_id,
		     (link.link_status == RTE_ETH_LINK_UP) ? "up" : "down");

	return rte_eth_linkstatus_set(dev, &link);
}

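/* Report device capabilities: a fixed 10G speed capability, Toeplitz
 * hash parameters and queue limits. In the primary process the Rx/Tx
 * offload capabilities are also queried from the host via RNDIS and
 * then merged with those of any underlying VF device.
 */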
static int hn_dev_info_get(struct rte_eth_dev *dev,
			   struct rte_eth_dev_info *dev_info)
{
	struct hn_data *hv = dev->data->dev_private;
	int rc;

	dev_info->speed_capa = RTE_ETH_LINK_SPEED_10G;
	dev_info->min_rx_bufsize = HN_MIN_RX_BUF_SIZE;
	dev_info->max_rx_pktlen  = HN_MAX_XFER_LEN;
	dev_info->max_mac_addrs  = 1;

	dev_info->hash_key_size = NDIS_HASH_KEYSIZE_TOEPLITZ;
	dev_info->flow_type_rss_offloads = hv->rss_offloads;
	dev_info->reta_size = RTE_ETH_RSS_RETA_SIZE_128;

	dev_info->max_rx_queues = hv->max_queues;
	dev_info->max_tx_queues = hv->max_queues;

	dev_info->tx_desc_lim.nb_min = 1;
	dev_info->tx_desc_lim.nb_max = 4096;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* fill in Rx and Tx offload capabilities */
	rc = hn_rndis_get_offload(hv, dev_info);
	if (rc != 0)
		return rc;

	/* merge in the offload capabilities and queue limits of the VF */
	return hn_vf_info_get(hv, dev_info);
}

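/* Update the RSS redirection table. The table must contain exactly
 * NDIS_HASH_INDCNT entries and may only reference Rx queues that will
 * actually be polled. RSS is briefly disabled and then reconfigured
 * via RNDIS so the host picks up the new table; the update is also
 * forwarded to any VF device.
 */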
static int hn_rss_reta_update(struct rte_eth_dev *dev,
			      struct rte_eth_rss_reta_entry64 *reta_conf,
			      uint16_t reta_size)
{
	struct hn_data *hv = dev->data->dev_private;
	unsigned int i;
	int err;

	PMD_INIT_FUNC_TRACE();

	if (reta_size != NDIS_HASH_INDCNT) {
		PMD_DRV_LOG(ERR, "Hash lookup table size does not match NDIS");
		return -EINVAL;
	}

	for (i = 0; i < NDIS_HASH_INDCNT; i++) {
		uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
		uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
		uint64_t mask = (uint64_t)1 << shift;

		if (reta_conf[idx].mask & mask)
			hv->rss_ind[i] = reta_conf[idx].reta[shift];

		/*
		 * Ensure we don't allow config that directs traffic to an Rx
		 * queue that we aren't going to poll
		 */
		if (hv->rss_ind[i] >= dev->data->nb_rx_queues) {
			PMD_DRV_LOG(ERR, "RSS distributing traffic to invalid Rx queue");
			return -EINVAL;
		}
	}

	err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "rss disable failed");
		return err;
	}

	err = hn_rndis_conf_rss(hv, 0);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "reta reconfig failed");
		return err;
	}

	return hn_vf_reta_hash_update(dev, reta_conf, reta_size);
}

static int hn_rss_reta_query(struct rte_eth_dev *dev,
			     struct rte_eth_rss_reta_entry64 *reta_conf,
			     uint16_t reta_size)
{
	struct hn_data *hv = dev->data->dev_private;
	unsigned int i;

	PMD_INIT_FUNC_TRACE();

	if (reta_size != NDIS_HASH_INDCNT) {
		PMD_DRV_LOG(ERR, "Hash lookup table size does not match NDIS");
		return -EINVAL;
	}

	for (i = 0; i < NDIS_HASH_INDCNT; i++) {
		uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
		uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
		uint64_t mask = (uint64_t)1 << shift;

		if (reta_conf[idx].mask & mask)
			reta_conf[idx].reta[shift] = hv->rss_ind[i];
	}
	return 0;
}

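/* Translate the DPDK rss_hf bits into the NDIS Toeplitz hash fields
 * and install the hash key. When the application does not supply a
 * key, the MLX5-compatible default key above is used.
 */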
static void hn_rss_hash_init(struct hn_data *hv,
			     const struct rte_eth_rss_conf *rss_conf)
{
	/* Convert from DPDK RSS hash flags to NDIS hash flags */
	hv->rss_hash = NDIS_HASH_FUNCTION_TOEPLITZ;

	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV4)
		hv->rss_hash |= NDIS_HASH_IPV4;
	if (rss_conf->rss_hf & RTE_ETH_RSS_NONFRAG_IPV4_TCP)
		hv->rss_hash |= NDIS_HASH_TCP_IPV4;
	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV6)
		hv->rss_hash |= NDIS_HASH_IPV6;
	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV6_EX)
		hv->rss_hash |= NDIS_HASH_IPV6_EX;
	if (rss_conf->rss_hf & RTE_ETH_RSS_NONFRAG_IPV6_TCP)
		hv->rss_hash |= NDIS_HASH_TCP_IPV6;
	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV6_TCP_EX)
		hv->rss_hash |= NDIS_HASH_TCP_IPV6_EX;

	memcpy(hv->rss_key, rss_conf->rss_key ? : rss_default_key,
	       NDIS_HASH_KEYSIZE_TOEPLITZ);
}

static int hn_rss_hash_update(struct rte_eth_dev *dev,
			      struct rte_eth_rss_conf *rss_conf)
{
	struct hn_data *hv = dev->data->dev_private;
	int err;

	PMD_INIT_FUNC_TRACE();

	err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "rss disable failed");
		return err;
	}

	hn_rss_hash_init(hv, rss_conf);

	if (rss_conf->rss_hf != 0) {
		err = hn_rndis_conf_rss(hv, 0);
		if (err) {
			PMD_DRV_LOG(NOTICE,
				    "rss reconfig failed (RSS disabled)");
			return err;
		}
	}

	return hn_vf_rss_hash_update(dev, rss_conf);
}

static int hn_rss_hash_conf_get(struct rte_eth_dev *dev,
				struct rte_eth_rss_conf *rss_conf)
{
	struct hn_data *hv = dev->data->dev_private;

	PMD_INIT_FUNC_TRACE();

	if (hv->ndis_ver < NDIS_VERSION_6_20) {
		PMD_DRV_LOG(DEBUG, "RSS not supported on this host");
		return -EOPNOTSUPP;
	}

	rss_conf->rss_key_len = NDIS_HASH_KEYSIZE_TOEPLITZ;
	if (rss_conf->rss_key)
		memcpy(rss_conf->rss_key, hv->rss_key,
		       NDIS_HASH_KEYSIZE_TOEPLITZ);

	rss_conf->rss_hf = 0;
	if (hv->rss_hash & NDIS_HASH_IPV4)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV4;

	if (hv->rss_hash & NDIS_HASH_TCP_IPV4)
		rss_conf->rss_hf |= RTE_ETH_RSS_NONFRAG_IPV4_TCP;

	if (hv->rss_hash & NDIS_HASH_IPV6)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV6;

	if (hv->rss_hash & NDIS_HASH_IPV6_EX)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV6_EX;

	if (hv->rss_hash & NDIS_HASH_TCP_IPV6)
		rss_conf->rss_hf |= RTE_ETH_RSS_NONFRAG_IPV6_TCP;

	if (hv->rss_hash & NDIS_HASH_TCP_IPV6_EX)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV6_TCP_EX;

	return 0;
}

static int
hn_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;

	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_PROMISCUOUS);
	return hn_vf_promiscuous_enable(dev);
}

static int
hn_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	uint32_t filter;

	filter = NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST;
	if (dev->data->all_multicast)
		filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	hn_rndis_set_rxfilter(hv, filter);
	return hn_vf_promiscuous_disable(dev);
}

static int
hn_dev_allmulticast_enable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;

	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_DIRECTED |
			      NDIS_PACKET_TYPE_ALL_MULTICAST |
			      NDIS_PACKET_TYPE_BROADCAST);
	return hn_vf_allmulticast_enable(dev);
}

static int
hn_dev_allmulticast_disable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;

	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_DIRECTED |
			      NDIS_PACKET_TYPE_BROADCAST);
	return hn_vf_allmulticast_disable(dev);
}

static int
hn_dev_mc_addr_list(struct rte_eth_dev *dev,
		     struct rte_ether_addr *mc_addr_set,
		     uint32_t nb_mc_addr)
{
	/* No filtering on the synthetic path, but can do it on VF */
	return hn_vf_mc_addr_list(dev, mc_addr_set, nb_mc_addr);
}

/* Setup shared rx/tx queue data */
static int hn_subchan_configure(struct hn_data *hv,
				uint32_t subchan)
{
	struct vmbus_channel *primary = hn_primary_chan(hv);
	int err;
	unsigned int retry = 0;

	PMD_DRV_LOG(DEBUG,
		    "open %u subchannels", subchan);

	/* Send create sub channels command */
	err = hn_nvs_alloc_subchans(hv, &subchan);
	if (err)
		return err;

	while (subchan > 0) {
		struct vmbus_channel *new_sc;
		uint16_t chn_index;

		err = rte_vmbus_subchan_open(primary, &new_sc);
		if (err == -ENOENT && ++retry < 1000) {
			/* This can happen if not ready yet */
			rte_delay_ms(10);
			continue;
		}

		if (err) {
			PMD_DRV_LOG(ERR,
				    "open subchannel failed: %d", err);
			return err;
		}

		rte_vmbus_set_latency(hv->vmbus, new_sc, hv->latency);

		retry = 0;
		chn_index = rte_vmbus_sub_channel_index(new_sc);
		if (chn_index == 0 || chn_index > hv->max_queues) {
			PMD_DRV_LOG(ERR,
				    "Invalid subchannel offermsg channel %u",
				    chn_index);
			return -EIO;
		}

		PMD_DRV_LOG(DEBUG, "new sub channel %u", chn_index);
		hv->channels[chn_index] = new_sc;
		--subchan;
	}

	return err;
}

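/* Alarm callback that decides whether a newly hot-added PCI device is
 * the VF belonging to this synthetic NIC. It scans
 * /sys/bus/pci/devices/<addr>/net for a network interface, reads its
 * MAC address with SIOCGIFHWADDR, and hot-plugs the device if the MAC
 * matches ours. If sysfs is not populated yet, it re-arms itself once
 * per second, up to NETVSC_MAX_HOTADD_RETRY times.
 */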
static void netvsc_hotplug_retry(void *args)
{
	int ret;
	struct hv_hotadd_context *hot_ctx = args;
	struct hn_data *hv = hot_ctx->hv;
	struct rte_eth_dev *dev = &rte_eth_devices[hv->port_id];
	struct rte_devargs *d = &hot_ctx->da;
	char buf[256];

	DIR *di;
	struct dirent *dir;
	struct ifreq req;
	struct rte_ether_addr eth_addr;
	int s;

	PMD_DRV_LOG(DEBUG, "%s: retry count %d",
		    __func__, hot_ctx->eal_hot_plug_retry);

	if (hot_ctx->eal_hot_plug_retry++ > NETVSC_MAX_HOTADD_RETRY) {
		PMD_DRV_LOG(NOTICE, "Failed to parse PCI device retry=%d",
			    hot_ctx->eal_hot_plug_retry);
		goto free_hotadd_ctx;
	}

	snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s/net", d->name);
	di = opendir(buf);
	if (!di) {
		PMD_DRV_LOG(DEBUG, "%s: can't open directory %s, "
			    "retrying in 1 second", __func__, buf);
		goto retry;
	}

	while ((dir = readdir(di))) {
		/* Skip . and .. directories */
		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
			continue;

		/* Try to get the MAC address if this is a network device */
		s = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
		if (s == -1) {
			PMD_DRV_LOG(ERR, "Failed to create socket errno %d",
				    errno);
			break;
		}
		strlcpy(req.ifr_name, dir->d_name, sizeof(req.ifr_name));
		ret = ioctl(s, SIOCGIFHWADDR, &req);
		close(s);
		if (ret == -1) {
			PMD_DRV_LOG(ERR,
				    "Failed to send SIOCGIFHWADDR for device %s",
				    dir->d_name);
			break;
		}
		if (req.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
			closedir(di);
			goto free_hotadd_ctx;
		}
		memcpy(eth_addr.addr_bytes, req.ifr_hwaddr.sa_data,
		       RTE_DIM(eth_addr.addr_bytes));

		if (rte_is_same_ether_addr(&eth_addr, dev->data->mac_addrs)) {
			PMD_DRV_LOG(NOTICE,
				    "Found matching MAC address, adding device %s network name %s",
				    d->name, dir->d_name);

			/* If this device has been hot removed from this
			 * parent device, restore its args.
			 */
			ret = rte_eal_hotplug_add(d->bus->name, d->name,
						  hv->vf_devargs ?
						  hv->vf_devargs : "");
			if (ret) {
				PMD_DRV_LOG(ERR,
					    "Failed to add PCI device %s",
					    d->name);
				break;
			}
		}
		/* When the code reaches here, we either have already added
		 * the device, or its MAC address did not match.
		 */
		closedir(di);
		goto free_hotadd_ctx;
	}
	closedir(di);
retry:
	/* The device is still being initialized, retry after 1 second */
	rte_eal_alarm_set(1000000, netvsc_hotplug_retry, hot_ctx);
	return;

free_hotadd_ctx:
	rte_spinlock_lock(&hv->hotadd_lock);
	LIST_REMOVE(hot_ctx, list);
	rte_spinlock_unlock(&hv->hotadd_lock);

	rte_free(hot_ctx);
}

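/* EAL device event callback. On a PCI add event (and only when no VF
 * is currently attached), queue a netvsc_hotplug_retry() alarm to work
 * out whether the new device is our VF.
 */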
static void
netvsc_hotadd_callback(const char *device_name, enum rte_dev_event_type type,
		       void *arg)
{
	struct hn_data *hv = arg;
	struct hv_hotadd_context *hot_ctx;
	struct rte_devargs *d;
	int ret;

	PMD_DRV_LOG(INFO, "Device notification type=%d device_name=%s",
		    type, device_name);

	switch (type) {
	case RTE_DEV_EVENT_ADD:
		/* if we already have a VF, don't check on hot add */
		if (hv->vf_ctx.vf_state > vf_removed)
			break;

		hot_ctx = rte_zmalloc("NETVSC-HOTADD", sizeof(*hot_ctx),
				      rte_mem_page_size());

		if (!hot_ctx) {
			PMD_DRV_LOG(ERR, "Failed to allocate hotadd context");
			return;
		}

		hot_ctx->hv = hv;
		d = &hot_ctx->da;

		ret = rte_devargs_parse(d, device_name);
		if (ret) {
			PMD_DRV_LOG(ERR,
				    "devargs parsing failed ret=%d", ret);
			goto free_ctx;
		}

		if (!strcmp(d->bus->name, "pci")) {
			/* Start the process of figuring out if this
			 * PCI device is a VF device
			 */
			rte_spinlock_lock(&hv->hotadd_lock);
			LIST_INSERT_HEAD(&hv->hotadd_list, hot_ctx, list);
			rte_spinlock_unlock(&hv->hotadd_lock);
			rte_eal_alarm_set(1000000, netvsc_hotplug_retry, hot_ctx);
			return;
		}

		/* We will switch to VF on RNDIS configure message
		 * sent from VSP
		 */
free_ctx:
		rte_free(hot_ctx);
		break;

	default:
		break;
	}
}

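/* Device configuration: validate the requested Rx/Tx offloads against
 * HN_RX_OFFLOAD_CAPS/HN_TX_OFFLOAD_CAPS, program them via RNDIS, build
 * a default round-robin RSS redirection table, open one vmbus
 * subchannel per extra queue, enable RSS if requested, and finally
 * configure any VF device the same way.
 */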
static int hn_dev_configure(struct rte_eth_dev *dev)
{
	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
	struct rte_eth_rss_conf *rss_conf = &dev_conf->rx_adv_conf.rss_conf;
	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
	const struct rte_eth_txmode *txmode = &dev_conf->txmode;
	struct hn_data *hv = dev->data->dev_private;
	uint64_t unsupported;
	int i, err, subchan;

	PMD_INIT_FUNC_TRACE();

	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;

	unsupported = txmode->offloads & ~HN_TX_OFFLOAD_CAPS;
	if (unsupported) {
		PMD_DRV_LOG(NOTICE,
			    "unsupported TX offload: %#" PRIx64,
			    unsupported);
		return -EINVAL;
	}

	unsupported = rxmode->offloads & ~HN_RX_OFFLOAD_CAPS;
	if (unsupported) {
		PMD_DRV_LOG(NOTICE,
			    "unsupported RX offload: %#" PRIx64,
			    unsupported);
		return -EINVAL;
	}

	hv->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);

	err = hn_rndis_conf_offload(hv, txmode->offloads,
				    rxmode->offloads);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "offload configure failed");
		return err;
	}

	hv->num_queues = RTE_MAX(dev->data->nb_rx_queues,
				 dev->data->nb_tx_queues);

	for (i = 0; i < NDIS_HASH_INDCNT; i++)
		hv->rss_ind[i] = i % dev->data->nb_rx_queues;

	hn_rss_hash_init(hv, rss_conf);

	subchan = hv->num_queues - 1;
	if (subchan > 0) {
		err = hn_subchan_configure(hv, subchan);
		if (err) {
			PMD_DRV_LOG(NOTICE,
				    "subchannel configuration failed");
			return err;
		}

		err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
		if (err) {
			PMD_DRV_LOG(NOTICE,
				    "rss disable failed");
			return err;
		}

		if (rss_conf->rss_hf != 0) {
			err = hn_rndis_conf_rss(hv, 0);
			if (err) {
				PMD_DRV_LOG(NOTICE,
					    "initial RSS config failed");
				return err;
			}
		}
	}

	return hn_vf_configure_locked(dev, dev_conf);
}


static int hn_dev_stats_get(struct rte_eth_dev *dev,
			    struct rte_eth_stats *stats)
{
	unsigned int i;

	hn_vf_stats_get(dev, stats);

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		const struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;

		stats->opackets += txq->stats.packets;
		stats->obytes += txq->stats.bytes;
		stats->oerrors += txq->stats.errors;

		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
			stats->q_opackets[i] = txq->stats.packets;
			stats->q_obytes[i] = txq->stats.bytes;
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		const struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		stats->ipackets += rxq->stats.packets;
		stats->ibytes += rxq->stats.bytes;
		stats->ierrors += rxq->stats.errors;
		stats->imissed += rxq->stats.ring_full;

		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
			stats->q_ipackets[i] = rxq->stats.packets;
			stats->q_ibytes[i] = rxq->stats.bytes;
		}
	}

	stats->rx_nombuf = dev->data->rx_mbuf_alloc_failed;
	return 0;
}

static int
hn_dev_stats_reset(struct rte_eth_dev *dev)
{
	unsigned int i;

	PMD_INIT_FUNC_TRACE();

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;
		memset(&txq->stats, 0, sizeof(struct hn_stats));
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		memset(&rxq->stats, 0, sizeof(struct hn_stats));
	}

	return 0;
}

static int
hn_dev_xstats_reset(struct rte_eth_dev *dev)
{
	int ret;

	ret = hn_dev_stats_reset(dev);
	if (ret != 0)
		return ret;

	return hn_vf_xstats_reset(dev);
}

static int
hn_dev_xstats_count(struct rte_eth_dev *dev)
{
	int ret, count;

	count = dev->data->nb_tx_queues * RTE_DIM(hn_stat_strings);
	count += dev->data->nb_rx_queues * RTE_DIM(hn_stat_strings);

	ret = hn_vf_xstats_get_names(dev, NULL, 0);
	if (ret < 0)
		return ret;

	return count + ret;
}

static int
hn_dev_xstats_get_names(struct rte_eth_dev *dev,
			struct rte_eth_xstat_name *xstats_names,
			unsigned int limit)
{
	unsigned int i, t, count = 0;
	int ret;

	if (!xstats_names)
		return hn_dev_xstats_count(dev);

	/* Note: limit checked in rte_eth_xstats_names() */
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		const struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;

		if (count >= limit)
			break;

		for (t = 0; t < RTE_DIM(hn_stat_strings); t++)
			snprintf(xstats_names[count++].name,
				 RTE_ETH_XSTATS_NAME_SIZE,
				 "tx_q%u_%s", i, hn_stat_strings[t].name);
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		const struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		if (count >= limit)
			break;

		for (t = 0; t < RTE_DIM(hn_stat_strings); t++)
			snprintf(xstats_names[count++].name,
				 RTE_ETH_XSTATS_NAME_SIZE,
				 "rx_q%u_%s", i,
				 hn_stat_strings[t].name);
	}

	ret = hn_vf_xstats_get_names(dev, xstats_names + count,
				     limit - count);
	if (ret < 0)
		return ret;

	return count + ret;
}

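/* Fill in extended statistics. Per-queue values are read through the
 * hn_stat_strings offset table, then the VF's xstats are appended.
 * If the supplied array is too small, the required count is returned
 * instead.
 */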
static int
hn_dev_xstats_get(struct rte_eth_dev *dev,
		  struct rte_eth_xstat *xstats,
		  unsigned int n)
{
	unsigned int i, t, count = 0;
	const unsigned int nstats = hn_dev_xstats_count(dev);
	const char *stats;
	int ret;

	PMD_INIT_FUNC_TRACE();

	if (n < nstats)
		return nstats;

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		const struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;

		stats = (const char *)&txq->stats;
		for (t = 0; t < RTE_DIM(hn_stat_strings); t++, count++) {
			xstats[count].id = count;
			xstats[count].value = *(const uint64_t *)
				(stats + hn_stat_strings[t].offset);
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		const struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		stats = (const char *)&rxq->stats;
		for (t = 0; t < RTE_DIM(hn_stat_strings); t++, count++) {
			xstats[count].id = count;
			xstats[count].value = *(const uint64_t *)
				(stats + hn_stat_strings[t].offset);
		}
	}

	ret = hn_vf_xstats_get(dev, xstats, count, n);
	if (ret < 0)
		return ret;

	return count + ret;
}

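/* Start the device: register for hot-plug notifications, open the
 * Rx filter for directed/multicast/broadcast traffic, start any VF,
 * initialize the link state, and mark all queues as started.
 */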
static int
hn_dev_start(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	int i, error;

	PMD_INIT_FUNC_TRACE();

	/* Register to monitor hot plug events */
	error = rte_dev_event_callback_register(NULL, netvsc_hotadd_callback,
						hv);
	if (error) {
		PMD_DRV_LOG(ERR, "failed to register device event callback");
		return error;
	}

	error = hn_rndis_set_rxfilter(hv,
				      NDIS_PACKET_TYPE_BROADCAST |
				      NDIS_PACKET_TYPE_ALL_MULTICAST |
				      NDIS_PACKET_TYPE_DIRECTED);
	if (error)
		return error;

	error = hn_vf_start(dev);
	if (error)
		hn_rndis_set_rxfilter(hv, 0);

	/* Initialize link state */
	if (error == 0)
		hn_dev_link_update(dev, 0);

	for (i = 0; i < hv->num_queues; i++) {
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	}

	return error;
}

static int
hn_dev_stop(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	int i, ret;

	PMD_INIT_FUNC_TRACE();
	dev->data->dev_started = 0;

	rte_dev_event_callback_unregister(NULL, netvsc_hotadd_callback, hv);
	hn_rndis_set_rxfilter(hv, 0);
	ret = hn_vf_stop(dev);

	for (i = 0; i < hv->num_queues; i++) {
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
	}

	return ret;
}

static int
hn_dev_close(struct rte_eth_dev *dev)
{
	int ret;
	struct hn_data *hv = dev->data->dev_private;
	struct hv_hotadd_context *hot_ctx;

	PMD_INIT_FUNC_TRACE();
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	rte_spinlock_lock(&hv->hotadd_lock);
	while (!LIST_EMPTY(&hv->hotadd_list)) {
		hot_ctx = LIST_FIRST(&hv->hotadd_list);
		rte_eal_alarm_cancel(netvsc_hotplug_retry, hot_ctx);
		LIST_REMOVE(hot_ctx, list);
		rte_free(hot_ctx);
	}
	rte_spinlock_unlock(&hv->hotadd_lock);

	ret = hn_vf_close(dev);
	hn_dev_free_queues(dev);

	return ret;
}

/*
 * Setup connection between PMD and kernel.
 */
static int
hn_attach(struct hn_data *hv, unsigned int mtu)
{
	int error;

	/* Attach NVS */
	error = hn_nvs_attach(hv, mtu);
	if (error)
		goto failed_nvs;

	/* Attach RNDIS */
	error = hn_rndis_attach(hv);
	if (error)
		goto failed_rndis;

	/*
	 * NOTE:
	 * Under certain conditions on certain versions of Hyper-V,
	 * the RNDIS rxfilter is _not_ zero on the hypervisor side
	 * after the successful RNDIS initialization.
	 */
	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_NONE);
	return 0;
failed_rndis:
	hn_nvs_detach(hv);
failed_nvs:
	return error;
}

static void
hn_detach(struct hn_data *hv)
{
	hn_nvs_detach(hv);
	hn_rndis_detach(hv);
}

/*
 * Connects EXISTING rx/tx queues to NEW vmbus channel(s), and
 * re-initializes NDIS and RNDIS, including re-sending initial
 * NDIS/RNDIS configuration. To be used after the underlying vmbus
 * has been un- and re-mapped, e.g. as must happen when the device
 * MTU is changed.
 */
static int
hn_reinit(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct hn_data *hv = dev->data->dev_private;
	struct hn_rx_queue **rxqs = (struct hn_rx_queue **)dev->data->rx_queues;
	struct hn_tx_queue **txqs = (struct hn_tx_queue **)dev->data->tx_queues;
	int i, ret = 0;

	/* Point primary queues at new primary channel */
	if (rxqs[0]) {
		rxqs[0]->chan = hv->channels[0];
		txqs[0]->chan = hv->channels[0];
	}

	ret = hn_attach(hv, mtu);
	if (ret)
		return ret;

	/* Create vmbus subchannels, additional RNDIS configuration */
	ret = hn_dev_configure(dev);
	if (ret)
		return ret;

	/* Point any additional queues at new subchannels */
	if (rxqs[0]) {
		for (i = 1; i < dev->data->nb_rx_queues; i++)
			rxqs[i]->chan = hv->channels[i];
		for (i = 1; i < dev->data->nb_tx_queues; i++)
			txqs[i]->chan = hv->channels[i];
	}

	return ret;
}

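/* Change the MTU. The device must be stopped. The VF MTU is changed
 * first, then the synthetic path is torn down (NVS/RNDIS detached,
 * channels closed, vmbus unmapped) and rebuilt with the new MTU via
 * hn_reinit(). On failure the original MTU is restored. The MTU that
 * actually took effect is read back from the host via RNDIS.
 */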
static int
hn_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct hn_data *hv = dev->data->dev_private;
	unsigned int orig_mtu = dev->data->mtu;
	uint32_t rndis_mtu;
	int ret = 0;
	int i;

	if (dev->data->dev_started) {
		PMD_DRV_LOG(ERR, "Device must be stopped before changing MTU");
		return -EBUSY;
	}

	/* Change MTU of underlying VF dev first, if it exists */
	ret = hn_vf_mtu_set(dev, mtu);
	if (ret)
		return ret;

	/* Release channel resources */
	hn_detach(hv);

	/* Close any secondary vmbus channels */
	for (i = 1; i < hv->num_queues; i++)
		rte_vmbus_chan_close(hv->channels[i]);

	/* Free the primary vmbus channel */
	rte_free(hv->channels[0]);

	/* Unmap and re-map vmbus device */
	rte_vmbus_unmap_device(hv->vmbus);
	ret = rte_vmbus_map_device(hv->vmbus);
	if (ret) {
		/* This is a catastrophic error - the device is unusable */
		PMD_DRV_LOG(ERR, "Could not re-map vmbus device!");
		return ret;
	}

	/* Update pointers to re-mapped UIO resources */
	hv->rxbuf_res = hv->vmbus->resource[HV_RECV_BUF_MAP];
	hv->chim_res  = hv->vmbus->resource[HV_SEND_BUF_MAP];

	/* Re-open the primary vmbus channel */
	ret = rte_vmbus_chan_open(hv->vmbus, &hv->channels[0]);
	if (ret) {
		/* This is a catastrophic error - the device is unusable */
		PMD_DRV_LOG(ERR, "Could not re-open vmbus channel!");
		return ret;
	}

	rte_vmbus_set_latency(hv->vmbus, hv->channels[0], hv->latency);

	ret = hn_reinit(dev, mtu);
	if (!ret)
		goto out;

	/* In case of error, attempt to restore original MTU */
	ret = hn_reinit(dev, orig_mtu);
	if (ret)
		PMD_DRV_LOG(ERR, "Restoring original MTU failed for netvsc");

	ret = hn_vf_mtu_set(dev, orig_mtu);
	if (ret)
		PMD_DRV_LOG(ERR, "Restoring original MTU failed for VF");

out:
	if (hn_rndis_get_mtu(hv, &rndis_mtu)) {
		PMD_DRV_LOG(ERR, "Could not get MTU via RNDIS");
	} else {
		dev->data->mtu = (uint16_t)rndis_mtu;
		PMD_DRV_LOG(DEBUG, "RNDIS MTU is %u", dev->data->mtu);
	}

	return ret;
}

static const struct eth_dev_ops hn_eth_dev_ops = {
	.dev_configure		= hn_dev_configure,
	.dev_start		= hn_dev_start,
	.dev_stop		= hn_dev_stop,
	.dev_close		= hn_dev_close,
	.dev_infos_get		= hn_dev_info_get,
	.txq_info_get		= hn_dev_tx_queue_info,
	.rxq_info_get		= hn_dev_rx_queue_info,
	.dev_supported_ptypes_get = hn_vf_supported_ptypes,
	.promiscuous_enable     = hn_dev_promiscuous_enable,
	.promiscuous_disable    = hn_dev_promiscuous_disable,
	.allmulticast_enable    = hn_dev_allmulticast_enable,
	.allmulticast_disable   = hn_dev_allmulticast_disable,
	.set_mc_addr_list	= hn_dev_mc_addr_list,
	.mtu_set                = hn_dev_mtu_set,
	.reta_update		= hn_rss_reta_update,
	.reta_query             = hn_rss_reta_query,
	.rss_hash_update	= hn_rss_hash_update,
	.rss_hash_conf_get      = hn_rss_hash_conf_get,
	.tx_queue_setup		= hn_dev_tx_queue_setup,
	.tx_queue_release	= hn_dev_tx_queue_release,
	.tx_done_cleanup        = hn_dev_tx_done_cleanup,
	.rx_queue_setup		= hn_dev_rx_queue_setup,
	.rx_queue_release	= hn_dev_rx_queue_release,
	.link_update		= hn_dev_link_update,
	.stats_get		= hn_dev_stats_get,
	.stats_reset            = hn_dev_stats_reset,
	.xstats_get		= hn_dev_xstats_get,
	.xstats_get_names	= hn_dev_xstats_get_names,
	.xstats_reset		= hn_dev_xstats_reset,
};

static int
eth_hn_dev_init(struct rte_eth_dev *eth_dev)
{
	struct hn_data *hv = eth_dev->data->dev_private;
	struct rte_device *device = eth_dev->device;
	struct rte_vmbus_device *vmbus;
	uint32_t mtu;
	unsigned int rxr_cnt;
	int err, max_chan;

	PMD_INIT_FUNC_TRACE();

	rte_spinlock_init(&hv->hotadd_lock);
	LIST_INIT(&hv->hotadd_list);

	vmbus = container_of(device, struct rte_vmbus_device, device);
	eth_dev->dev_ops = &hn_eth_dev_ops;
	eth_dev->rx_queue_count = hn_dev_rx_queue_count;
	eth_dev->rx_descriptor_status = hn_dev_rx_queue_status;
	eth_dev->tx_descriptor_status = hn_dev_tx_descriptor_status;
	eth_dev->tx_pkt_burst = &hn_xmit_pkts;
	eth_dev->rx_pkt_burst = &hn_recv_pkts;

	/*
	 * For secondary processes, we don't initialize any further as primary
	 * has already done this work.
	 */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;

	/* Hyper-V only supports one MAC address */
	eth_dev->data->mac_addrs = rte_calloc("hv_mac", HN_MAX_MAC_ADDRS,
					      sizeof(struct rte_ether_addr), 0);
	if (eth_dev->data->mac_addrs == NULL) {
		PMD_INIT_LOG(ERR,
			     "Failed to allocate memory to store MAC addresses");
		return -ENOMEM;
	}

	hv->vmbus = vmbus;
	hv->rxbuf_res = vmbus->resource[HV_RECV_BUF_MAP];
	hv->chim_res  = vmbus->resource[HV_SEND_BUF_MAP];
	hv->port_id = eth_dev->data->port_id;
	hv->latency = HN_CHAN_LATENCY_NS;
	hv->rx_copybreak = HN_RXCOPY_THRESHOLD;
	hv->tx_copybreak = HN_TXCOPY_THRESHOLD;
	hv->rx_extmbuf_enable = HN_RX_EXTMBUF_ENABLE;
	hv->max_queues = 1;

	rte_rwlock_init(&hv->vf_lock);
	hv->vf_ctx.vf_vsc_switched = false;
	hv->vf_ctx.vf_vsp_reported = false;
	hv->vf_ctx.vf_attached = false;
	hv->vf_ctx.vf_state = vf_unknown;

	err = hn_parse_args(eth_dev);
	if (err)
		return err;

	strlcpy(hv->owner.name, eth_dev->device->name,
		RTE_ETH_MAX_OWNER_NAME_LEN);
	err = rte_eth_dev_owner_new(&hv->owner.id);
	if (err) {
		PMD_INIT_LOG(ERR, "Can not get owner id");
		return err;
	}

	/* Initialize primary channel input for control operations */
	err = rte_vmbus_chan_open(vmbus, &hv->channels[0]);
	if (err)
		return err;

	rte_vmbus_set_latency(hv->vmbus, hv->channels[0], hv->latency);

	hv->primary = hn_rx_queue_alloc(hv, 0,
					eth_dev->device->numa_node);

	if (!hv->primary)
		return -ENOMEM;

	err = hn_attach(hv, RTE_ETHER_MTU);
	if (err)
		goto failed;

	err = hn_chim_init(eth_dev);
	if (err)
		goto failed;

	err = hn_rndis_get_mtu(hv, &mtu);
	if (err)
		goto failed;
	eth_dev->data->mtu = (uint16_t)mtu;
	PMD_INIT_LOG(DEBUG, "RNDIS MTU is %u", eth_dev->data->mtu);

	err = hn_rndis_get_eaddr(hv, eth_dev->data->mac_addrs->addr_bytes);
	if (err)
		goto failed;

	/* Multi queue requires later versions of windows server */
	if (hv->nvs_ver < NVS_VERSION_5)
		return 0;

	max_chan = rte_vmbus_max_channels(vmbus);
	PMD_INIT_LOG(DEBUG, "VMBus max channels %d", max_chan);
	if (max_chan <= 0)
		goto failed;

	if (hn_rndis_query_rsscaps(hv, &rxr_cnt) != 0)
		rxr_cnt = 1;

	hv->max_queues = RTE_MIN(rxr_cnt, (unsigned int)max_chan);

	/* If VF was reported but not added, do it now */
	if (hv->vf_ctx.vf_vsp_reported && !hv->vf_ctx.vf_vsc_switched) {
		PMD_INIT_LOG(DEBUG, "Adding VF device");

		err = hn_vf_add(eth_dev, hv);
	}

	return 0;

failed:
	PMD_INIT_LOG(NOTICE, "device init failed");

	hn_chim_uninit(eth_dev);
	hn_detach(hv);
	return err;
}

static int
eth_hn_dev_uninit(struct rte_eth_dev *eth_dev)
{
	struct hn_data *hv = eth_dev->data->dev_private;
	int ret, ret_stop;

	PMD_INIT_FUNC_TRACE();

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	ret_stop = hn_dev_stop(eth_dev);
	hn_dev_close(eth_dev);

	free(hv->vf_devargs);
	hv->vf_devargs = NULL;

	hn_detach(hv);
	hn_chim_uninit(eth_dev);
	rte_vmbus_chan_close(hv->channels[0]);
	rte_free(hv->primary);
	ret = rte_eth_dev_owner_delete(hv->owner.id);
	if (ret != 0)
		return ret;

	return ret_stop;
}

static int eth_hn_probe(struct rte_vmbus_driver *drv __rte_unused,
			struct rte_vmbus_device *dev)
{
	struct rte_eth_dev *eth_dev;
	int ret;

	PMD_INIT_FUNC_TRACE();

	ret = rte_dev_event_monitor_start();
	if (ret) {
		PMD_DRV_LOG(ERR, "Failed to start device event monitoring");
		return ret;
	}

	eth_dev = eth_dev_vmbus_allocate(dev, sizeof(struct hn_data));
	if (!eth_dev)
		return -ENOMEM;

	ret = eth_hn_dev_init(eth_dev);
	if (ret) {
		eth_dev_vmbus_release(eth_dev);
		rte_dev_event_monitor_stop();
	} else {
		rte_eth_dev_probing_finish(eth_dev);
	}

	return ret;
}

static int eth_hn_remove(struct rte_vmbus_device *dev)
{
	struct rte_eth_dev *eth_dev;
	int ret;

	PMD_INIT_FUNC_TRACE();

	eth_dev = rte_eth_dev_allocated(dev->device.name);
	if (!eth_dev)
		return 0; /* port already released */

	ret = eth_hn_dev_uninit(eth_dev);
	if (ret)
		return ret;

	eth_dev_vmbus_release(eth_dev);
	rte_dev_event_monitor_stop();
	return 0;
}

/* Network device GUID */
static const rte_uuid_t hn_net_ids[] = {
	/*  f8615163-df3e-46c5-913f-f2d2f965ed0e */
	RTE_UUID_INIT(0xf8615163, 0xdf3e, 0x46c5, 0x913f, 0xf2d2f965ed0eULL),
	{ 0 }
};

static struct rte_vmbus_driver rte_netvsc_pmd = {
	.id_table = hn_net_ids,
	.probe = eth_hn_probe,
	.remove = eth_hn_remove,
};

RTE_PMD_REGISTER_VMBUS(net_netvsc, rte_netvsc_pmd);
RTE_PMD_REGISTER_KMOD_DEP(net_netvsc, "* uio_hv_generic");
RTE_LOG_REGISTER_SUFFIX(hn_logtype_init, init, NOTICE);
RTE_LOG_REGISTER_SUFFIX(hn_logtype_driver, driver, NOTICE);
RTE_PMD_REGISTER_PARAM_STRING(net_netvsc,
			      NETVSC_ARG_LATENCY "=<uint32> "
			      NETVSC_ARG_RXBREAK "=<uint32> "
			      NETVSC_ARG_TXBREAK "=<uint32> "
			      NETVSC_ARG_RX_EXTMBUF_ENABLE "=<0|1>");