/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2016-2018 Microsoft Corporation
 * Copyright(c) 2013-2016 Brocade Communications Systems, Inc.
 * All rights reserved.
 */

#include <stdint.h>
#include <string.h>
#include <stdio.h>
#include <errno.h>
#include <unistd.h>
#include <dirent.h>
#include <net/if.h>
#include <net/if_arp.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <sys/ioctl.h>

#include <rte_ethdev.h>
#include <rte_memcpy.h>
#include <rte_string_fns.h>
#include <rte_memzone.h>
#include <rte_devargs.h>
#include <rte_malloc.h>
#include <rte_kvargs.h>
#include <rte_atomic.h>
#include <rte_branch_prediction.h>
#include <rte_ether.h>
#include <ethdev_driver.h>
#include <rte_cycles.h>
#include <rte_errno.h>
#include <rte_memory.h>
#include <rte_eal.h>
#include <dev_driver.h>
#include <bus_driver.h>
#include <bus_vmbus_driver.h>
#include <rte_alarm.h>

#include "hn_logs.h"
#include "hn_var.h"
#include "hn_rndis.h"
#include "hn_nvs.h"
#include "ndis.h"

#define HN_TX_OFFLOAD_CAPS (RTE_ETH_TX_OFFLOAD_IPV4_CKSUM | \
			    RTE_ETH_TX_OFFLOAD_TCP_CKSUM  | \
			    RTE_ETH_TX_OFFLOAD_UDP_CKSUM  | \
			    RTE_ETH_TX_OFFLOAD_TCP_TSO    | \
			    RTE_ETH_TX_OFFLOAD_MULTI_SEGS | \
			    RTE_ETH_TX_OFFLOAD_VLAN_INSERT)

#define HN_RX_OFFLOAD_CAPS (RTE_ETH_RX_OFFLOAD_CHECKSUM | \
			    RTE_ETH_RX_OFFLOAD_VLAN_STRIP | \
			    RTE_ETH_RX_OFFLOAD_RSS_HASH)

#define NETVSC_ARG_LATENCY "latency"
#define NETVSC_ARG_RXBREAK "rx_copybreak"
#define NETVSC_ARG_TXBREAK "tx_copybreak"
#define NETVSC_ARG_RX_EXTMBUF_ENABLE "rx_extmbuf_enable"

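/*
 * Illustrative example of passing these options on the EAL command
 * line (assumes the usual allow-list syntax, with the VMBus device
 * identified by its instance UUID; the UUID below is a placeholder):
 *
 *   dpdk-testpmd -a deec5362-4e48-41e4-b8a5-d0b4e5a62a0f,latency=20,rx_copybreak=256 -- -i
 *
 * "latency" is given in microseconds; the copybreak thresholds are in
 * bytes.
 */
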
/* The maximum number of retries when hot adding a VF device */
#define NETVSC_MAX_HOTADD_RETRY 10

struct hn_xstats_name_off {
	char name[RTE_ETH_XSTATS_NAME_SIZE];
	unsigned int offset;
};

static const struct hn_xstats_name_off hn_stat_strings[] = {
	{ "good_packets",           offsetof(struct hn_stats, packets) },
	{ "good_bytes",             offsetof(struct hn_stats, bytes) },
	{ "errors",                 offsetof(struct hn_stats, errors) },
	{ "ring full",              offsetof(struct hn_stats, ring_full) },
	{ "channel full",           offsetof(struct hn_stats, channel_full) },
	{ "multicast_packets",      offsetof(struct hn_stats, multicast) },
	{ "broadcast_packets",      offsetof(struct hn_stats, broadcast) },
	{ "undersize_packets",      offsetof(struct hn_stats, size_bins[0]) },
	{ "size_64_packets",        offsetof(struct hn_stats, size_bins[1]) },
	{ "size_65_127_packets",    offsetof(struct hn_stats, size_bins[2]) },
	{ "size_128_255_packets",   offsetof(struct hn_stats, size_bins[3]) },
	{ "size_256_511_packets",   offsetof(struct hn_stats, size_bins[4]) },
	{ "size_512_1023_packets",  offsetof(struct hn_stats, size_bins[5]) },
	{ "size_1024_1518_packets", offsetof(struct hn_stats, size_bins[6]) },
	{ "size_1519_max_packets",  offsetof(struct hn_stats, size_bins[7]) },
};

/* The default RSS key.
 * This value is the same as MLX5 so that flows will be
 * received on the same path for both VF and synthetic NIC.
 */
static const uint8_t rss_default_key[NDIS_HASH_KEYSIZE_TOEPLITZ] = {
	0x2c, 0xc6, 0x81, 0xd1,	0x5b, 0xdb, 0xf4, 0xf7,
	0xfc, 0xa2, 0x83, 0x19,	0xdb, 0x1a, 0x3e, 0x94,
	0x6b, 0x9e, 0x38, 0xd9,	0x2c, 0x9c, 0x03, 0xd1,
	0xad, 0x99, 0x44, 0xa7,	0xd9, 0x56, 0x3d, 0x59,
	0x06, 0x3c, 0x25, 0xf3,	0xfc, 0x1f, 0xdc, 0x2a,
};

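/*
 * Allocate an ethdev port for the VMBus device. The primary process
 * creates the port and its private data; secondary processes attach to
 * the port already created by the primary. Interrupts are simulated,
 * so the handle type is marked external.
 */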
static struct rte_eth_dev *
eth_dev_vmbus_allocate(struct rte_vmbus_device *dev, size_t private_data_size)
{
	struct rte_eth_dev *eth_dev;
	const char *name;

	if (!dev)
		return NULL;

	name = dev->device.name;

	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
		eth_dev = rte_eth_dev_allocate(name);
		if (!eth_dev) {
			PMD_DRV_LOG(NOTICE, "cannot allocate rte ethdev");
			return NULL;
		}

		if (private_data_size) {
			eth_dev->data->dev_private =
				rte_zmalloc_socket(name, private_data_size,
						   RTE_CACHE_LINE_SIZE,
						   dev->device.numa_node);
			if (!eth_dev->data->dev_private) {
				PMD_DRV_LOG(NOTICE, "cannot allocate driver data");
				rte_eth_dev_release_port(eth_dev);
				return NULL;
			}
		}
	} else {
		eth_dev = rte_eth_dev_attach_secondary(name);
		if (!eth_dev) {
			PMD_DRV_LOG(NOTICE, "cannot attach secondary");
			return NULL;
		}
	}

	eth_dev->device = &dev->device;

	/* interrupt is simulated */
	rte_intr_type_set(dev->intr_handle, RTE_INTR_HANDLE_EXT);
	eth_dev->data->dev_flags |= RTE_ETH_DEV_INTR_LSC;
	eth_dev->intr_handle = dev->intr_handle;

	return eth_dev;
}

static void
eth_dev_vmbus_release(struct rte_eth_dev *eth_dev)
{
	/* free ether device */
	rte_eth_dev_release_port(eth_dev);

	eth_dev->device = NULL;
	eth_dev->intr_handle = NULL;
}

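/*
 * rte_kvargs_process() handler: apply one "key=value" device argument
 * to the per-device hn_data. The key list has already been validated
 * by rte_kvargs_parse(), so unknown keys are simply ignored here.
 */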
static int hn_set_parameter(const char *key, const char *value, void *opaque)
{
	struct hn_data *hv = opaque;
	char *endp = NULL;
	unsigned long v;

	v = strtoul(value, &endp, 0);
	if (*value == '\0' || *endp != '\0') {
		PMD_DRV_LOG(ERR, "invalid parameter %s=%s", key, value);
		return -EINVAL;
	}

	if (!strcmp(key, NETVSC_ARG_LATENCY)) {
		/* usec to nsec */
		hv->latency = v * 1000;
		PMD_DRV_LOG(DEBUG, "set latency %u nsec", hv->latency);
	} else if (!strcmp(key, NETVSC_ARG_RXBREAK)) {
		hv->rx_copybreak = v;
		PMD_DRV_LOG(DEBUG, "rx copy break set to %u",
			    hv->rx_copybreak);
	} else if (!strcmp(key, NETVSC_ARG_TXBREAK)) {
		hv->tx_copybreak = v;
		PMD_DRV_LOG(DEBUG, "tx copy break set to %u",
			    hv->tx_copybreak);
	} else if (!strcmp(key, NETVSC_ARG_RX_EXTMBUF_ENABLE)) {
		hv->rx_extmbuf_enable = v;
		PMD_DRV_LOG(DEBUG, "rx extmbuf enable set to %u",
			    hv->rx_extmbuf_enable);
	}

	return 0;
}

/* Parse device arguments */
static int hn_parse_args(const struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	struct rte_devargs *devargs = dev->device->devargs;
	static const char * const valid_keys[] = {
		NETVSC_ARG_LATENCY,
		NETVSC_ARG_RXBREAK,
		NETVSC_ARG_TXBREAK,
		NETVSC_ARG_RX_EXTMBUF_ENABLE,
		NULL
	};
	struct rte_kvargs *kvlist;
	int ret;

	if (!devargs)
		return 0;

	PMD_INIT_LOG(DEBUG, "device args %s %s",
		     devargs->name, devargs->args);

	kvlist = rte_kvargs_parse(devargs->args, valid_keys);
	if (!kvlist) {
		PMD_DRV_LOG(ERR, "invalid parameters");
		return -EINVAL;
	}

	ret = rte_kvargs_process(kvlist, NULL, hn_set_parameter, hv);
	rte_kvargs_free(kvlist);

	return ret;
}

/* Update link status.
 * Note: the DPDK definition of "wait_to_complete"
 *   means to block this call until the link is up,
 *   which is not worth supporting.
 */
int
hn_dev_link_update(struct rte_eth_dev *dev,
		   int wait_to_complete __rte_unused)
{
	struct hn_data *hv = dev->data->dev_private;
	struct rte_eth_link link, old;
	int error;

	old = dev->data->dev_link;

	error = hn_rndis_get_linkstatus(hv);
	if (error)
		return error;

	hn_rndis_get_linkspeed(hv);

	link = (struct rte_eth_link) {
		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
		.link_autoneg = RTE_ETH_LINK_FIXED,
		.link_speed = hv->link_speed / 10000,
	};

	if (hv->link_status == NDIS_MEDIA_STATE_CONNECTED)
		link.link_status = RTE_ETH_LINK_UP;
	else
		link.link_status = RTE_ETH_LINK_DOWN;

	if (old.link_status == link.link_status)
		return 0;

	PMD_INIT_LOG(DEBUG, "Port %d is %s", dev->data->port_id,
		     (link.link_status == RTE_ETH_LINK_UP) ? "up" : "down");

	return rte_eth_linkstatus_set(dev, &link);
}

static int hn_dev_info_get(struct rte_eth_dev *dev,
			   struct rte_eth_dev_info *dev_info)
{
	struct hn_data *hv = dev->data->dev_private;
	int rc;

	dev_info->speed_capa = RTE_ETH_LINK_SPEED_10G;
	dev_info->min_rx_bufsize = HN_MIN_RX_BUF_SIZE;
	dev_info->max_rx_pktlen  = HN_MAX_XFER_LEN;
	dev_info->max_mac_addrs  = 1;

	dev_info->hash_key_size = NDIS_HASH_KEYSIZE_TOEPLITZ;
	dev_info->flow_type_rss_offloads = hv->rss_offloads;
	dev_info->reta_size = RTE_ETH_RSS_RETA_SIZE_128;

	dev_info->max_rx_queues = hv->max_queues;
	dev_info->max_tx_queues = hv->max_queues;

	dev_info->tx_desc_lim.nb_min = 1;
	dev_info->tx_desc_lim.nb_max = 4096;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	/* fill in Rx and Tx offload capabilities */
	rc = hn_rndis_get_offload(hv, dev_info);
	if (rc != 0)
		return rc;

	/* merge the offloads and queue limits of the VF */
	return hn_vf_info_get(hv, dev_info);
}

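/*
 * Update the RSS indirection table. RNDIS appears to require RSS to be
 * disabled and re-enabled for a new indirection table to take effect,
 * hence the back-to-back hn_rndis_conf_rss() calls below.
 */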
static int hn_rss_reta_update(struct rte_eth_dev *dev,
			      struct rte_eth_rss_reta_entry64 *reta_conf,
			      uint16_t reta_size)
{
	struct hn_data *hv = dev->data->dev_private;
	unsigned int i;
	int err;

	PMD_INIT_FUNC_TRACE();

	if (reta_size != NDIS_HASH_INDCNT) {
		PMD_DRV_LOG(ERR, "Hash lookup table size does not match NDIS");
		return -EINVAL;
	}

	for (i = 0; i < NDIS_HASH_INDCNT; i++) {
		uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
		uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
		uint64_t mask = (uint64_t)1 << shift;

		if (reta_conf[idx].mask & mask)
			hv->rss_ind[i] = reta_conf[idx].reta[shift];
	}

	err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "rss disable failed");
		return err;
	}

	err = hn_rndis_conf_rss(hv, 0);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "reta reconfig failed");
		return err;
	}

	return hn_vf_reta_hash_update(dev, reta_conf, reta_size);
}

static int hn_rss_reta_query(struct rte_eth_dev *dev,
			     struct rte_eth_rss_reta_entry64 *reta_conf,
			     uint16_t reta_size)
{
	struct hn_data *hv = dev->data->dev_private;
	unsigned int i;

	PMD_INIT_FUNC_TRACE();

	if (reta_size != NDIS_HASH_INDCNT) {
		PMD_DRV_LOG(ERR, "Hash lookup table size does not match NDIS");
		return -EINVAL;
	}

	for (i = 0; i < NDIS_HASH_INDCNT; i++) {
		uint16_t idx = i / RTE_ETH_RETA_GROUP_SIZE;
		uint16_t shift = i % RTE_ETH_RETA_GROUP_SIZE;
		uint64_t mask = (uint64_t)1 << shift;

		if (reta_conf[idx].mask & mask)
			reta_conf[idx].reta[shift] = hv->rss_ind[i];
	}
	return 0;
}

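/*
 * Translate the DPDK rss_conf into NDIS hash type bits and key. When
 * the application supplies no key, the built-in default (shared with
 * MLX5) is used so that VF and synthetic paths hash identically.
 */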
static void hn_rss_hash_init(struct hn_data *hv,
			     const struct rte_eth_rss_conf *rss_conf)
{
	/* Convert from DPDK RSS hash flags to NDIS hash flags */
	hv->rss_hash = NDIS_HASH_FUNCTION_TOEPLITZ;

	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV4)
		hv->rss_hash |= NDIS_HASH_IPV4;
	if (rss_conf->rss_hf & RTE_ETH_RSS_NONFRAG_IPV4_TCP)
		hv->rss_hash |= NDIS_HASH_TCP_IPV4;
	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV6)
		hv->rss_hash |= NDIS_HASH_IPV6;
	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV6_EX)
		hv->rss_hash |= NDIS_HASH_IPV6_EX;
	if (rss_conf->rss_hf & RTE_ETH_RSS_NONFRAG_IPV6_TCP)
		hv->rss_hash |= NDIS_HASH_TCP_IPV6;
	if (rss_conf->rss_hf & RTE_ETH_RSS_IPV6_TCP_EX)
		hv->rss_hash |= NDIS_HASH_TCP_IPV6_EX;

	memcpy(hv->rss_key, rss_conf->rss_key ? : rss_default_key,
	       NDIS_HASH_KEYSIZE_TOEPLITZ);
}

static int hn_rss_hash_update(struct rte_eth_dev *dev,
			      struct rte_eth_rss_conf *rss_conf)
{
	struct hn_data *hv = dev->data->dev_private;
	int err;

	PMD_INIT_FUNC_TRACE();

	err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "rss disable failed");
		return err;
	}

	hn_rss_hash_init(hv, rss_conf);

	if (rss_conf->rss_hf != 0) {
		err = hn_rndis_conf_rss(hv, 0);
		if (err) {
			PMD_DRV_LOG(NOTICE,
				    "rss reconfig failed (RSS disabled)");
			return err;
		}
	}

	return hn_vf_rss_hash_update(dev, rss_conf);
}

static int hn_rss_hash_conf_get(struct rte_eth_dev *dev,
				struct rte_eth_rss_conf *rss_conf)
{
	struct hn_data *hv = dev->data->dev_private;

	PMD_INIT_FUNC_TRACE();

	if (hv->ndis_ver < NDIS_VERSION_6_20) {
		PMD_DRV_LOG(DEBUG, "RSS not supported on this host");
		return -EOPNOTSUPP;
	}

	rss_conf->rss_key_len = NDIS_HASH_KEYSIZE_TOEPLITZ;
	if (rss_conf->rss_key)
		memcpy(rss_conf->rss_key, hv->rss_key,
		       NDIS_HASH_KEYSIZE_TOEPLITZ);

	rss_conf->rss_hf = 0;
	if (hv->rss_hash & NDIS_HASH_IPV4)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV4;

	if (hv->rss_hash & NDIS_HASH_TCP_IPV4)
		rss_conf->rss_hf |= RTE_ETH_RSS_NONFRAG_IPV4_TCP;

	if (hv->rss_hash & NDIS_HASH_IPV6)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV6;

	if (hv->rss_hash & NDIS_HASH_IPV6_EX)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV6_EX;

	if (hv->rss_hash & NDIS_HASH_TCP_IPV6)
		rss_conf->rss_hf |= RTE_ETH_RSS_NONFRAG_IPV6_TCP;

	if (hv->rss_hash & NDIS_HASH_TCP_IPV6_EX)
		rss_conf->rss_hf |= RTE_ETH_RSS_IPV6_TCP_EX;

	return 0;
}

static int
hn_dev_promiscuous_enable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;

	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_PROMISCUOUS);
	return hn_vf_promiscuous_enable(dev);
}

static int
hn_dev_promiscuous_disable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	uint32_t filter;

	filter = NDIS_PACKET_TYPE_DIRECTED | NDIS_PACKET_TYPE_BROADCAST;
	if (dev->data->all_multicast)
		filter |= NDIS_PACKET_TYPE_ALL_MULTICAST;
	hn_rndis_set_rxfilter(hv, filter);
	return hn_vf_promiscuous_disable(dev);
}

static int
hn_dev_allmulticast_enable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;

	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_DIRECTED |
			      NDIS_PACKET_TYPE_ALL_MULTICAST |
			      NDIS_PACKET_TYPE_BROADCAST);
	return hn_vf_allmulticast_enable(dev);
}

static int
hn_dev_allmulticast_disable(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;

	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_DIRECTED |
			      NDIS_PACKET_TYPE_BROADCAST);
	return hn_vf_allmulticast_disable(dev);
}

static int
hn_dev_mc_addr_list(struct rte_eth_dev *dev,
		    struct rte_ether_addr *mc_addr_set,
		    uint32_t nb_mc_addr)
{
	/* No filtering on the synthetic path, but can do it on VF */
	return hn_vf_mc_addr_list(dev, mc_addr_set, nb_mc_addr);
}

/* Set up the vmbus subchannels shared by the rx/tx queues */
static int hn_subchan_configure(struct hn_data *hv,
				uint32_t subchan)
{
	struct vmbus_channel *primary = hn_primary_chan(hv);
	int err;
	unsigned int retry = 0;

	PMD_DRV_LOG(DEBUG,
		    "open %u subchannels", subchan);

	/* Send create sub channels command */
	err = hn_nvs_alloc_subchans(hv, &subchan);
	if (err)
		return err;

	while (subchan > 0) {
		struct vmbus_channel *new_sc;
		uint16_t chn_index;

		err = rte_vmbus_subchan_open(primary, &new_sc);
		if (err == -ENOENT && ++retry < 1000) {
			/* This can happen if not ready yet */
			rte_delay_ms(10);
			continue;
		}

		if (err) {
			PMD_DRV_LOG(ERR,
				    "open subchannel failed: %d", err);
			return err;
		}

		rte_vmbus_set_latency(hv->vmbus, new_sc, hv->latency);

		retry = 0;
		chn_index = rte_vmbus_sub_channel_index(new_sc);
		if (chn_index == 0 || chn_index > hv->max_queues) {
			PMD_DRV_LOG(ERR,
				    "Invalid subchannel offermsg channel %u",
				    chn_index);
			return -EIO;
		}

		PMD_DRV_LOG(DEBUG, "new sub channel %u", chn_index);
		hv->channels[chn_index] = new_sc;
		--subchan;
	}

	return err;
}

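/*
 * Alarm callback for VF hot-add: look under
 * /sys/bus/pci/devices/<addr>/net for the netdev of the new PCI device
 * and compare its MAC address against this port's. On a match, attach
 * the device as the VF. If sysfs is not populated yet, re-arm the
 * alarm and retry (up to NETVSC_MAX_HOTADD_RETRY times, one per second).
 */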
static void netvsc_hotplug_retry(void *args)
{
	int ret;
	struct hv_hotadd_context *hot_ctx = args;
	struct hn_data *hv = hot_ctx->hv;
	struct rte_eth_dev *dev = &rte_eth_devices[hv->port_id];
	struct rte_devargs *d = &hot_ctx->da;
	char buf[256];

	DIR *di;
	struct dirent *dir;
	struct ifreq req;
	struct rte_ether_addr eth_addr;
	int s;

	PMD_DRV_LOG(DEBUG, "%s: retry count %d",
		    __func__, hot_ctx->eal_hot_plug_retry);

	if (hot_ctx->eal_hot_plug_retry++ > NETVSC_MAX_HOTADD_RETRY) {
		PMD_DRV_LOG(NOTICE, "Failed to find the hot-added PCI device, retry=%d",
			    hot_ctx->eal_hot_plug_retry);
		goto free_hotadd_ctx;
	}

	snprintf(buf, sizeof(buf), "/sys/bus/pci/devices/%s/net", d->name);
	di = opendir(buf);
	if (!di) {
		PMD_DRV_LOG(DEBUG, "%s: can't open directory %s, "
			    "retrying in 1 second", __func__, buf);
		goto retry;
	}

	while ((dir = readdir(di))) {
		/* Skip . and .. directories */
		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
			continue;

		/* Try to get the MAC address if this is a network device */
		s = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
		if (s == -1) {
			PMD_DRV_LOG(ERR, "Failed to create socket errno %d",
				    errno);
			break;
		}
		strlcpy(req.ifr_name, dir->d_name, sizeof(req.ifr_name));
		ret = ioctl(s, SIOCGIFHWADDR, &req);
		close(s);
		if (ret == -1) {
			PMD_DRV_LOG(ERR,
				    "Failed to send SIOCGIFHWADDR for device %s",
				    dir->d_name);
			break;
		}
		if (req.ifr_hwaddr.sa_family != ARPHRD_ETHER) {
			closedir(di);
			goto free_hotadd_ctx;
		}
		memcpy(eth_addr.addr_bytes, req.ifr_hwaddr.sa_data,
		       RTE_DIM(eth_addr.addr_bytes));

		if (rte_is_same_ether_addr(&eth_addr, dev->data->mac_addrs)) {
			PMD_DRV_LOG(NOTICE,
				    "Found matching MAC address, adding device %s network name %s",
				    d->name, dir->d_name);

			/* If this device has been hot removed from this
			 * parent device, restore its args.
			 */
			ret = rte_eal_hotplug_add(d->bus->name, d->name,
						  hv->vf_devargs ?
						  hv->vf_devargs : "");
			if (ret) {
				PMD_DRV_LOG(ERR,
					    "Failed to add PCI device %s",
					    d->name);
				break;
			}
		}
		/* When the code reaches here, we either have already added
		 * the device, or its MAC address did not match.
		 */
		closedir(di);
		goto free_hotadd_ctx;
	}
	closedir(di);
retry:
	/* The device is still being initialized, retry after 1 second */
	rte_eal_alarm_set(1000000, netvsc_hotplug_retry, hot_ctx);
	return;

free_hotadd_ctx:
	rte_spinlock_lock(&hv->hotadd_lock);
	LIST_REMOVE(hot_ctx, list);
	rte_spinlock_unlock(&hv->hotadd_lock);

	rte_free(hot_ctx);
}

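/*
 * Device event callback: on hot-add of a PCI device while this port
 * has no VF yet, queue a hotadd context and schedule
 * netvsc_hotplug_retry() to determine whether the device is our VF.
 */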
static void
netvsc_hotadd_callback(const char *device_name, enum rte_dev_event_type type,
		       void *arg)
{
	struct hn_data *hv = arg;
	struct hv_hotadd_context *hot_ctx;
	struct rte_devargs *d;
	int ret;

	PMD_DRV_LOG(INFO, "Device notification type=%d device_name=%s",
		    type, device_name);

	switch (type) {
	case RTE_DEV_EVENT_ADD:
		/* if we already have a VF, don't check on hot add */
		if (hv->vf_ctx.vf_state > vf_removed)
			break;

		hot_ctx = rte_zmalloc("NETVSC-HOTADD", sizeof(*hot_ctx),
				      rte_mem_page_size());

		if (!hot_ctx) {
			PMD_DRV_LOG(ERR, "Failed to allocate hotadd context");
			return;
		}

		hot_ctx->hv = hv;
		d = &hot_ctx->da;

		ret = rte_devargs_parse(d, device_name);
		if (ret) {
			PMD_DRV_LOG(ERR,
				    "devargs parsing failed ret=%d", ret);
			goto free_ctx;
		}

		if (!strcmp(d->bus->name, "pci")) {
			/* Start the process of figuring out if this
			 * PCI device is a VF device
			 */
			rte_spinlock_lock(&hv->hotadd_lock);
			LIST_INSERT_HEAD(&hv->hotadd_list, hot_ctx, list);
			rte_spinlock_unlock(&hv->hotadd_lock);
			rte_eal_alarm_set(1000000, netvsc_hotplug_retry, hot_ctx);
			return;
		}

		/* We will switch to VF on RNDIS configure message
		 * sent from VSP
		 */
free_ctx:
		rte_free(hot_ctx);
		break;

	default:
		break;
	}
}

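/*
 * dev_configure: reject offloads the synthetic path cannot do, program
 * the RNDIS offloads, seed the RSS indirection table round-robin over
 * the Rx queues, open vmbus subchannels when more than one queue is
 * requested, and propagate the configuration to any VF.
 */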
static int hn_dev_configure(struct rte_eth_dev *dev)
{
	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
	struct rte_eth_rss_conf *rss_conf = &dev_conf->rx_adv_conf.rss_conf;
	const struct rte_eth_rxmode *rxmode = &dev_conf->rxmode;
	const struct rte_eth_txmode *txmode = &dev_conf->txmode;
	struct hn_data *hv = dev->data->dev_private;
	uint64_t unsupported;
	int i, err, subchan;

	PMD_INIT_FUNC_TRACE();

	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;

	unsupported = txmode->offloads & ~HN_TX_OFFLOAD_CAPS;
	if (unsupported) {
		PMD_DRV_LOG(NOTICE,
			    "unsupported TX offload: %#" PRIx64,
			    unsupported);
		return -EINVAL;
	}

	unsupported = rxmode->offloads & ~HN_RX_OFFLOAD_CAPS;
	if (unsupported) {
		PMD_DRV_LOG(NOTICE,
			    "unsupported RX offload: %#" PRIx64,
			    unsupported);
		return -EINVAL;
	}

	hv->vlan_strip = !!(rxmode->offloads & RTE_ETH_RX_OFFLOAD_VLAN_STRIP);

	err = hn_rndis_conf_offload(hv, txmode->offloads,
				    rxmode->offloads);
	if (err) {
		PMD_DRV_LOG(NOTICE,
			    "offload configure failed");
		return err;
	}

	hv->num_queues = RTE_MAX(dev->data->nb_rx_queues,
				 dev->data->nb_tx_queues);

	for (i = 0; i < NDIS_HASH_INDCNT; i++)
		hv->rss_ind[i] = i % dev->data->nb_rx_queues;

	hn_rss_hash_init(hv, rss_conf);

	subchan = hv->num_queues - 1;
	if (subchan > 0) {
		err = hn_subchan_configure(hv, subchan);
		if (err) {
			PMD_DRV_LOG(NOTICE,
				    "subchannel configuration failed");
			return err;
		}

		err = hn_rndis_conf_rss(hv, NDIS_RSS_FLAG_DISABLE);
		if (err) {
			PMD_DRV_LOG(NOTICE,
				    "rss disable failed");
			return err;
		}

		if (rss_conf->rss_hf != 0) {
			err = hn_rndis_conf_rss(hv, 0);
			if (err) {
				PMD_DRV_LOG(NOTICE,
					    "initial RSS config failed");
				return err;
			}
		}
	}

	return hn_vf_configure_locked(dev, dev_conf);
}

static int hn_dev_stats_get(struct rte_eth_dev *dev,
			    struct rte_eth_stats *stats)
{
	unsigned int i;

	hn_vf_stats_get(dev, stats);

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		const struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;

		stats->opackets += txq->stats.packets;
		stats->obytes += txq->stats.bytes;
		stats->oerrors += txq->stats.errors;

		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
			stats->q_opackets[i] = txq->stats.packets;
			stats->q_obytes[i] = txq->stats.bytes;
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		const struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		stats->ipackets += rxq->stats.packets;
		stats->ibytes += rxq->stats.bytes;
		stats->ierrors += rxq->stats.errors;
		stats->imissed += rxq->stats.ring_full;

		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
			stats->q_ipackets[i] = rxq->stats.packets;
			stats->q_ibytes[i] = rxq->stats.bytes;
		}
	}

	stats->rx_nombuf = dev->data->rx_mbuf_alloc_failed;
	return 0;
}

static int
hn_dev_stats_reset(struct rte_eth_dev *dev)
{
	unsigned int i;

	PMD_INIT_FUNC_TRACE();

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;
		memset(&txq->stats, 0, sizeof(struct hn_stats));
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		memset(&rxq->stats, 0, sizeof(struct hn_stats));
	}

	return 0;
}

static int
hn_dev_xstats_reset(struct rte_eth_dev *dev)
{
	int ret;

	ret = hn_dev_stats_reset(dev);
	if (ret != 0)
		return ret;

	return hn_vf_xstats_reset(dev);
}

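/* Number of xstats entries: per-queue counters here plus the VF's */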
static int
hn_dev_xstats_count(struct rte_eth_dev *dev)
{
	int ret, count;

	count = dev->data->nb_tx_queues * RTE_DIM(hn_stat_strings);
	count += dev->data->nb_rx_queues * RTE_DIM(hn_stat_strings);

	ret = hn_vf_xstats_get_names(dev, NULL, 0);
	if (ret < 0)
		return ret;

	return count + ret;
}

static int
hn_dev_xstats_get_names(struct rte_eth_dev *dev,
			struct rte_eth_xstat_name *xstats_names,
			unsigned int limit)
{
	unsigned int i, t, count = 0;
	int ret;

	if (!xstats_names)
		return hn_dev_xstats_count(dev);

	/* Note: limit checked in rte_eth_xstats_names() */
	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		const struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;

		if (count >= limit)
			break;

		for (t = 0; t < RTE_DIM(hn_stat_strings); t++)
			snprintf(xstats_names[count++].name,
				 RTE_ETH_XSTATS_NAME_SIZE,
				 "tx_q%u_%s", i, hn_stat_strings[t].name);
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		const struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		if (count >= limit)
			break;

		for (t = 0; t < RTE_DIM(hn_stat_strings); t++)
			snprintf(xstats_names[count++].name,
				 RTE_ETH_XSTATS_NAME_SIZE,
				 "rx_q%u_%s", i,
				 hn_stat_strings[t].name);
	}

	ret = hn_vf_xstats_get_names(dev, xstats_names + count,
				     limit - count);
	if (ret < 0)
		return ret;

	return count + ret;
}

static int
hn_dev_xstats_get(struct rte_eth_dev *dev,
		  struct rte_eth_xstat *xstats,
		  unsigned int n)
{
	unsigned int i, t, count = 0;
	const unsigned int nstats = hn_dev_xstats_count(dev);
	const char *stats;
	int ret;

	PMD_INIT_FUNC_TRACE();

	if (n < nstats)
		return nstats;

	for (i = 0; i < dev->data->nb_tx_queues; i++) {
		const struct hn_tx_queue *txq = dev->data->tx_queues[i];

		if (!txq)
			continue;

		stats = (const char *)&txq->stats;
		for (t = 0; t < RTE_DIM(hn_stat_strings); t++, count++) {
			xstats[count].id = count;
			xstats[count].value = *(const uint64_t *)
				(stats + hn_stat_strings[t].offset);
		}
	}

	for (i = 0; i < dev->data->nb_rx_queues; i++) {
		const struct hn_rx_queue *rxq = dev->data->rx_queues[i];

		if (!rxq)
			continue;

		stats = (const char *)&rxq->stats;
		for (t = 0; t < RTE_DIM(hn_stat_strings); t++, count++) {
			xstats[count].id = count;
			xstats[count].value = *(const uint64_t *)
				(stats + hn_stat_strings[t].offset);
		}
	}

	ret = hn_vf_xstats_get(dev, xstats, count, n);
	if (ret < 0)
		return ret;

	return count + ret;
}

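/*
 * dev_start: register for hot-plug notifications, open the receive
 * filter for directed/broadcast/all-multicast traffic and start the
 * VF if present. The filter is cleared again if the VF fails to start.
 */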
static int
hn_dev_start(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	int i, error;

	PMD_INIT_FUNC_TRACE();

	/* Register to monitor hot plug events */
	error = rte_dev_event_callback_register(NULL, netvsc_hotadd_callback,
						hv);
	if (error) {
		PMD_DRV_LOG(ERR, "failed to register device event callback");
		return error;
	}

	error = hn_rndis_set_rxfilter(hv,
				      NDIS_PACKET_TYPE_BROADCAST |
				      NDIS_PACKET_TYPE_ALL_MULTICAST |
				      NDIS_PACKET_TYPE_DIRECTED);
	if (error)
		return error;

	error = hn_vf_start(dev);
	if (error)
		hn_rndis_set_rxfilter(hv, 0);

	/* Initialize link state */
	if (error == 0)
		hn_dev_link_update(dev, 0);

	for (i = 0; i < hv->num_queues; i++) {
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STARTED;
	}

	return error;
}

static int
hn_dev_stop(struct rte_eth_dev *dev)
{
	struct hn_data *hv = dev->data->dev_private;
	int i, ret;

	PMD_INIT_FUNC_TRACE();
	dev->data->dev_started = 0;

	rte_dev_event_callback_unregister(NULL, netvsc_hotadd_callback, hv);
	hn_rndis_set_rxfilter(hv, 0);
	ret = hn_vf_stop(dev);

	for (i = 0; i < hv->num_queues; i++) {
		dev->data->tx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
		dev->data->rx_queue_state[i] = RTE_ETH_QUEUE_STATE_STOPPED;
	}

	return ret;
}

static int
hn_dev_close(struct rte_eth_dev *dev)
{
	int ret;
	struct hn_data *hv = dev->data->dev_private;
	struct hv_hotadd_context *hot_ctx;

	PMD_INIT_FUNC_TRACE();
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	rte_spinlock_lock(&hv->hotadd_lock);
	while (!LIST_EMPTY(&hv->hotadd_list)) {
		hot_ctx = LIST_FIRST(&hv->hotadd_list);
		rte_eal_alarm_cancel(netvsc_hotplug_retry, hot_ctx);
		LIST_REMOVE(hot_ctx, list);
		rte_free(hot_ctx);
	}
	rte_spinlock_unlock(&hv->hotadd_lock);

	ret = hn_vf_close(dev);
	hn_dev_free_queues(dev);

	return ret;
}

/*
 * Set up the connection between the PMD and the kernel.
 */
static int
hn_attach(struct hn_data *hv, unsigned int mtu)
{
	int error;

	/* Attach NVS */
	error = hn_nvs_attach(hv, mtu);
	if (error)
		goto failed_nvs;

	/* Attach RNDIS */
	error = hn_rndis_attach(hv);
	if (error)
		goto failed_rndis;

	/*
	 * NOTE:
	 * Under certain conditions on certain versions of Hyper-V,
	 * the RNDIS rxfilter is _not_ zero on the hypervisor side
	 * after the successful RNDIS initialization.
	 */
	hn_rndis_set_rxfilter(hv, NDIS_PACKET_TYPE_NONE);
	return 0;
failed_rndis:
	hn_nvs_detach(hv);
failed_nvs:
	return error;
}

static void
hn_detach(struct hn_data *hv)
{
	hn_nvs_detach(hv);
	hn_rndis_detach(hv);
}

/*
 * Connects EXISTING rx/tx queues to NEW vmbus channel(s), and
 * re-initializes NDIS and RNDIS, including re-sending initial
 * NDIS/RNDIS configuration. To be used after the underlying vmbus
 * has been un- and re-mapped, e.g. as must happen when the device
 * MTU is changed.
 */
static int
hn_reinit(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct hn_data *hv = dev->data->dev_private;
	struct hn_rx_queue **rxqs = (struct hn_rx_queue **)dev->data->rx_queues;
	struct hn_tx_queue **txqs = (struct hn_tx_queue **)dev->data->tx_queues;
	int i, ret = 0;

	/* Point primary queues at new primary channel */
	if (rxqs[0]) {
		rxqs[0]->chan = hv->channels[0];
		txqs[0]->chan = hv->channels[0];
	}

	ret = hn_attach(hv, mtu);
	if (ret)
		return ret;

	/* Create vmbus subchannels, additional RNDIS configuration */
	ret = hn_dev_configure(dev);
	if (ret)
		return ret;

	/* Point any additional queues at new subchannels */
	if (rxqs[0]) {
		for (i = 1; i < dev->data->nb_rx_queues; i++)
			rxqs[i]->chan = hv->channels[i];
		for (i = 1; i < dev->data->nb_tx_queues; i++)
			txqs[i]->chan = hv->channels[i];
	}

	return ret;
}

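/*
 * mtu_set: the MTU is negotiated when NVS attaches, so changing it
 * requires a full teardown: detach NVS/RNDIS, close every vmbus
 * channel, unmap and re-map the vmbus device, reopen the primary
 * channel and run hn_reinit() with the new MTU. On failure, the same
 * sequence is attempted with the original MTU.
 */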
static int
hn_dev_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
{
	struct hn_data *hv = dev->data->dev_private;
	unsigned int orig_mtu = dev->data->mtu;
	uint32_t rndis_mtu;
	int ret = 0;
	int i;

	if (dev->data->dev_started) {
		PMD_DRV_LOG(ERR, "Device must be stopped before changing MTU");
		return -EBUSY;
	}

	/* Change MTU of underlying VF dev first, if it exists */
	ret = hn_vf_mtu_set(dev, mtu);
	if (ret)
		return ret;

	/* Release channel resources */
	hn_detach(hv);

	/* Close any secondary vmbus channels */
	for (i = 1; i < hv->num_queues; i++)
		rte_vmbus_chan_close(hv->channels[i]);

	/* Close primary vmbus channel */
	rte_free(hv->channels[0]);

	/* Unmap and re-map vmbus device */
	rte_vmbus_unmap_device(hv->vmbus);
	ret = rte_vmbus_map_device(hv->vmbus);
	if (ret) {
		/* This is a catastrophic error - the device is unusable */
		PMD_DRV_LOG(ERR, "Could not re-map vmbus device!");
		return ret;
	}

	/* Update pointers to re-mapped UIO resources */
	hv->rxbuf_res = hv->vmbus->resource[HV_RECV_BUF_MAP];
	hv->chim_res  = hv->vmbus->resource[HV_SEND_BUF_MAP];

	/* Re-open the primary vmbus channel */
	ret = rte_vmbus_chan_open(hv->vmbus, &hv->channels[0]);
	if (ret) {
		/* This is a catastrophic error - the device is unusable */
		PMD_DRV_LOG(ERR, "Could not re-open vmbus channel!");
		return ret;
	}

	rte_vmbus_set_latency(hv->vmbus, hv->channels[0], hv->latency);

	ret = hn_reinit(dev, mtu);
	if (!ret)
		goto out;

	/* In case of error, attempt to restore original MTU */
	ret = hn_reinit(dev, orig_mtu);
	if (ret)
		PMD_DRV_LOG(ERR, "Restoring original MTU failed for netvsc");

	ret = hn_vf_mtu_set(dev, orig_mtu);
	if (ret)
		PMD_DRV_LOG(ERR, "Restoring original MTU failed for VF");

out:
	if (hn_rndis_get_mtu(hv, &rndis_mtu)) {
		PMD_DRV_LOG(ERR, "Could not get MTU via RNDIS");
	} else {
		dev->data->mtu = (uint16_t)rndis_mtu;
		PMD_DRV_LOG(DEBUG, "RNDIS MTU is %u", dev->data->mtu);
	}

	return ret;
}

static const struct eth_dev_ops hn_eth_dev_ops = {
	.dev_configure		= hn_dev_configure,
	.dev_start		= hn_dev_start,
	.dev_stop		= hn_dev_stop,
	.dev_close		= hn_dev_close,
	.dev_infos_get		= hn_dev_info_get,
	.txq_info_get		= hn_dev_tx_queue_info,
	.rxq_info_get		= hn_dev_rx_queue_info,
	.dev_supported_ptypes_get = hn_vf_supported_ptypes,
	.promiscuous_enable	= hn_dev_promiscuous_enable,
	.promiscuous_disable	= hn_dev_promiscuous_disable,
	.allmulticast_enable	= hn_dev_allmulticast_enable,
	.allmulticast_disable	= hn_dev_allmulticast_disable,
	.set_mc_addr_list	= hn_dev_mc_addr_list,
	.mtu_set		= hn_dev_mtu_set,
	.reta_update		= hn_rss_reta_update,
	.reta_query		= hn_rss_reta_query,
	.rss_hash_update	= hn_rss_hash_update,
	.rss_hash_conf_get	= hn_rss_hash_conf_get,
	.tx_queue_setup		= hn_dev_tx_queue_setup,
	.tx_queue_release	= hn_dev_tx_queue_release,
	.tx_done_cleanup	= hn_dev_tx_done_cleanup,
	.rx_queue_setup		= hn_dev_rx_queue_setup,
	.rx_queue_release	= hn_dev_rx_queue_release,
	.link_update		= hn_dev_link_update,
	.stats_get		= hn_dev_stats_get,
	.stats_reset		= hn_dev_stats_reset,
	.xstats_get		= hn_dev_xstats_get,
	.xstats_get_names	= hn_dev_xstats_get_names,
	.xstats_reset		= hn_dev_xstats_reset,
};

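/*
 * Primary-process device init: wire up the ops and burst functions,
 * open the primary vmbus channel, attach NVS/RNDIS at the default MTU,
 * then query MTU, MAC address and RSS capabilities. Queue limits are
 * sized from the VMBus channel count, and a VF already reported by the
 * VSP is added at the end.
 */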
static int
eth_hn_dev_init(struct rte_eth_dev *eth_dev)
{
	struct hn_data *hv = eth_dev->data->dev_private;
	struct rte_device *device = eth_dev->device;
	struct rte_vmbus_device *vmbus;
	uint32_t mtu;
	unsigned int rxr_cnt;
	int err, max_chan;

	PMD_INIT_FUNC_TRACE();

	rte_spinlock_init(&hv->hotadd_lock);
	LIST_INIT(&hv->hotadd_list);

	vmbus = container_of(device, struct rte_vmbus_device, device);
	eth_dev->dev_ops = &hn_eth_dev_ops;
	eth_dev->rx_queue_count = hn_dev_rx_queue_count;
	eth_dev->rx_descriptor_status = hn_dev_rx_queue_status;
	eth_dev->tx_descriptor_status = hn_dev_tx_descriptor_status;
	eth_dev->tx_pkt_burst = &hn_xmit_pkts;
	eth_dev->rx_pkt_burst = &hn_recv_pkts;

	/*
	 * For secondary processes, we don't initialize any further as primary
	 * has already done this work.
	 */
	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	eth_dev->data->dev_flags |= RTE_ETH_DEV_AUTOFILL_QUEUE_XSTATS;

	/* Since Hyper-V only supports one MAC address */
	eth_dev->data->mac_addrs = rte_calloc("hv_mac", HN_MAX_MAC_ADDRS,
					      sizeof(struct rte_ether_addr), 0);
	if (eth_dev->data->mac_addrs == NULL) {
		PMD_INIT_LOG(ERR,
			     "Failed to allocate memory to store MAC addresses");
		return -ENOMEM;
	}

	hv->vmbus = vmbus;
	hv->rxbuf_res = vmbus->resource[HV_RECV_BUF_MAP];
	hv->chim_res  = vmbus->resource[HV_SEND_BUF_MAP];
	hv->port_id = eth_dev->data->port_id;
	hv->latency = HN_CHAN_LATENCY_NS;
	hv->rx_copybreak = HN_RXCOPY_THRESHOLD;
	hv->tx_copybreak = HN_TXCOPY_THRESHOLD;
	hv->rx_extmbuf_enable = HN_RX_EXTMBUF_ENABLE;
	hv->max_queues = 1;

	rte_rwlock_init(&hv->vf_lock);
	hv->vf_ctx.vf_vsc_switched = false;
	hv->vf_ctx.vf_vsp_reported = false;
	hv->vf_ctx.vf_attached = false;
	hv->vf_ctx.vf_state = vf_unknown;

	err = hn_parse_args(eth_dev);
	if (err)
		return err;

	strlcpy(hv->owner.name, eth_dev->device->name,
		RTE_ETH_MAX_OWNER_NAME_LEN);
	err = rte_eth_dev_owner_new(&hv->owner.id);
	if (err) {
		PMD_INIT_LOG(ERR, "Can not get owner id");
		return err;
	}

	/* Initialize primary channel input for control operations */
	err = rte_vmbus_chan_open(vmbus, &hv->channels[0]);
	if (err)
		return err;

	rte_vmbus_set_latency(hv->vmbus, hv->channels[0], hv->latency);

	hv->primary = hn_rx_queue_alloc(hv, 0,
					eth_dev->device->numa_node);

	if (!hv->primary)
		return -ENOMEM;

	err = hn_attach(hv, RTE_ETHER_MTU);
	if (err)
		goto failed;

	err = hn_chim_init(eth_dev);
	if (err)
		goto failed;

	err = hn_rndis_get_mtu(hv, &mtu);
	if (err)
		goto failed;
	eth_dev->data->mtu = (uint16_t)mtu;
	PMD_INIT_LOG(DEBUG, "RNDIS MTU is %u", eth_dev->data->mtu);

	err = hn_rndis_get_eaddr(hv, eth_dev->data->mac_addrs->addr_bytes);
	if (err)
		goto failed;

	/* Multi queue requires later versions of windows server */
	if (hv->nvs_ver < NVS_VERSION_5)
		return 0;

	max_chan = rte_vmbus_max_channels(vmbus);
	PMD_INIT_LOG(DEBUG, "VMBus max channels %d", max_chan);
	if (max_chan <= 0)
		goto failed;

	if (hn_rndis_query_rsscaps(hv, &rxr_cnt) != 0)
		rxr_cnt = 1;

	hv->max_queues = RTE_MIN(rxr_cnt, (unsigned int)max_chan);

	/* If VF was reported but not added, do it now */
	if (hv->vf_ctx.vf_vsp_reported && !hv->vf_ctx.vf_vsc_switched) {
		PMD_INIT_LOG(DEBUG, "Adding VF device");

		err = hn_vf_add(eth_dev, hv);
	}

	return 0;

failed:
	PMD_INIT_LOG(NOTICE, "device init failed");

	hn_chim_uninit(eth_dev);
	hn_detach(hv);
	return err;
}

static int
eth_hn_dev_uninit(struct rte_eth_dev *eth_dev)
{
	struct hn_data *hv = eth_dev->data->dev_private;
	int ret, ret_stop;

	PMD_INIT_FUNC_TRACE();

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	ret_stop = hn_dev_stop(eth_dev);
	hn_dev_close(eth_dev);

	free(hv->vf_devargs);
	hv->vf_devargs = NULL;

	hn_detach(hv);
	hn_chim_uninit(eth_dev);
	rte_vmbus_chan_close(hv->channels[0]);
	rte_free(hv->primary);
	ret = rte_eth_dev_owner_delete(hv->owner.id);
	if (ret != 0)
		return ret;

	return ret_stop;
}

static int eth_hn_probe(struct rte_vmbus_driver *drv __rte_unused,
			struct rte_vmbus_device *dev)
{
	struct rte_eth_dev *eth_dev;
	int ret;

	PMD_INIT_FUNC_TRACE();

	ret = rte_dev_event_monitor_start();
	if (ret) {
		PMD_DRV_LOG(ERR, "Failed to start device event monitoring");
		return ret;
	}

	eth_dev = eth_dev_vmbus_allocate(dev, sizeof(struct hn_data));
	if (!eth_dev)
		return -ENOMEM;

	ret = eth_hn_dev_init(eth_dev);
	if (ret) {
		eth_dev_vmbus_release(eth_dev);
		rte_dev_event_monitor_stop();
	} else {
		rte_eth_dev_probing_finish(eth_dev);
	}

	return ret;
}

static int eth_hn_remove(struct rte_vmbus_device *dev)
{
	struct rte_eth_dev *eth_dev;
	int ret;

	PMD_INIT_FUNC_TRACE();

	eth_dev = rte_eth_dev_allocated(dev->device.name);
	if (!eth_dev)
		return 0; /* port already released */

	ret = eth_hn_dev_uninit(eth_dev);
	if (ret)
		return ret;

	eth_dev_vmbus_release(eth_dev);
	rte_dev_event_monitor_stop();
	return 0;
}

/* Network device GUID */
static const rte_uuid_t hn_net_ids[] = {
	/*  f8615163-df3e-46c5-913f-f2d2f965ed0e */
	RTE_UUID_INIT(0xf8615163, 0xdf3e, 0x46c5, 0x913f, 0xf2d2f965ed0eULL),
	{ 0 }
};

static struct rte_vmbus_driver rte_netvsc_pmd = {
	.id_table = hn_net_ids,
	.probe = eth_hn_probe,
	.remove = eth_hn_remove,
};

RTE_PMD_REGISTER_VMBUS(net_netvsc, rte_netvsc_pmd);
RTE_PMD_REGISTER_KMOD_DEP(net_netvsc, "* uio_hv_generic");
RTE_LOG_REGISTER_SUFFIX(hn_logtype_init, init, NOTICE);
RTE_LOG_REGISTER_SUFFIX(hn_logtype_driver, driver, NOTICE);
RTE_PMD_REGISTER_PARAM_STRING(net_netvsc,
			      NETVSC_ARG_LATENCY "=<uint32> "
			      NETVSC_ARG_RXBREAK "=<uint32> "
			      NETVSC_ARG_TXBREAK "=<uint32> "
			      NETVSC_ARG_RX_EXTMBUF_ENABLE "=<0|1>");