xref: /dpdk/drivers/net/mana/mana.c (revision 3c4898ef762eeb2578b9ae3d7f6e3a0e5cbca8c8)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2022 Microsoft Corporation
3  */
4 
5 #include <unistd.h>
6 #include <dirent.h>
7 #include <fcntl.h>
8 #include <sys/mman.h>
9 #include <sys/ioctl.h>
10 #include <net/if.h>
11 
12 #include <ethdev_driver.h>
13 #include <ethdev_pci.h>
14 #include <rte_kvargs.h>
15 #include <rte_eal_paging.h>
16 #include <rte_pci.h>
17 
18 #include <infiniband/verbs.h>
19 #include <infiniband/manadv.h>
20 
21 #include <assert.h>
22 
23 #include "mana.h"
24 
25 /* Shared memory between primary/secondary processes, per driver */
26 /* Data to track primary/secondary usage */
27 struct mana_shared_data *mana_shared_data;
28 static struct mana_shared_data mana_local_data;
29 
30 /* The memory region for the above data */
31 static const struct rte_memzone *mana_shared_mz;
32 static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
33 
34 /* Spinlock for mana_shared_data */
35 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
36 
37 /* Allocate a buffer on the stack and fill it with a printf format string. */
38 #define MANA_MKSTR(name, ...) \
39 	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
40 	char name[mkstr_size_##name + 1]; \
41 	\
42 	memset(name, 0, mkstr_size_##name + 1); \
43 	snprintf(name, sizeof(name), "" __VA_ARGS__)
44 
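/*
 * A minimal sketch of how MANA_MKSTR is used: the first snprintf(NULL, 0, ...)
 * computes the formatted length, a VLA of that length plus one byte for the
 * NUL terminator is declared, and the string is then formatted into it.
 * The function name and the sysfs path below are illustrative only.
 */
static __rte_unused void
mana_mkstr_example(void)
{
	const char *ibdev_path = "/sys/class/infiniband/mana_0";

	MANA_MKSTR(dirpath, "%s/device/net", ibdev_path);

	DRV_LOG(DEBUG, "dirpath=%s", dirpath);
}
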
45 int mana_logtype_driver;
46 int mana_logtype_init;
47 
48 /*
49  * Callback from rdma-core to allocate a buffer for a queue.
50  */
51 void *
52 mana_alloc_verbs_buf(size_t size, void *data)
53 {
54 	void *ret;
55 	size_t alignment = rte_mem_page_size();
56 	int socket = (int)(uintptr_t)data;
57 
58 	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
59 
60 	if (alignment == (size_t)-1) {
61 		DRV_LOG(ERR, "Failed to get mem page size");
62 		rte_errno = ENOMEM;
63 		return NULL;
64 	}
65 
66 	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
67 	if (!ret && size)
68 		rte_errno = ENOMEM;
69 	return ret;
70 }
71 
72 void
73 mana_free_verbs_buf(void *ptr, void *data __rte_unused)
74 {
75 	rte_free(ptr);
76 }
77 
78 static int
79 mana_dev_configure(struct rte_eth_dev *dev)
80 {
81 	struct mana_priv *priv = dev->data->dev_private;
82 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
83 
84 	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
85 		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
86 
87 	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
88 		DRV_LOG(ERR, "Only an equal number of RX/TX queues is supported");
89 		return -EINVAL;
90 	}
91 
92 	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
93 		DRV_LOG(ERR, "Number of RX/TX queues must be a power of 2");
94 		return -EINVAL;
95 	}
96 
97 	priv->num_queues = dev->data->nb_rx_queues;
98 
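	/*
	 * Register DPDK-backed buffer allocators with rdma-core so that
	 * queue buffers are allocated through rte_zmalloc_socket() instead
	 * of the default heap allocator.
	 */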
99 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
100 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
101 					.alloc = &mana_alloc_verbs_buf,
102 					.free = &mana_free_verbs_buf,
103 					.data = 0,
104 				}));
105 
106 	return 0;
107 }
108 
109 static void
110 rx_intr_vec_disable(struct mana_priv *priv)
111 {
112 	struct rte_intr_handle *intr_handle = priv->intr_handle;
113 
114 	rte_intr_free_epoll_fd(intr_handle);
115 	rte_intr_vec_list_free(intr_handle);
116 	rte_intr_nb_efd_set(intr_handle, 0);
117 }
118 
119 static int
120 rx_intr_vec_enable(struct mana_priv *priv)
121 {
122 	unsigned int i;
123 	unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
124 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
125 	struct rte_intr_handle *intr_handle = priv->intr_handle;
126 	int ret;
127 
128 	rx_intr_vec_disable(priv);
129 
130 	if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
131 		DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
132 		return -ENOMEM;
133 	}
134 
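	/* Map RX queue i to interrupt vector RTE_INTR_VEC_RXTX_OFFSET + i and
	 * register the queue's completion channel FD as the event FD backing
	 * that vector.
	 */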
135 	for (i = 0; i < n; i++) {
136 		struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
137 
138 		ret = rte_intr_vec_list_index_set(intr_handle, i,
139 						  RTE_INTR_VEC_RXTX_OFFSET + i);
140 		if (ret) {
141 			DRV_LOG(ERR, "Failed to set intr vec %u", i);
142 			return ret;
143 		}
144 
145 		ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
146 		if (ret) {
147 			DRV_LOG(ERR, "Failed to set FD at intr %u", i);
148 			return ret;
149 		}
150 	}
151 
152 	return rte_intr_nb_efd_set(intr_handle, n);
153 }
154 
155 static void
156 rxq_intr_disable(struct mana_priv *priv)
157 {
158 	int err = rte_errno;
159 
160 	rx_intr_vec_disable(priv);
161 	rte_errno = err;
162 }
163 
164 static int
165 rxq_intr_enable(struct mana_priv *priv)
166 {
167 	const struct rte_eth_intr_conf *const intr_conf =
168 		&priv->dev_data->dev_conf.intr_conf;
169 
170 	if (!intr_conf->rxq)
171 		return 0;
172 
173 	return rx_intr_vec_enable(priv);
174 }
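
/*
 * A sketch of the application-side configuration that makes rxq_intr_enable()
 * above set up per-queue interrupt vectors. The function name, port_id and
 * queue count are illustrative; the driver only checks intr_conf.rxq.
 */
static __rte_unused int
mana_rxq_intr_conf_example(uint16_t port_id, uint16_t nb_queues)
{
	struct rte_eth_conf conf = {
		.intr_conf = {
			.rxq = 1, /* request per-RX-queue interrupts */
		},
	};

	return rte_eth_dev_configure(port_id, nb_queues, nb_queues, &conf);
}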
175 
176 static int
177 mana_dev_start(struct rte_eth_dev *dev)
178 {
179 	int ret;
180 	struct mana_priv *priv = dev->data->dev_private;
181 
182 	rte_spinlock_init(&priv->mr_btree_lock);
183 	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
184 				 dev->device->numa_node);
185 	if (ret) {
186 		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
187 		return ret;
188 	}
189 
190 	ret = mana_start_tx_queues(dev);
191 	if (ret) {
192 		DRV_LOG(ERR, "failed to start tx queues %d", ret);
193 		goto failed_tx;
194 	}
195 
196 	ret = mana_start_rx_queues(dev);
197 	if (ret) {
198 		DRV_LOG(ERR, "failed to start rx queues %d", ret);
199 		goto failed_rx;
200 	}
201 
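	/* Make sure all queue state written above is visible before the real
	 * burst functions replace the "removed" stubs.
	 */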
202 	rte_wmb();
203 
204 	dev->tx_pkt_burst = mana_tx_burst;
205 	dev->rx_pkt_burst = mana_rx_burst;
206 
207 	DRV_LOG(INFO, "TX/RX queues have started");
208 
209 	/* Enable datapath for secondary processes */
210 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
211 
212 	ret = rxq_intr_enable(priv);
213 	if (ret) {
214 		DRV_LOG(ERR, "Failed to enable RX interrupts");
215 		goto failed_intr;
216 	}
217 
218 	return 0;
219 
220 failed_intr:
221 	mana_stop_rx_queues(dev);
222 
223 failed_rx:
224 	mana_stop_tx_queues(dev);
225 
226 failed_tx:
227 	mana_mr_btree_free(&priv->mr_btree);
228 
229 	return ret;
230 }
231 
232 static int
233 mana_dev_stop(struct rte_eth_dev *dev)
234 {
235 	int ret;
236 	struct mana_priv *priv = dev->data->dev_private;
237 
238 	rxq_intr_disable(priv);
239 
240 	dev->tx_pkt_burst = mana_tx_burst_removed;
241 	dev->rx_pkt_burst = mana_rx_burst_removed;
242 
243 	/* Stop datapath on secondary processes */
244 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
245 
246 	rte_wmb();
247 
248 	ret = mana_stop_tx_queues(dev);
249 	if (ret) {
250 		DRV_LOG(ERR, "failed to stop tx queues");
251 		return ret;
252 	}
253 
254 	ret = mana_stop_rx_queues(dev);
255 	if (ret) {
256 		DRV_LOG(ERR, "failed to stop rx queues");
257 		return ret;
258 	}
259 
260 	return 0;
261 }
262 
263 static int mana_intr_uninstall(struct mana_priv *priv);
264 
265 static int
266 mana_dev_close(struct rte_eth_dev *dev)
267 {
268 	struct mana_priv *priv = dev->data->dev_private;
269 	int ret;
270 
271 	mana_remove_all_mr(priv);
272 
273 	ret = mana_intr_uninstall(priv);
274 	if (ret)
275 		return ret;
276 
277 	ret = ibv_close_device(priv->ib_ctx);
278 	if (ret) {
279 		ret = errno;
280 		return ret;
281 	}
282 
283 	return 0;
284 }
285 
286 static int
287 mana_dev_info_get(struct rte_eth_dev *dev,
288 		  struct rte_eth_dev_info *dev_info)
289 {
290 	struct mana_priv *priv = dev->data->dev_private;
291 
292 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
293 	dev_info->max_mtu = MANA_MAX_MTU;
294 
295 	/* RX params */
296 	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
297 	dev_info->max_rx_pktlen = MANA_MAX_MTU + RTE_ETHER_HDR_LEN;
298 
299 	dev_info->max_rx_queues = priv->max_rx_queues;
300 	dev_info->max_tx_queues = priv->max_tx_queues;
301 
302 	dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
303 	dev_info->max_hash_mac_addrs = 0;
304 
305 	dev_info->max_vfs = 1;
306 
307 	/* Offload params */
308 	dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
309 
310 	dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
311 
312 	/* RSS */
313 	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
314 	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
315 	dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
316 
317 	/* Thresholds */
318 	dev_info->default_rxconf = (struct rte_eth_rxconf){
319 		.rx_thresh = {
320 			.pthresh = 8,
321 			.hthresh = 8,
322 			.wthresh = 0,
323 		},
324 		.rx_free_thresh = 32,
325 		/* If no descriptors available, pkts are dropped by default */
326 		.rx_drop_en = 1,
327 	};
328 
329 	dev_info->default_txconf = (struct rte_eth_txconf){
330 		.tx_thresh = {
331 			.pthresh = 32,
332 			.hthresh = 0,
333 			.wthresh = 0,
334 		},
335 		.tx_rs_thresh = 32,
336 		.tx_free_thresh = 32,
337 	};
338 
339 	/* Buffer limits */
340 	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
341 	dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
342 	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
343 	dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
344 	dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
345 
346 	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
347 	dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
348 	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
349 	dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
350 	dev_info->tx_desc_lim.nb_mtu_seg_max = priv->max_send_sge;
351 
352 	/* Speed */
353 	dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
354 
355 	/* RX params */
356 	dev_info->default_rxportconf.burst_size = 1;
357 	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
358 	dev_info->default_rxportconf.nb_queues = 1;
359 
360 	/* TX params */
361 	dev_info->default_txportconf.burst_size = 1;
362 	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
363 	dev_info->default_txportconf.nb_queues = 1;
364 
365 	return 0;
366 }
367 
368 static void
369 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
370 		       struct rte_eth_txq_info *qinfo)
371 {
372 	struct mana_txq *txq = dev->data->tx_queues[queue_id];
373 
374 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
375 	qinfo->nb_desc = txq->num_desc;
376 }
377 
378 static void
379 mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
380 		       struct rte_eth_rxq_info *qinfo)
381 {
382 	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
383 
384 	qinfo->mp = rxq->mp;
385 	qinfo->nb_desc = rxq->num_desc;
386 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
387 }
388 
389 static const uint32_t *
390 mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
391 {
392 	static const uint32_t ptypes[] = {
393 		RTE_PTYPE_L2_ETHER,
394 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
395 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
396 		RTE_PTYPE_L4_FRAG,
397 		RTE_PTYPE_L4_TCP,
398 		RTE_PTYPE_L4_UDP,
399 		RTE_PTYPE_UNKNOWN
400 	};
401 
402 	return ptypes;
403 }
404 
405 static int
406 mana_rss_hash_update(struct rte_eth_dev *dev,
407 		     struct rte_eth_rss_conf *rss_conf)
408 {
409 	struct mana_priv *priv = dev->data->dev_private;
410 
411 	/* Currently can only update RSS hash when device is stopped */
412 	if (dev->data->dev_started) {
413 		DRV_LOG(ERR, "Can't update RSS after device has started");
414 		return -ENODEV;
415 	}
416 
417 	if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
418 		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
419 			dev->data->port_id, rss_conf->rss_hf);
420 		return -EINVAL;
421 	}
422 
423 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
424 		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
425 			DRV_LOG(ERR, "Port %u key len must be %u long",
426 				dev->data->port_id,
427 				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
428 			return -EINVAL;
429 		}
430 
431 		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
432 		priv->rss_conf.rss_key =
433 			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
434 				    RTE_CACHE_LINE_SIZE);
435 		if (!priv->rss_conf.rss_key)
436 			return -ENOMEM;
437 		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
438 		       rss_conf->rss_key_len);
439 	}
440 	priv->rss_conf.rss_hf = rss_conf->rss_hf;
441 
442 	return 0;
443 }
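
/*
 * An application-side sketch of updating the RSS hash through the ethdev API,
 * which ends up in mana_rss_hash_update() above. The function name and hash
 * types are illustrative; the key must be TOEPLITZ_HASH_KEY_SIZE_IN_BYTES
 * bytes long and the port must be stopped.
 */
static __rte_unused int
mana_rss_update_example(uint16_t port_id)
{
	static uint8_t key[TOEPLITZ_HASH_KEY_SIZE_IN_BYTES];
	struct rte_eth_rss_conf conf = {
		.rss_key = key,
		.rss_key_len = sizeof(key),
		.rss_hf = RTE_ETH_RSS_IPV4 | RTE_ETH_RSS_NONFRAG_IPV4_TCP,
	};

	return rte_eth_dev_rss_hash_update(port_id, &conf);
}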
444 
445 static int
446 mana_rss_hash_conf_get(struct rte_eth_dev *dev,
447 		       struct rte_eth_rss_conf *rss_conf)
448 {
449 	struct mana_priv *priv = dev->data->dev_private;
450 
451 	if (!rss_conf)
452 		return -EINVAL;
453 
454 	if (rss_conf->rss_key &&
455 	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
456 		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
457 		       priv->rss_conf.rss_key_len);
458 	}
459 
460 	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
461 	rss_conf->rss_hf = priv->rss_conf.rss_hf;
462 
463 	return 0;
464 }
465 
466 static int
467 mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
468 			uint16_t nb_desc, unsigned int socket_id,
469 			const struct rte_eth_txconf *tx_conf __rte_unused)
471 {
472 	struct mana_priv *priv = dev->data->dev_private;
473 	struct mana_txq *txq;
474 	int ret;
475 
476 	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
477 	if (!txq) {
478 		DRV_LOG(ERR, "failed to allocate txq");
479 		return -ENOMEM;
480 	}
481 
482 	txq->socket = socket_id;
483 
484 	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
485 					   sizeof(struct mana_txq_desc) *
486 						nb_desc,
487 					   RTE_CACHE_LINE_SIZE, socket_id);
488 	if (!txq->desc_ring) {
489 		DRV_LOG(ERR, "failed to allocate txq desc_ring");
490 		ret = -ENOMEM;
491 		goto fail;
492 	}
493 
494 	txq->gdma_comp_buf = rte_malloc_socket("mana_txq_comp",
495 			sizeof(*txq->gdma_comp_buf) * nb_desc,
496 			RTE_CACHE_LINE_SIZE, socket_id);
497 	if (!txq->gdma_comp_buf) {
498 		DRV_LOG(ERR, "failed to allocate txq comp");
499 		ret = -ENOMEM;
500 		goto fail;
501 	}
502 
503 	ret = mana_mr_btree_init(&txq->mr_btree,
504 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
505 	if (ret) {
506 		DRV_LOG(ERR, "Failed to init TXQ MR btree");
507 		goto fail;
508 	}
509 
510 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
511 		queue_idx, nb_desc, socket_id, txq->desc_ring);
512 
513 	txq->desc_ring_head = 0;
514 	txq->desc_ring_tail = 0;
515 	txq->priv = priv;
516 	txq->num_desc = nb_desc;
517 	dev->data->tx_queues[queue_idx] = txq;
518 
519 	return 0;
520 
521 fail:
522 	rte_free(txq->gdma_comp_buf);
523 	rte_free(txq->desc_ring);
524 	rte_free(txq);
525 	return ret;
526 }
527 
528 static void
529 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
530 {
531 	struct mana_txq *txq = dev->data->tx_queues[qid];
532 
533 	mana_mr_btree_free(&txq->mr_btree);
534 
535 	rte_free(txq->gdma_comp_buf);
536 	rte_free(txq->desc_ring);
537 	rte_free(txq);
538 }
539 
540 static int
541 mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
542 			uint16_t nb_desc, unsigned int socket_id,
543 			const struct rte_eth_rxconf *rx_conf __rte_unused,
544 			struct rte_mempool *mp)
545 {
546 	struct mana_priv *priv = dev->data->dev_private;
547 	struct mana_rxq *rxq;
548 	int ret;
549 
550 	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
551 	if (!rxq) {
552 		DRV_LOG(ERR, "failed to allocate rxq");
553 		return -ENOMEM;
554 	}
555 
556 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
557 		queue_idx, nb_desc, socket_id);
558 
559 	rxq->socket = socket_id;
560 
561 	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
562 					    sizeof(struct mana_rxq_desc) *
563 						nb_desc,
564 					    RTE_CACHE_LINE_SIZE, socket_id);
565 
566 	if (!rxq->desc_ring) {
567 		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
568 		ret = -ENOMEM;
569 		goto fail;
570 	}
571 
572 	rxq->desc_ring_head = 0;
573 	rxq->desc_ring_tail = 0;
574 
575 	rxq->gdma_comp_buf = rte_malloc_socket("mana_rxq_comp",
576 			sizeof(*rxq->gdma_comp_buf) * nb_desc,
577 			RTE_CACHE_LINE_SIZE, socket_id);
578 	if (!rxq->gdma_comp_buf) {
579 		DRV_LOG(ERR, "failed to allocate rxq comp");
580 		ret = -ENOMEM;
581 		goto fail;
582 	}
583 
584 	ret = mana_mr_btree_init(&rxq->mr_btree,
585 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
586 	if (ret) {
587 		DRV_LOG(ERR, "Failed to init RXQ MR btree");
588 		goto fail;
589 	}
590 
591 	rxq->priv = priv;
592 	rxq->num_desc = nb_desc;
593 	rxq->mp = mp;
594 	dev->data->rx_queues[queue_idx] = rxq;
595 
596 	return 0;
597 
598 fail:
599 	rte_free(rxq->gdma_comp_buf);
600 	rte_free(rxq->desc_ring);
601 	rte_free(rxq);
602 	return ret;
603 }
604 
605 static void
606 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
607 {
608 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
609 
610 	mana_mr_btree_free(&rxq->mr_btree);
611 
612 	rte_free(rxq->gdma_comp_buf);
613 	rte_free(rxq->desc_ring);
614 	rte_free(rxq);
615 }
616 
617 static int
618 mana_dev_link_update(struct rte_eth_dev *dev,
619 		     int wait_to_complete __rte_unused)
620 {
621 	struct rte_eth_link link;
622 
623 	/* MANA has no concept of carrier state, always reporting UP */
624 	link = (struct rte_eth_link) {
625 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
626 		.link_autoneg = RTE_ETH_LINK_FIXED,
627 		.link_speed = RTE_ETH_SPEED_NUM_100G,
628 		.link_status = RTE_ETH_LINK_UP,
629 	};
630 
631 	return rte_eth_linkstatus_set(dev, &link);
632 }
633 
634 static int
635 mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
636 {
637 	unsigned int i;
638 
639 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
640 		struct mana_txq *txq = dev->data->tx_queues[i];
641 
642 		if (!txq)
643 			continue;
644 
645 		stats->opackets += txq->stats.packets;
646 		stats->obytes += txq->stats.bytes;
647 		stats->oerrors += txq->stats.errors;
648 
649 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
650 			stats->q_opackets[i] = txq->stats.packets;
651 			stats->q_obytes[i] = txq->stats.bytes;
652 		}
653 	}
654 
655 	stats->rx_nombuf = 0;
656 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
657 		struct mana_rxq *rxq = dev->data->rx_queues[i];
658 
659 		if (!rxq)
660 			continue;
661 
662 		stats->ipackets += rxq->stats.packets;
663 		stats->ibytes += rxq->stats.bytes;
664 		stats->ierrors += rxq->stats.errors;
665 
666 		/* There is no good way to get stats->imissed, so it is left unset */
667 
668 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
669 			stats->q_ipackets[i] = rxq->stats.packets;
670 			stats->q_ibytes[i] = rxq->stats.bytes;
671 		}
672 
673 		stats->rx_nombuf += rxq->stats.nombuf;
674 	}
675 
676 	return 0;
677 }
678 
679 static int
680 mana_dev_stats_reset(struct rte_eth_dev *dev)
681 {
682 	unsigned int i;
683 
684 	PMD_INIT_FUNC_TRACE();
685 
686 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
687 		struct mana_txq *txq = dev->data->tx_queues[i];
688 
689 		if (!txq)
690 			continue;
691 
692 		memset(&txq->stats, 0, sizeof(txq->stats));
693 	}
694 
695 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
696 		struct mana_rxq *rxq = dev->data->rx_queues[i];
697 
698 		if (!rxq)
699 			continue;
700 
701 		memset(&rxq->stats, 0, sizeof(rxq->stats));
702 	}
703 
704 	return 0;
705 }
706 
707 static int
708 mana_get_ifname(const struct mana_priv *priv, char (*ifname)[IF_NAMESIZE])
709 {
710 	int ret;
711 	DIR *dir;
712 	struct dirent *dent;
713 
714 	MANA_MKSTR(dirpath, "%s/device/net", priv->ib_ctx->device->ibdev_path);
715 
716 	dir = opendir(dirpath);
717 	if (dir == NULL)
718 		return -ENODEV;
719 
720 	while ((dent = readdir(dir)) != NULL) {
721 		char *name = dent->d_name;
722 		FILE *file;
723 		struct rte_ether_addr addr;
724 		char *mac = NULL;
725 
726 		if ((name[0] == '.') &&
727 		    ((name[1] == '\0') ||
728 		     ((name[1] == '.') && (name[2] == '\0'))))
729 			continue;
730 
731 		MANA_MKSTR(path, "%s/%s/address", dirpath, name);
732 
733 		file = fopen(path, "r");
734 		if (!file) {
735 			ret = -ENODEV;
736 			break;
737 		}
738 
739 		ret = fscanf(file, "%ms", &mac);
740 		fclose(file);
741 
742 		if (ret <= 0) {
743 			ret = -EINVAL;
744 			break;
745 		}
746 
747 		ret = rte_ether_unformat_addr(mac, &addr);
748 		free(mac);
749 		if (ret)
750 			break;
751 
752 		if (rte_is_same_ether_addr(&addr, priv->dev_data->mac_addrs)) {
753 			strlcpy(*ifname, name, sizeof(*ifname));
754 			ret = 0;
755 			break;
756 		}
757 	}
758 
759 	closedir(dir);
760 	return ret;
761 }
762 
763 static int
764 mana_ifreq(const struct mana_priv *priv, int req, struct ifreq *ifr)
765 {
766 	int sock, ret;
767 
768 	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
769 	if (sock == -1)
770 		return -errno;
771 
772 	ret = mana_get_ifname(priv, &ifr->ifr_name);
773 	if (ret) {
774 		close(sock);
775 		return ret;
776 	}
777 
778 	if (ioctl(sock, req, ifr) == -1)
779 		ret = -errno;
780 
781 	close(sock);
782 
783 	return ret;
784 }
785 
786 static int
787 mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
788 {
789 	struct mana_priv *priv = dev->data->dev_private;
790 	struct ifreq request = { .ifr_mtu = mtu, };
791 
792 	return mana_ifreq(priv, SIOCSIFMTU, &request);
793 }
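
/*
 * For reference, an application reaches mana_mtu_set() through the ethdev
 * API, e.g. (illustrative values):
 *
 *	ret = rte_eth_dev_set_mtu(port_id, 1500);
 */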
794 
795 static const struct eth_dev_ops mana_dev_ops = {
796 	.dev_configure		= mana_dev_configure,
797 	.dev_start		= mana_dev_start,
798 	.dev_stop		= mana_dev_stop,
799 	.dev_close		= mana_dev_close,
800 	.dev_infos_get		= mana_dev_info_get,
801 	.txq_info_get		= mana_dev_tx_queue_info,
802 	.rxq_info_get		= mana_dev_rx_queue_info,
803 	.dev_supported_ptypes_get = mana_supported_ptypes,
804 	.rss_hash_update	= mana_rss_hash_update,
805 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
806 	.tx_queue_setup		= mana_dev_tx_queue_setup,
807 	.tx_queue_release	= mana_dev_tx_queue_release,
808 	.rx_queue_setup		= mana_dev_rx_queue_setup,
809 	.rx_queue_release	= mana_dev_rx_queue_release,
810 	.rx_queue_intr_enable	= mana_rx_intr_enable,
811 	.rx_queue_intr_disable	= mana_rx_intr_disable,
812 	.link_update		= mana_dev_link_update,
813 	.stats_get		= mana_dev_stats_get,
814 	.stats_reset		= mana_dev_stats_reset,
815 	.mtu_set		= mana_mtu_set,
816 };
817 
818 static const struct eth_dev_ops mana_dev_secondary_ops = {
819 	.stats_get = mana_dev_stats_get,
820 	.stats_reset = mana_dev_stats_reset,
821 	.dev_infos_get = mana_dev_info_get,
822 };
823 
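/*
 * Dummy burst functions installed while the datapath is not running (before
 * dev_start and after dev_stop) so that concurrent RX/TX callers safely
 * receive/send nothing.
 */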
824 uint16_t
825 mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
826 		      struct rte_mbuf **pkts __rte_unused,
827 		      uint16_t pkts_n __rte_unused)
828 {
829 	rte_mb();
830 	return 0;
831 }
832 
833 uint16_t
834 mana_tx_burst_removed(void *dpdk_txq __rte_unused,
835 		      struct rte_mbuf **pkts __rte_unused,
836 		      uint16_t pkts_n __rte_unused)
837 {
838 	rte_mb();
839 	return 0;
840 }
841 
842 #define ETH_MANA_MAC_ARG "mac"
843 static const char * const mana_init_args[] = {
844 	ETH_MANA_MAC_ARG,
845 	NULL,
846 };
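
/*
 * Devargs usage sketch: one or more "mac" keys may be passed with the device.
 * The PCI address and MAC values below are examples only:
 *
 *	--allow 7870:00:00.0,mac=12:34:56:78:9a:bc,mac=12:34:56:78:9a:bd
 */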
847 
848 /* Support parsing up to 8 MAC addresses from the EAL command line */
849 #define MAX_NUM_ADDRESS 8
850 struct mana_conf {
851 	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
852 	unsigned int index;
853 };
854 
855 static int
856 mana_arg_parse_callback(const char *key, const char *val, void *private)
857 {
858 	struct mana_conf *conf = (struct mana_conf *)private;
859 	int ret;
860 
861 	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
862 
863 	if (conf->index >= MAX_NUM_ADDRESS) {
864 		DRV_LOG(ERR, "Exceeding max number of MAC addresses");
865 		return 1;
866 	}
867 
868 	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
869 	if (ret) {
870 		DRV_LOG(ERR, "Invalid MAC address %s", val);
871 		return ret;
872 	}
873 
874 	conf->index++;
875 
876 	return 0;
877 }
878 
879 static int
880 mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
881 {
882 	struct rte_kvargs *kvlist;
883 	unsigned int arg_count;
884 	int ret = 0;
885 
886 	kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
887 	if (!kvlist) {
888 		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
889 		return -EINVAL;
890 	}
891 
892 	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
893 	if (arg_count > MAX_NUM_ADDRESS) {
894 		ret = -EINVAL;
895 		goto free_kvlist;
896 	}
897 	ret = rte_kvargs_process(kvlist, mana_init_args[0],
898 				 mana_arg_parse_callback, conf);
899 	if (ret) {
900 		DRV_LOG(ERR, "error parsing args");
901 		goto free_kvlist;
902 	}
903 
904 free_kvlist:
905 	rte_kvargs_free(kvlist);
906 	return ret;
907 }
908 
909 static int
910 get_port_mac(struct ibv_device *device, unsigned int port,
911 	     struct rte_ether_addr *addr)
912 {
913 	FILE *file;
914 	int ret = 0;
915 	DIR *dir;
916 	struct dirent *dent;
917 	unsigned int dev_port;
918 
919 	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
920 
921 	dir = opendir(path);
922 	if (!dir)
923 		return -ENOENT;
924 
925 	while ((dent = readdir(dir))) {
926 		char *name = dent->d_name;
927 		char *mac = NULL;
928 
929 		MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
930 
931 		/* Ignore . and .. */
932 		if ((name[0] == '.') &&
933 		    ((name[1] == '\0') ||
934 		     ((name[1] == '.') && (name[2] == '\0'))))
935 			continue;
936 
937 		file = fopen(port_path, "r");
938 		if (!file)
939 			continue;
940 
941 		ret = fscanf(file, "%u", &dev_port);
942 		fclose(file);
943 
944 		if (ret != 1)
945 			continue;
946 
947 		/* Ethernet ports start at 0, IB ports start at 1 */
948 		if (dev_port == port - 1) {
949 			MANA_MKSTR(address_path, "%s/%s/address", path, name);
950 
951 			file = fopen(address_path, "r");
952 			if (!file)
953 				continue;
954 
955 			ret = fscanf(file, "%ms", &mac);
956 			fclose(file);
957 
958 			if (ret < 0)
959 				break;
960 
961 			ret = rte_ether_unformat_addr(mac, addr);
962 			if (ret)
963 				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
964 
965 			free(mac);
966 			break;
967 		}
968 	}
969 
970 	closedir(dir);
971 	return ret;
972 }
973 
974 static int
975 mana_ibv_device_to_pci_addr(const struct ibv_device *device,
976 			    struct rte_pci_addr *pci_addr)
977 {
978 	FILE *file;
979 	char *line = NULL;
980 	size_t len = 0;
981 
982 	MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
983 
984 	file = fopen(path, "r");
985 	if (!file)
986 		return -errno;
987 
988 	while (getline(&line, &len, file) != -1) {
989 		/* Extract the PCI address from the PCI_SLOT_NAME= line. */
990 		if (sscanf(line,
991 			   "PCI_SLOT_NAME="
992 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
993 			   &pci_addr->domain,
994 			   &pci_addr->bus,
995 			   &pci_addr->devid,
996 			   &pci_addr->function) == 4) {
997 			break;
998 		}
999 	}
1000 
1001 	free(line);
1002 	fclose(file);
1003 	return 0;
1004 }
1005 
1006 /*
1007  * Interrupt handler for IB async events; detects when the device is being removed.
1008  */
1009 static void
1010 mana_intr_handler(void *arg)
1011 {
1012 	struct mana_priv *priv = arg;
1013 	struct ibv_context *ctx = priv->ib_ctx;
1014 	struct ibv_async_event event;
1015 
1016 	/* Read and ack all messages from IB device */
1017 	while (true) {
1018 		if (ibv_get_async_event(ctx, &event))
1019 			break;
1020 
1021 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
1022 			struct rte_eth_dev *dev;
1023 
1024 			dev = &rte_eth_devices[priv->port_id];
1025 			if (dev->data->dev_conf.intr_conf.rmv)
1026 				rte_eth_dev_callback_process(dev,
1027 					RTE_ETH_EVENT_INTR_RMV, NULL);
1028 		}
1029 
1030 		ibv_ack_async_event(&event);
1031 	}
1032 }
1033 
1034 static int
1035 mana_intr_uninstall(struct mana_priv *priv)
1036 {
1037 	int ret;
1038 
1039 	ret = rte_intr_callback_unregister(priv->intr_handle,
1040 					   mana_intr_handler, priv);
1041 	if (ret <= 0) {
1042 		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
1043 		return ret;
1044 	}
1045 
1046 	rte_intr_instance_free(priv->intr_handle);
1047 
1048 	return 0;
1049 }
1050 
1051 int
1052 mana_fd_set_non_blocking(int fd)
1053 {
1054 	int ret = fcntl(fd, F_GETFL);
1055 
1056 	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
1057 		return 0;
1058 
1059 	rte_errno = errno;
1060 	return -rte_errno;
1061 }
1062 
1063 static int
1064 mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
1065 {
1066 	int ret;
1067 	struct ibv_context *ctx = priv->ib_ctx;
1068 
1069 	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1070 	if (!priv->intr_handle) {
1071 		DRV_LOG(ERR, "Failed to allocate intr_handle");
1072 		rte_errno = ENOMEM;
1073 		return -ENOMEM;
1074 	}
1075 
1076 	ret = rte_intr_fd_set(priv->intr_handle, -1);
1077 	if (ret)
1078 		goto free_intr;
1079 
1080 	ret = mana_fd_set_non_blocking(ctx->async_fd);
1081 	if (ret) {
1082 		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
1083 		goto free_intr;
1084 	}
1085 
1086 	ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
1087 	if (ret)
1088 		goto free_intr;
1089 
1090 	ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
1091 	if (ret)
1092 		goto free_intr;
1093 
1094 	ret = rte_intr_callback_register(priv->intr_handle,
1095 					 mana_intr_handler, priv);
1096 	if (ret) {
1097 		DRV_LOG(ERR, "Failed to register intr callback");
1098 		rte_intr_fd_set(priv->intr_handle, -1);
1099 		goto free_intr;
1100 	}
1101 
1102 	eth_dev->intr_handle = priv->intr_handle;
1103 	return 0;
1104 
1105 free_intr:
1106 	rte_intr_instance_free(priv->intr_handle);
1107 	priv->intr_handle = NULL;
1108 
1109 	return ret;
1110 }
1111 
1112 static int
1113 mana_proc_priv_init(struct rte_eth_dev *dev)
1114 {
1115 	struct mana_process_priv *priv;
1116 
1117 	priv = rte_zmalloc_socket("mana_proc_priv",
1118 				  sizeof(struct mana_process_priv),
1119 				  RTE_CACHE_LINE_SIZE,
1120 				  dev->device->numa_node);
1121 	if (!priv)
1122 		return -ENOMEM;
1123 
1124 	dev->process_private = priv;
1125 	return 0;
1126 }
1127 
1128 /*
1129  * Map the doorbell page for the secondary process through IB device handle.
1130  */
1131 static int
1132 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
1133 {
1134 	struct mana_process_priv *priv = eth_dev->process_private;
1135 
1136 	void *addr;
1137 
1138 	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
1139 	if (addr == MAP_FAILED) {
1140 		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
1141 			eth_dev->data->port_id);
1142 		return -ENOMEM;
1143 	}
1144 
1145 	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
1146 
1147 	priv->db_page = addr;
1148 
1149 	return 0;
1150 }
1151 
1152 /* Initialize shared data for the driver (all devices) */
1153 static int
1154 mana_init_shared_data(void)
1155 {
1156 	int ret = 0;
1157 	const struct rte_memzone *secondary_mz;
1158 
1159 	rte_spinlock_lock(&mana_shared_data_lock);
1160 
1161 	/* Skip if shared data is already initialized */
1162 	if (mana_shared_data)
1163 		goto exit;
1164 
1165 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1166 		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
1167 						     sizeof(*mana_shared_data),
1168 						     SOCKET_ID_ANY, 0);
1169 		if (!mana_shared_mz) {
1170 			DRV_LOG(ERR, "Cannot allocate mana shared data");
1171 			ret = -rte_errno;
1172 			goto exit;
1173 		}
1174 
1175 		mana_shared_data = mana_shared_mz->addr;
1176 		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
1177 		rte_spinlock_init(&mana_shared_data->lock);
1178 	} else {
1179 		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
1180 		if (!secondary_mz) {
1181 			DRV_LOG(ERR, "Cannot attach mana shared data");
1182 			ret = -rte_errno;
1183 			goto exit;
1184 		}
1185 
1186 		mana_shared_data = secondary_mz->addr;
1187 		memset(&mana_local_data, 0, sizeof(mana_local_data));
1188 	}
1189 
1190 exit:
1191 	rte_spinlock_unlock(&mana_shared_data_lock);
1192 
1193 	return ret;
1194 }
1195 
1196 /*
1197  * Init the data structures for use in primary and secondary processes.
1198  */
1199 static int
1200 mana_init_once(void)
1201 {
1202 	int ret;
1203 
1204 	ret = mana_init_shared_data();
1205 	if (ret)
1206 		return ret;
1207 
1208 	rte_spinlock_lock(&mana_shared_data->lock);
1209 
1210 	switch (rte_eal_process_type()) {
1211 	case RTE_PROC_PRIMARY:
1212 		if (mana_shared_data->init_done)
1213 			break;
1214 
1215 		ret = mana_mp_init_primary();
1216 		if (ret)
1217 			break;
1218 		DRV_LOG(DEBUG, "MP INIT PRIMARY");
1219 
1220 		mana_shared_data->init_done = 1;
1221 		break;
1222 
1223 	case RTE_PROC_SECONDARY:
1224 
1225 		if (mana_local_data.init_done)
1226 			break;
1227 
1228 		ret = mana_mp_init_secondary();
1229 		if (ret)
1230 			break;
1231 
1232 		DRV_LOG(DEBUG, "MP INIT SECONDARY");
1233 
1234 		mana_local_data.init_done = 1;
1235 		break;
1236 
1237 	default:
1238 		/* Impossible, internal error */
1239 		ret = -EPROTO;
1240 		break;
1241 	}
1242 
1243 	rte_spinlock_unlock(&mana_shared_data->lock);
1244 
1245 	return ret;
1246 }
1247 
1248 /*
1249  * Probe an IB port
1250  * Return value:
1251  * 0: port successfully probed
1253  * negative value: error code
1254  */
1255 static int
1256 mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
1257 		uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
1258 {
1259 	struct mana_priv *priv = NULL;
1260 	struct rte_eth_dev *eth_dev = NULL;
1261 	struct ibv_parent_domain_init_attr attr = {0};
1262 	char address[64];
1263 	char name[RTE_ETH_NAME_MAX_LEN];
1264 	int ret;
1265 	struct ibv_context *ctx = NULL;
1266 
1267 	rte_ether_format_addr(address, sizeof(address), addr);
1268 	DRV_LOG(INFO, "device located port %u address %s", port, address);
1269 
1270 	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
1271 				  SOCKET_ID_ANY);
1272 	if (!priv)
1273 		return -ENOMEM;
1274 
1275 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
1276 
1277 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1278 		int fd;
1279 
1280 		eth_dev = rte_eth_dev_attach_secondary(name);
1281 		if (!eth_dev) {
1282 			DRV_LOG(ERR, "Can't attach to dev %s", name);
1283 			ret = -ENOMEM;
1284 			goto failed;
1285 		}
1286 
1287 		eth_dev->device = &pci_dev->device;
1288 		eth_dev->dev_ops = &mana_dev_secondary_ops;
1289 		ret = mana_proc_priv_init(eth_dev);
1290 		if (ret)
1291 			goto failed;
1292 		priv->process_priv = eth_dev->process_private;
1293 
1294 		/* Get the IB FD from the primary process */
1295 		fd = mana_mp_req_verbs_cmd_fd(eth_dev);
1296 		if (fd < 0) {
1297 			DRV_LOG(ERR, "Failed to get FD %d", fd);
1298 			ret = -ENODEV;
1299 			goto failed;
1300 		}
1301 
1302 		ret = mana_map_doorbell_secondary(eth_dev, fd);
1303 		if (ret) {
1304 			DRV_LOG(ERR, "Failed secondary map %d", fd);
1305 			goto failed;
1306 		}
1307 
1308 		/* fd is not used after mapping the doorbell */
1309 		close(fd);
1310 
1311 		eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1312 		eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1313 
1314 		rte_spinlock_lock(&mana_shared_data->lock);
1315 		mana_shared_data->secondary_cnt++;
1316 		mana_local_data.secondary_cnt++;
1317 		rte_spinlock_unlock(&mana_shared_data->lock);
1318 
1319 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1320 		rte_eth_dev_probing_finish(eth_dev);
1321 
1322 		return 0;
1323 	}
1324 
1325 	ctx = ibv_open_device(ibdev);
1326 	if (!ctx) {
1327 		DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
1328 		ret = -ENODEV;
1329 		goto failed;
1330 	}
1331 
1332 	eth_dev = rte_eth_dev_allocate(name);
1333 	if (!eth_dev) {
1334 		ret = -ENOMEM;
1335 		goto failed;
1336 	}
1337 
1338 	eth_dev->data->mac_addrs =
1339 		rte_calloc("mana_mac", 1,
1340 			   sizeof(struct rte_ether_addr), 0);
1341 	if (!eth_dev->data->mac_addrs) {
1342 		ret = -ENOMEM;
1343 		goto failed;
1344 	}
1345 
1346 	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
1347 
1348 	priv->ib_pd = ibv_alloc_pd(ctx);
1349 	if (!priv->ib_pd) {
1350 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
1351 		ret = -ENOMEM;
1352 		goto failed;
1353 	}
1354 
1355 	/* Create a parent domain with the port number */
1356 	attr.pd = priv->ib_pd;
1357 	attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
1358 	attr.pd_context = (void *)(uintptr_t)port;
1359 	priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
1360 	if (!priv->ib_parent_pd) {
1361 		DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
1362 		ret = -ENOMEM;
1363 		goto failed;
1364 	}
1365 
1366 	priv->ib_ctx = ctx;
1367 	priv->port_id = eth_dev->data->port_id;
1368 	priv->dev_port = port;
1369 	eth_dev->data->dev_private = priv;
1370 	priv->dev_data = eth_dev->data;
1371 
1372 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
1373 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
1374 
1375 	priv->max_rx_desc =
1376 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1377 			dev_attr->orig_attr.max_cqe);
1378 	priv->max_tx_desc =
1379 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1380 			dev_attr->orig_attr.max_cqe);
1381 
1382 	priv->max_send_sge = dev_attr->orig_attr.max_sge;
1383 	priv->max_recv_sge = dev_attr->orig_attr.max_sge;
1384 
1385 	priv->max_mr = dev_attr->orig_attr.max_mr;
1386 	priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
1387 
1388 	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
1389 		name, priv->max_rx_queues, priv->max_rx_desc,
1390 		priv->max_send_sge);
1391 
1392 	rte_eth_copy_pci_info(eth_dev, pci_dev);
1393 
1394 	/* Create async interrupt handler */
1395 	ret = mana_intr_install(eth_dev, priv);
1396 	if (ret) {
1397 		DRV_LOG(ERR, "Failed to install intr handler");
1398 		goto failed;
1399 	}
1400 
1401 	rte_spinlock_lock(&mana_shared_data->lock);
1402 	mana_shared_data->primary_cnt++;
1403 	rte_spinlock_unlock(&mana_shared_data->lock);
1404 
1405 	eth_dev->device = &pci_dev->device;
1406 
1407 	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
1408 
1409 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1410 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1411 	eth_dev->dev_ops = &mana_dev_ops;
1412 
1413 	rte_eth_dev_probing_finish(eth_dev);
1414 
1415 	return 0;
1416 
1417 failed:
1418 	/* Free the resources allocated for the failed port */
1419 	if (priv) {
1420 		if (priv->ib_parent_pd)
1421 			ibv_dealloc_pd(priv->ib_parent_pd);
1422 
1423 		if (priv->ib_pd)
1424 			ibv_dealloc_pd(priv->ib_pd);
1425 	}
1426 
1427 	if (eth_dev)
1428 		rte_eth_dev_release_port(eth_dev);
1429 
1430 	rte_free(priv);
1431 
1432 	if (ctx)
1433 		ibv_close_device(ctx);
1434 
1435 	return ret;
1436 }
1437 
1438 /*
1439  * Go through the IB device list looking for IB ports whose MAC matches
1440  * mac_addr (all ports are probed when mac_addr is NULL), and create a
1441  * rte_eth_dev for each matching port.
1441  * Return value: number of successfully probed devices
1442  */
1443 static int
1444 mana_pci_probe_mac(struct rte_pci_device *pci_dev,
1445 		   struct rte_ether_addr *mac_addr)
1446 {
1447 	struct ibv_device **ibv_list;
1448 	int ibv_idx;
1449 	struct ibv_context *ctx;
1450 	int num_devices;
1451 	int ret;
1452 	uint8_t port;
1453 	int count = 0;
1454 
1455 	ibv_list = ibv_get_device_list(&num_devices);
1456 	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
1457 		struct ibv_device *ibdev = ibv_list[ibv_idx];
1458 		struct rte_pci_addr pci_addr;
1459 		struct ibv_device_attr_ex dev_attr;
1460 
1461 		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
1462 			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
1463 
1464 		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
1465 			continue;
1466 
1467 		/* Ignore if this IB device is not this PCI device */
1468 		if (rte_pci_addr_cmp(&pci_dev->addr, &pci_addr) != 0)
1469 			continue;
1470 
1471 		ctx = ibv_open_device(ibdev);
1472 		if (!ctx) {
1473 			DRV_LOG(ERR, "Failed to open IB device %s",
1474 				ibdev->name);
1475 			continue;
1476 		}
1477 		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
1478 		ibv_close_device(ctx);
1479 
1480 		if (ret) {
1481 			DRV_LOG(ERR, "Failed to query IB device %s",
1482 				ibdev->name);
1483 			continue;
1484 		}
1485 
1486 		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
1487 		     port++) {
1488 			struct rte_ether_addr addr;
1489 			ret = get_port_mac(ibdev, port, &addr);
1490 			if (ret)
1491 				continue;
1492 
1493 			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
1494 				continue;
1495 
1496 			ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
1497 			if (ret) {
1498 				DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
1499 			} else {
1500 				count++;
1501 				DRV_LOG(INFO, "Successfully probed on IB port %u", port);
1502 			}
1503 		}
1504 	}
1505 
1506 	ibv_free_device_list(ibv_list);
1507 	return count;
1508 }
1509 
1510 /*
1511  * Main callback function from PCI bus to probe a device.
1512  */
1513 static int
1514 mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1515 	       struct rte_pci_device *pci_dev)
1516 {
1517 	struct rte_devargs *args = pci_dev->device.devargs;
1518 	struct mana_conf conf = {0};
1519 	unsigned int i;
1520 	int ret;
1521 	int count = 0;
1522 
1523 	if (args && args->drv_str) {
1524 		ret = mana_parse_args(args, &conf);
1525 		if (ret) {
1526 			DRV_LOG(ERR, "Failed to parse parameters args = %s",
1527 				args->drv_str);
1528 			return ret;
1529 		}
1530 	}
1531 
1532 	ret = mana_init_once();
1533 	if (ret) {
1534 		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
1535 		return ret;
1536 	}
1537 
1538 	/* Probe ports matching the given MAC addresses, or all ports if none given */
1539 	if (conf.index) {
1540 		for (i = 0; i < conf.index; i++)
1541 			count += mana_pci_probe_mac(pci_dev,
1542 						    &conf.mac_array[i]);
1543 	} else {
1544 		count = mana_pci_probe_mac(pci_dev, NULL);
1545 	}
1546 
1547 	if (!count) {
1548 		rte_memzone_free(mana_shared_mz);
1549 		mana_shared_mz = NULL;
1550 		ret = -ENODEV;
1551 	}
1552 
1553 	return ret;
1554 }
1555 
1556 static int
1557 mana_dev_uninit(struct rte_eth_dev *dev)
1558 {
1559 	return mana_dev_close(dev);
1560 }
1561 
1562 /*
1563  * Callback from PCI to remove this device.
1564  */
1565 static int
1566 mana_pci_remove(struct rte_pci_device *pci_dev)
1567 {
1568 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1569 		rte_spinlock_lock(&mana_shared_data_lock);
1570 
1571 		rte_spinlock_lock(&mana_shared_data->lock);
1572 
1573 		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
1574 		mana_shared_data->primary_cnt--;
1575 		if (!mana_shared_data->primary_cnt) {
1576 			DRV_LOG(DEBUG, "mp uninit primary");
1577 			mana_mp_uninit_primary();
1578 		}
1579 
1580 		rte_spinlock_unlock(&mana_shared_data->lock);
1581 
1582 		/* Also free the shared memory if this is the last */
1583 		if (!mana_shared_data->primary_cnt) {
1584 			DRV_LOG(DEBUG, "free shared memzone data");
1585 			rte_memzone_free(mana_shared_mz);
1586 			mana_shared_mz = NULL;
1587 		}
1588 
1589 		rte_spinlock_unlock(&mana_shared_data_lock);
1590 	} else {
1591 		rte_spinlock_lock(&mana_shared_data_lock);
1592 
1593 		rte_spinlock_lock(&mana_shared_data->lock);
1594 		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
1595 		mana_shared_data->secondary_cnt--;
1596 		rte_spinlock_unlock(&mana_shared_data->lock);
1597 
1598 		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
1599 		mana_local_data.secondary_cnt--;
1600 		if (!mana_local_data.secondary_cnt) {
1601 			DRV_LOG(DEBUG, "mp uninit secondary");
1602 			mana_mp_uninit_secondary();
1603 		}
1604 
1605 		rte_spinlock_unlock(&mana_shared_data_lock);
1606 	}
1607 
1608 	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
1609 }
1610 
1611 static const struct rte_pci_id mana_pci_id_map[] = {
1612 	{
1613 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
1614 			       PCI_DEVICE_ID_MICROSOFT_MANA)
1615 	},
1616 	{
1617 		.vendor_id = 0
1618 	},
1619 };
1620 
1621 static struct rte_pci_driver mana_pci_driver = {
1622 	.id_table = mana_pci_id_map,
1623 	.probe = mana_pci_probe,
1624 	.remove = mana_pci_remove,
1625 	.drv_flags = RTE_PCI_DRV_INTR_RMV,
1626 };
1627 
1628 RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
1629 RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
1630 RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
1631 RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
1632 RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
1633 RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
1634