1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2022 Microsoft Corporation
3  */
4 
5 #include <unistd.h>
6 #include <dirent.h>
7 #include <fcntl.h>
8 #include <sys/mman.h>
9 
10 #include <ethdev_driver.h>
11 #include <ethdev_pci.h>
12 #include <rte_kvargs.h>
13 #include <rte_eal_paging.h>
14 
15 #include <infiniband/verbs.h>
16 #include <infiniband/manadv.h>
17 
18 #include <assert.h>
19 
20 #include "mana.h"
21 
22 /* Shared memory between primary/secondary processes, per driver */
23 /* Data to track primary/secondary usage */
24 struct mana_shared_data *mana_shared_data;
25 static struct mana_shared_data mana_local_data;
26 
27 /* The memory region for the above data */
28 static const struct rte_memzone *mana_shared_mz;
29 static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
30 
31 /* Spinlock for mana_shared_data */
32 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
33 
34 /* Allocate a buffer on the stack and fill it with the printf-formatted string. */
35 #define MANA_MKSTR(name, ...) \
36 	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
37 	char name[mkstr_size_##name + 1]; \
38 	\
39 	memset(name, 0, mkstr_size_##name + 1); \
40 	snprintf(name, sizeof(name), "" __VA_ARGS__)
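
/*
 * Usage sketch, mirroring how the driver uses the macro below in
 * get_port_mac():
 *
 *	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
 *	dir = opendir(path);
 *
 * The macro expands to declarations (a VLA sized by a first snprintf()
 * pass), so it may only appear where declarations are allowed.
 */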
41 
42 int mana_logtype_driver;
43 int mana_logtype_init;
44 
45 /*
46  * Callback from rdma-core to allocate a buffer for a queue.
47  */
48 void *
49 mana_alloc_verbs_buf(size_t size, void *data)
50 {
51 	void *ret;
52 	size_t alignment = rte_mem_page_size();
53 	int socket = (int)(uintptr_t)data;
54 
55 	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
56 
57 	if (alignment == (size_t)-1) {
58 		DRV_LOG(ERR, "Failed to get mem page size");
59 		rte_errno = ENOMEM;
60 		return NULL;
61 	}
62 
63 	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
64 	if (!ret && size)
65 		rte_errno = ENOMEM;
66 	return ret;
67 }
68 
69 void
70 mana_free_verbs_buf(void *ptr, void *data __rte_unused)
71 {
72 	rte_free(ptr);
73 }
74 
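/*
 * Ethdev configure callback: MANA requires an equal, power-of-2 number of
 * RX and TX queues, and routes rdma-core internal queue allocations
 * through the DPDK allocators registered above.
 */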
75 static int
76 mana_dev_configure(struct rte_eth_dev *dev)
77 {
78 	struct mana_priv *priv = dev->data->dev_private;
79 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
80 
81 	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
82 		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
83 
84 	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
85 		DRV_LOG(ERR, "Only equal numbers of RX and TX queues are supported");
86 		return -EINVAL;
87 	}
88 
89 	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
90 		DRV_LOG(ERR, "number of TX/RX queues must be a power of 2");
91 		return -EINVAL;
92 	}
93 
94 	priv->num_queues = dev->data->nb_rx_queues;
95 
96 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
97 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
98 					.alloc = &mana_alloc_verbs_buf,
99 					.free = &mana_free_verbs_buf,
100 					.data = 0,
101 				}));
102 
103 	return 0;
104 }
105 
106 static void
107 rx_intr_vec_disable(struct mana_priv *priv)
108 {
109 	struct rte_intr_handle *intr_handle = priv->intr_handle;
110 
111 	rte_intr_free_epoll_fd(intr_handle);
112 	rte_intr_vec_list_free(intr_handle);
113 	rte_intr_nb_efd_set(intr_handle, 0);
114 }
115 
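/*
 * Build the RX interrupt vector list, mapping each RX queue's event
 * channel FD to an interrupt vector so the queues can be armed for
 * epoll-based wakeups.
 */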
116 static int
117 rx_intr_vec_enable(struct mana_priv *priv)
118 {
119 	unsigned int i;
120 	unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
121 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
122 	struct rte_intr_handle *intr_handle = priv->intr_handle;
123 	int ret;
124 
125 	rx_intr_vec_disable(priv);
126 
127 	if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
128 		DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
129 		return -ENOMEM;
130 	}
131 
132 	for (i = 0; i < n; i++) {
133 		struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
134 
135 		ret = rte_intr_vec_list_index_set(intr_handle, i,
136 						  RTE_INTR_VEC_RXTX_OFFSET + i);
137 		if (ret) {
138 			DRV_LOG(ERR, "Failed to set intr vec %u", i);
139 			return ret;
140 		}
141 
142 		ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
143 		if (ret) {
144 			DRV_LOG(ERR, "Failed to set FD at intr %u", i);
145 			return ret;
146 		}
147 	}
148 
149 	return rte_intr_nb_efd_set(intr_handle, n);
150 }
151 
152 static void
153 rxq_intr_disable(struct mana_priv *priv)
154 {
155 	int err = rte_errno;
156 
157 	rx_intr_vec_disable(priv);
158 	rte_errno = err;
159 }
160 
161 static int
162 rxq_intr_enable(struct mana_priv *priv)
163 {
164 	const struct rte_eth_intr_conf *const intr_conf =
165 		&priv->dev_data->dev_conf.intr_conf;
166 
167 	if (!intr_conf->rxq)
168 		return 0;
169 
170 	return rx_intr_vec_enable(priv);
171 }
172 
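/*
 * Bring the port up: create the per-device MR btree cache, start the
 * TX/RX queues, switch in the real burst functions (also on secondary
 * processes) and, if configured, enable RX queue interrupts.
 */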
173 static int
174 mana_dev_start(struct rte_eth_dev *dev)
175 {
176 	int ret;
177 	struct mana_priv *priv = dev->data->dev_private;
178 
179 	rte_spinlock_init(&priv->mr_btree_lock);
180 	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
181 				 dev->device->numa_node);
182 	if (ret) {
183 		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
184 		return ret;
185 	}
186 
187 	ret = mana_start_tx_queues(dev);
188 	if (ret) {
189 		DRV_LOG(ERR, "failed to start tx queues %d", ret);
190 		goto failed_tx;
191 	}
192 
193 	ret = mana_start_rx_queues(dev);
194 	if (ret) {
195 		DRV_LOG(ERR, "failed to start rx queues %d", ret);
196 		goto failed_rx;
197 	}
198 
199 	rte_wmb();
200 
201 	dev->tx_pkt_burst = mana_tx_burst;
202 	dev->rx_pkt_burst = mana_rx_burst;
203 
204 	DRV_LOG(INFO, "TX/RX queues have started");
205 
206 	/* Enable datapath for secondary processes */
207 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
208 
209 	ret = rxq_intr_enable(priv);
210 	if (ret) {
211 		DRV_LOG(ERR, "Failed to enable RX interrupts");
212 		goto failed_intr;
213 	}
214 
215 	return 0;
216 
217 failed_intr:
218 	mana_stop_rx_queues(dev);
219 
220 failed_rx:
221 	mana_stop_tx_queues(dev);
222 
223 failed_tx:
224 	mana_mr_btree_free(&priv->mr_btree);
225 
226 	return ret;
227 }
228 
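/*
 * Bring the port down, roughly reversing mana_dev_start(): disable RX
 * interrupts, swap in the "removed" burst functions (also on secondary
 * processes), then tear down the TX and RX queues.
 */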
229 static int
230 mana_dev_stop(struct rte_eth_dev *dev)
231 {
232 	int ret;
233 	struct mana_priv *priv = dev->data->dev_private;
234 
235 	rxq_intr_disable(priv);
236 
237 	dev->tx_pkt_burst = mana_tx_burst_removed;
238 	dev->rx_pkt_burst = mana_rx_burst_removed;
239 
240 	/* Stop datapath on secondary processes */
241 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
242 
243 	rte_wmb();
244 
245 	ret = mana_stop_tx_queues(dev);
246 	if (ret) {
247 		DRV_LOG(ERR, "failed to stop tx queues");
248 		return ret;
249 	}
250 
251 	ret = mana_stop_rx_queues(dev);
252 	if (ret) {
253 		DRV_LOG(ERR, "failed to stop rx queues");
254 		return ret;
255 	}
256 
257 	return 0;
258 }
259 
260 static int mana_intr_uninstall(struct mana_priv *priv);
261 
262 static int
263 mana_dev_close(struct rte_eth_dev *dev)
264 {
265 	struct mana_priv *priv = dev->data->dev_private;
266 	int ret;
267 
268 	mana_remove_all_mr(priv);
269 
270 	ret = mana_intr_uninstall(priv);
271 	if (ret)
272 		return ret;
273 
274 	ret = ibv_close_device(priv->ib_ctx);
275 	if (ret) {
276 		ret = -errno;
277 		return ret;
278 	}
279 
280 	return 0;
281 }
282 
283 static int
284 mana_dev_info_get(struct rte_eth_dev *dev,
285 		  struct rte_eth_dev_info *dev_info)
286 {
287 	struct mana_priv *priv = dev->data->dev_private;
288 
289 	dev_info->max_mtu = RTE_ETHER_MTU;
290 
291 	/* RX params */
292 	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
293 	dev_info->max_rx_pktlen = MAX_FRAME_SIZE;
294 
295 	dev_info->max_rx_queues = priv->max_rx_queues;
296 	dev_info->max_tx_queues = priv->max_tx_queues;
297 
298 	dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
299 	dev_info->max_hash_mac_addrs = 0;
300 
301 	dev_info->max_vfs = 1;
302 
303 	/* Offload params */
304 	dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
305 
306 	dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
307 
308 	/* RSS */
309 	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
310 	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
311 	dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
312 
313 	/* Thresholds */
314 	dev_info->default_rxconf = (struct rte_eth_rxconf){
315 		.rx_thresh = {
316 			.pthresh = 8,
317 			.hthresh = 8,
318 			.wthresh = 0,
319 		},
320 		.rx_free_thresh = 32,
321 		/* If no descriptors are available, pkts are dropped by default */
322 		.rx_drop_en = 1,
323 	};
324 
325 	dev_info->default_txconf = (struct rte_eth_txconf){
326 		.tx_thresh = {
327 			.pthresh = 32,
328 			.hthresh = 0,
329 			.wthresh = 0,
330 		},
331 		.tx_rs_thresh = 32,
332 		.tx_free_thresh = 32,
333 	};
334 
335 	/* Buffer limits */
336 	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
337 	dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
338 	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
339 	dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
340 	dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
341 
342 	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
343 	dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
344 	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
345 	dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
346 	dev_info->tx_desc_lim.nb_mtu_seg_max = priv->max_send_sge;
347 
348 	/* Speed */
349 	dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
350 
351 	/* RX params */
352 	dev_info->default_rxportconf.burst_size = 1;
353 	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
354 	dev_info->default_rxportconf.nb_queues = 1;
355 
356 	/* TX params */
357 	dev_info->default_txportconf.burst_size = 1;
358 	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
359 	dev_info->default_txportconf.nb_queues = 1;
360 
361 	return 0;
362 }
363 
364 static void
365 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
366 		       struct rte_eth_txq_info *qinfo)
367 {
368 	struct mana_txq *txq = dev->data->tx_queues[queue_id];
369 
370 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
371 	qinfo->nb_desc = txq->num_desc;
372 }
373 
374 static void
375 mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
376 		       struct rte_eth_rxq_info *qinfo)
377 {
378 	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
379 
380 	qinfo->mp = rxq->mp;
381 	qinfo->nb_desc = rxq->num_desc;
382 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
383 }
384 
385 static const uint32_t *
386 mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
387 {
388 	static const uint32_t ptypes[] = {
389 		RTE_PTYPE_L2_ETHER,
390 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
391 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
392 		RTE_PTYPE_L4_FRAG,
393 		RTE_PTYPE_L4_TCP,
394 		RTE_PTYPE_L4_UDP,
395 		RTE_PTYPE_UNKNOWN
396 	};
397 
398 	return ptypes;
399 }
400 
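/*
 * RSS hash update is only accepted while the port is stopped; a key, if
 * given, must be exactly TOEPLITZ_HASH_KEY_SIZE_IN_BYTES long.
 */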
401 static int
402 mana_rss_hash_update(struct rte_eth_dev *dev,
403 		     struct rte_eth_rss_conf *rss_conf)
404 {
405 	struct mana_priv *priv = dev->data->dev_private;
406 
407 	/* Currently can only update RSS hash when device is stopped */
408 	if (dev->data->dev_started) {
409 		DRV_LOG(ERR, "Can't update RSS after device has started");
410 		return -ENODEV;
411 	}
412 
413 	if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
414 		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
415 			dev->data->port_id, rss_conf->rss_hf);
416 		return -EINVAL;
417 	}
418 
419 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
420 		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
421 			DRV_LOG(ERR, "Port %u key len must be %u long",
422 				dev->data->port_id,
423 				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
424 			return -EINVAL;
425 		}
426 
427 		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
		/* Free any previously installed key to avoid leaking it */
		rte_free(priv->rss_conf.rss_key);
428 		priv->rss_conf.rss_key =
429 			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
430 				    RTE_CACHE_LINE_SIZE);
431 		if (!priv->rss_conf.rss_key)
432 			return -ENOMEM;
433 		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
434 		       rss_conf->rss_key_len);
435 	}
436 	priv->rss_conf.rss_hf = rss_conf->rss_hf;
437 
438 	return 0;
439 }
440 
441 static int
442 mana_rss_hash_conf_get(struct rte_eth_dev *dev,
443 		       struct rte_eth_rss_conf *rss_conf)
444 {
445 	struct mana_priv *priv = dev->data->dev_private;
446 
447 	if (!rss_conf)
448 		return -EINVAL;
449 
450 	if (rss_conf->rss_key &&
451 	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
452 		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
453 		       priv->rss_conf.rss_key_len);
454 	}
455 
456 	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
457 	rss_conf->rss_hf = priv->rss_conf.rss_hf;
458 
459 	return 0;
460 }
461 
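/*
 * Set up the software state of a TX queue: the shadow descriptor ring,
 * the GDMA completion buffer and the per-queue MR btree. The hardware
 * queue itself is created when the port is started (mana_start_tx_queues()).
 */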
462 static int
463 mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
464 			uint16_t nb_desc, unsigned int socket_id,
465 			const struct rte_eth_txconf *tx_conf __rte_unused)
467 {
468 	struct mana_priv *priv = dev->data->dev_private;
469 	struct mana_txq *txq;
470 	int ret;
471 
472 	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
473 	if (!txq) {
474 		DRV_LOG(ERR, "failed to allocate txq");
475 		return -ENOMEM;
476 	}
477 
478 	txq->socket = socket_id;
479 
480 	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
481 					   sizeof(struct mana_txq_desc) *
482 						nb_desc,
483 					   RTE_CACHE_LINE_SIZE, socket_id);
484 	if (!txq->desc_ring) {
485 		DRV_LOG(ERR, "failed to allocate txq desc_ring");
486 		ret = -ENOMEM;
487 		goto fail;
488 	}
489 
490 	txq->gdma_comp_buf = rte_malloc_socket("mana_txq_comp",
491 			sizeof(*txq->gdma_comp_buf) * nb_desc,
492 			RTE_CACHE_LINE_SIZE, socket_id);
493 	if (!txq->gdma_comp_buf) {
494 		DRV_LOG(ERR, "failed to allocate txq comp");
495 		ret = -ENOMEM;
496 		goto fail;
497 	}
498 
499 	ret = mana_mr_btree_init(&txq->mr_btree,
500 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
501 	if (ret) {
502 		DRV_LOG(ERR, "Failed to init TXQ MR btree");
503 		goto fail;
504 	}
505 
506 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
507 		queue_idx, nb_desc, socket_id, txq->desc_ring);
508 
509 	txq->desc_ring_head = 0;
510 	txq->desc_ring_tail = 0;
511 	txq->priv = priv;
512 	txq->num_desc = nb_desc;
513 	dev->data->tx_queues[queue_idx] = txq;
514 
515 	return 0;
516 
517 fail:
518 	rte_free(txq->gdma_comp_buf);
519 	rte_free(txq->desc_ring);
520 	rte_free(txq);
521 	return ret;
522 }
523 
524 static void
525 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
526 {
527 	struct mana_txq *txq = dev->data->tx_queues[qid];
528 
529 	mana_mr_btree_free(&txq->mr_btree);
530 
531 	rte_free(txq->gdma_comp_buf);
532 	rte_free(txq->desc_ring);
533 	rte_free(txq);
534 }
535 
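/*
 * RX counterpart of the TX queue setup above; the mempool supplies the
 * receive buffers once the queue is started.
 */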
536 static int
537 mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
538 			uint16_t nb_desc, unsigned int socket_id,
539 			const struct rte_eth_rxconf *rx_conf __rte_unused,
540 			struct rte_mempool *mp)
541 {
542 	struct mana_priv *priv = dev->data->dev_private;
543 	struct mana_rxq *rxq;
544 	int ret;
545 
546 	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
547 	if (!rxq) {
548 		DRV_LOG(ERR, "failed to allocate rxq");
549 		return -ENOMEM;
550 	}
551 
552 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
553 		queue_idx, nb_desc, socket_id);
554 
555 	rxq->socket = socket_id;
556 
557 	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
558 					    sizeof(struct mana_rxq_desc) *
559 						nb_desc,
560 					    RTE_CACHE_LINE_SIZE, socket_id);
562 	if (!rxq->desc_ring) {
563 		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
564 		ret = -ENOMEM;
565 		goto fail;
566 	}
567 
568 	rxq->desc_ring_head = 0;
569 	rxq->desc_ring_tail = 0;
570 
571 	rxq->gdma_comp_buf = rte_malloc_socket("mana_rxq_comp",
572 			sizeof(*rxq->gdma_comp_buf) * nb_desc,
573 			RTE_CACHE_LINE_SIZE, socket_id);
574 	if (!rxq->gdma_comp_buf) {
575 		DRV_LOG(ERR, "failed to allocate rxq comp");
576 		ret = -ENOMEM;
577 		goto fail;
578 	}
579 
580 	ret = mana_mr_btree_init(&rxq->mr_btree,
581 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
582 	if (ret) {
583 		DRV_LOG(ERR, "Failed to init RXQ MR btree");
584 		goto fail;
585 	}
586 
587 	rxq->priv = priv;
588 	rxq->num_desc = nb_desc;
589 	rxq->mp = mp;
590 	dev->data->rx_queues[queue_idx] = rxq;
591 
592 	return 0;
593 
594 fail:
595 	rte_free(rxq->gdma_comp_buf);
596 	rte_free(rxq->desc_ring);
597 	rte_free(rxq);
598 	return ret;
599 }
600 
601 static void
602 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
603 {
604 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
605 
606 	mana_mr_btree_free(&rxq->mr_btree);
607 
608 	rte_free(rxq->gdma_comp_buf);
609 	rte_free(rxq->desc_ring);
610 	rte_free(rxq);
611 }
612 
613 static int
614 mana_dev_link_update(struct rte_eth_dev *dev,
615 		     int wait_to_complete __rte_unused)
616 {
617 	struct rte_eth_link link;
618 
619 	/* MANA has no concept of carrier state, always reporting UP */
620 	link = (struct rte_eth_link) {
621 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
622 		.link_autoneg = RTE_ETH_LINK_FIXED,
623 		.link_speed = RTE_ETH_SPEED_NUM_100G,
624 		.link_status = RTE_ETH_LINK_UP,
625 	};
626 
627 	return rte_eth_linkstatus_set(dev, &link);
628 }
629 
630 static int
631 mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
632 {
633 	unsigned int i;
634 
635 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
636 		struct mana_txq *txq = dev->data->tx_queues[i];
637 
638 		if (!txq)
639 			continue;
640 
641 		stats->opackets += txq->stats.packets;
642 		stats->obytes += txq->stats.bytes;
643 		stats->oerrors += txq->stats.errors;
644 
645 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
646 			stats->q_opackets[i] = txq->stats.packets;
647 			stats->q_obytes[i] = txq->stats.bytes;
648 		}
649 	}
650 
651 	stats->rx_nombuf = 0;
652 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
653 		struct mana_rxq *rxq = dev->data->rx_queues[i];
654 
655 		if (!rxq)
656 			continue;
657 
658 		stats->ipackets += rxq->stats.packets;
659 		stats->ibytes += rxq->stats.bytes;
660 		stats->ierrors += rxq->stats.errors;
661 
662 		/* There is no good way to get stats->imissed, so it is left unset */
663 
664 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
665 			stats->q_ipackets[i] = rxq->stats.packets;
666 			stats->q_ibytes[i] = rxq->stats.bytes;
667 		}
668 
669 		stats->rx_nombuf += rxq->stats.nombuf;
670 	}
671 
672 	return 0;
673 }
674 
675 static int
676 mana_dev_stats_reset(struct rte_eth_dev *dev)
677 {
678 	unsigned int i;
679 
680 	PMD_INIT_FUNC_TRACE();
681 
682 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
683 		struct mana_txq *txq = dev->data->tx_queues[i];
684 
685 		if (!txq)
686 			continue;
687 
688 		memset(&txq->stats, 0, sizeof(txq->stats));
689 	}
690 
691 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
692 		struct mana_rxq *rxq = dev->data->rx_queues[i];
693 
694 		if (!rxq)
695 			continue;
696 
697 		memset(&rxq->stats, 0, sizeof(rxq->stats));
698 	}
699 
700 	return 0;
701 }
702 
703 static const struct eth_dev_ops mana_dev_ops = {
704 	.dev_configure		= mana_dev_configure,
705 	.dev_start		= mana_dev_start,
706 	.dev_stop		= mana_dev_stop,
707 	.dev_close		= mana_dev_close,
708 	.dev_infos_get		= mana_dev_info_get,
709 	.txq_info_get		= mana_dev_tx_queue_info,
710 	.rxq_info_get		= mana_dev_rx_queue_info,
711 	.dev_supported_ptypes_get = mana_supported_ptypes,
712 	.rss_hash_update	= mana_rss_hash_update,
713 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
714 	.tx_queue_setup		= mana_dev_tx_queue_setup,
715 	.tx_queue_release	= mana_dev_tx_queue_release,
716 	.rx_queue_setup		= mana_dev_rx_queue_setup,
717 	.rx_queue_release	= mana_dev_rx_queue_release,
718 	.rx_queue_intr_enable	= mana_rx_intr_enable,
719 	.rx_queue_intr_disable	= mana_rx_intr_disable,
720 	.link_update		= mana_dev_link_update,
721 	.stats_get		= mana_dev_stats_get,
722 	.stats_reset		= mana_dev_stats_reset,
723 };
724 
725 static const struct eth_dev_ops mana_dev_secondary_ops = {
726 	.stats_get = mana_dev_stats_get,
727 	.stats_reset = mana_dev_stats_reset,
728 	.dev_infos_get = mana_dev_info_get,
729 };
730 
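/*
 * Stub burst functions installed while the datapath is down (secondary
 * attach, stop, removal); they return 0 packets. The memory barrier
 * synchronizes with the burst-function switch-over in start/stop.
 */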
731 uint16_t
732 mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
733 		      struct rte_mbuf **pkts __rte_unused,
734 		      uint16_t pkts_n __rte_unused)
735 {
736 	rte_mb();
737 	return 0;
738 }
739 
740 uint16_t
741 mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
742 		      struct rte_mbuf **pkts __rte_unused,
743 		      uint16_t pkts_n __rte_unused)
744 {
745 	rte_mb();
746 	return 0;
747 }
748 
749 #define ETH_MANA_MAC_ARG "mac"
750 static const char * const mana_init_args[] = {
751 	ETH_MANA_MAC_ARG,
752 	NULL,
753 };
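
/*
 * The "mac" kvarg may be repeated on the EAL command line to probe
 * several ports, e.g. (illustrative PCI address):
 *
 *	-a 7870:00:00.0,mac=12:34:56:78:9a:bc,mac=12:34:56:78:9a:bd
 */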
754 
755 /* Support parsing up to 8 MAC addresses from the EAL command line */
756 #define MAX_NUM_ADDRESS 8
757 struct mana_conf {
758 	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
759 	unsigned int index;
760 };
761 
762 static int
763 mana_arg_parse_callback(const char *key, const char *val, void *private)
764 {
765 	struct mana_conf *conf = (struct mana_conf *)private;
766 	int ret;
767 
768 	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
769 
770 	if (conf->index >= MAX_NUM_ADDRESS) {
771 		DRV_LOG(ERR, "Exceeded max number of MAC addresses");
772 		return 1;
773 	}
774 
775 	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
776 	if (ret) {
777 		DRV_LOG(ERR, "Invalid MAC address %s", val);
778 		return ret;
779 	}
780 
781 	conf->index++;
782 
783 	return 0;
784 }
785 
786 static int
787 mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
788 {
789 	struct rte_kvargs *kvlist;
790 	unsigned int arg_count;
791 	int ret = 0;
792 
793 	kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
794 	if (!kvlist) {
795 		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
796 		return -EINVAL;
797 	}
798 
799 	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
800 	if (arg_count > MAX_NUM_ADDRESS) {
801 		ret = -EINVAL;
802 		goto free_kvlist;
803 	}
804 	ret = rte_kvargs_process(kvlist, mana_init_args[0],
805 				 mana_arg_parse_callback, conf);
806 	if (ret) {
807 		DRV_LOG(ERR, "error parsing args");
808 		goto free_kvlist;
809 	}
810 
811 free_kvlist:
812 	rte_kvargs_free(kvlist);
813 	return ret;
814 }
815 
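/*
 * Find the MAC address of the netdev backing the given IB port by
 * scanning <ibdev_path>/device/net/<netdev>/dev_port in sysfs and
 * reading the matching "address" attribute.
 */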
816 static int
817 get_port_mac(struct ibv_device *device, unsigned int port,
818 	     struct rte_ether_addr *addr)
819 {
820 	FILE *file;
821 	int ret = 0;
822 	DIR *dir;
823 	struct dirent *dent;
824 	unsigned int dev_port;
825 	char mac[20];
826 
827 	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
828 
829 	dir = opendir(path);
830 	if (!dir)
831 		return -ENOENT;
832 
833 	while ((dent = readdir(dir))) {
834 		char *name = dent->d_name;
835 
836 		MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
837 
838 		/* Ignore . and .. */
839 		if ((name[0] == '.') &&
840 		    ((name[1] == '\0') ||
841 		     ((name[1] == '.') && (name[2] == '\0'))))
842 			continue;
843 
844 		file = fopen(port_path, "r");
845 		if (!file)
846 			continue;
847 
848 		ret = fscanf(file, "%u", &dev_port);
849 		fclose(file);
850 
851 		if (ret != 1)
852 			continue;
853 
854 		/* Ethernet ports start at 0, IB ports start at 1 */
855 		if (dev_port == port - 1) {
856 			MANA_MKSTR(address_path, "%s/%s/address", path, name);
857 
858 			file = fopen(address_path, "r");
859 			if (!file)
860 				continue;
861 
862 			ret = fscanf(file, "%19s", mac); /* bound the read to sizeof(mac) - 1 */
863 			fclose(file);
864 
865 			if (ret < 0)
866 				break;
867 
868 			ret = rte_ether_unformat_addr(mac, addr);
869 			if (ret)
870 				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
871 			break;
872 		}
873 	}
874 
875 	closedir(dir);
876 	return ret;
877 }
878 
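/*
 * Derive the PCI address of an IB device from the PCI_SLOT_NAME line of
 * its sysfs uevent file.
 */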
879 static int
880 mana_ibv_device_to_pci_addr(const struct ibv_device *device,
881 			    struct rte_pci_addr *pci_addr)
882 {
883 	FILE *file;
884 	char *line = NULL;
885 	size_t len = 0;
	int ret = -ENOENT; /* no PCI_SLOT_NAME line found yet */
886 
887 	MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
888 
889 	file = fopen(path, "r");
890 	if (!file)
891 		return -errno;
892 
893 	while (getline(&line, &len, file) != -1) {
894 		/* Extract information. */
895 		if (sscanf(line,
896 			   "PCI_SLOT_NAME="
897 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
898 			   &pci_addr->domain,
899 			   &pci_addr->bus,
900 			   &pci_addr->devid,
901 			   &pci_addr->function) == 4) {
			ret = 0;
902 			break;
903 		}
904 	}
905 
906 	free(line);
907 	fclose(file);
	/* Fail if no PCI_SLOT_NAME line was found, so *pci_addr is never
	 * consumed uninitialized by the caller.
	 */
908 	return ret;
909 }
910 
911 /*
912  * Interrupt handler from the IB layer to notify that this device is being removed.
913  */
914 static void
915 mana_intr_handler(void *arg)
916 {
917 	struct mana_priv *priv = arg;
918 	struct ibv_context *ctx = priv->ib_ctx;
919 	struct ibv_async_event event;
920 
921 	/* Read and ack all messages from IB device */
922 	while (true) {
923 		if (ibv_get_async_event(ctx, &event))
924 			break;
925 
926 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
927 			struct rte_eth_dev *dev;
928 
929 			dev = &rte_eth_devices[priv->port_id];
930 			if (dev->data->dev_conf.intr_conf.rmv)
931 				rte_eth_dev_callback_process(dev,
932 					RTE_ETH_EVENT_INTR_RMV, NULL);
933 		}
934 
935 		ibv_ack_async_event(&event);
936 	}
937 }
938 
939 static int
940 mana_intr_uninstall(struct mana_priv *priv)
941 {
942 	int ret;
943 
944 	ret = rte_intr_callback_unregister(priv->intr_handle,
945 					   mana_intr_handler, priv);
946 	if (ret <= 0) {
947 		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
948 		return ret;
949 	}
950 
951 	rte_intr_instance_free(priv->intr_handle);
952 
953 	return 0;
954 }
955 
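/* Put fd in non-blocking mode; returns 0 on success or -rte_errno on failure. */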
956 int
957 mana_fd_set_non_blocking(int fd)
958 {
959 	int ret = fcntl(fd, F_GETFL);
960 
961 	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
962 		return 0;
963 
964 	rte_errno = errno;
965 	return -rte_errno;
966 }
967 
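/*
 * Register the IB async event FD as an external interrupt source so
 * device-removal events are delivered to mana_intr_handler().
 */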
968 static int
969 mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
970 {
971 	int ret;
972 	struct ibv_context *ctx = priv->ib_ctx;
973 
974 	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
975 	if (!priv->intr_handle) {
976 		DRV_LOG(ERR, "Failed to allocate intr_handle");
977 		rte_errno = ENOMEM;
978 		return -ENOMEM;
979 	}
980 
981 	ret = rte_intr_fd_set(priv->intr_handle, -1);
982 	if (ret)
983 		goto free_intr;
984 
985 	ret = mana_fd_set_non_blocking(ctx->async_fd);
986 	if (ret) {
987 		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
988 		goto free_intr;
989 	}
990 
991 	ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
992 	if (ret)
993 		goto free_intr;
994 
995 	ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
996 	if (ret)
997 		goto free_intr;
998 
999 	ret = rte_intr_callback_register(priv->intr_handle,
1000 					 mana_intr_handler, priv);
1001 	if (ret) {
1002 		DRV_LOG(ERR, "Failed to register intr callback");
1003 		rte_intr_fd_set(priv->intr_handle, -1);
1004 		goto free_intr;
1005 	}
1006 
1007 	eth_dev->intr_handle = priv->intr_handle;
1008 	return 0;
1009 
1010 free_intr:
1011 	rte_intr_instance_free(priv->intr_handle);
1012 	priv->intr_handle = NULL;
1013 
1014 	return ret;
1015 }
1016 
1017 static int
1018 mana_proc_priv_init(struct rte_eth_dev *dev)
1019 {
1020 	struct mana_process_priv *priv;
1021 
1022 	priv = rte_zmalloc_socket("mana_proc_priv",
1023 				  sizeof(struct mana_process_priv),
1024 				  RTE_CACHE_LINE_SIZE,
1025 				  dev->device->numa_node);
1026 	if (!priv)
1027 		return -ENOMEM;
1028 
1029 	dev->process_private = priv;
1030 	return 0;
1031 }
1032 
1033 /*
1034  * Map the doorbell page for the secondary process through the IB device handle.
1035  */
1036 static int
1037 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
1038 {
1039 	struct mana_process_priv *priv = eth_dev->process_private;
1040 
1041 	void *addr;
1042 
1043 	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
1044 	if (addr == MAP_FAILED) {
1045 		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
1046 			eth_dev->data->port_id);
1047 		return -ENOMEM;
1048 	}
1049 
1050 	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
1051 
1052 	priv->db_page = addr;
1053 
1054 	return 0;
1055 }
1056 
1057 /* Initialize shared data for the driver (all devices) */
1058 static int
1059 mana_init_shared_data(void)
1060 {
1061 	int ret = 0;
1062 	const struct rte_memzone *secondary_mz;
1063 
1064 	rte_spinlock_lock(&mana_shared_data_lock);
1065 
1066 	/* Skip if shared data is already initialized */
1067 	if (mana_shared_data)
1068 		goto exit;
1069 
1070 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1071 		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
1072 						     sizeof(*mana_shared_data),
1073 						     SOCKET_ID_ANY, 0);
1074 		if (!mana_shared_mz) {
1075 			DRV_LOG(ERR, "Cannot allocate mana shared data");
1076 			ret = -rte_errno;
1077 			goto exit;
1078 		}
1079 
1080 		mana_shared_data = mana_shared_mz->addr;
1081 		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
1082 		rte_spinlock_init(&mana_shared_data->lock);
1083 	} else {
1084 		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
1085 		if (!secondary_mz) {
1086 			DRV_LOG(ERR, "Cannot attach mana shared data");
1087 			ret = -rte_errno;
1088 			goto exit;
1089 		}
1090 
1091 		mana_shared_data = secondary_mz->addr;
1092 		memset(&mana_local_data, 0, sizeof(mana_local_data));
1093 	}
1094 
1095 exit:
1096 	rte_spinlock_unlock(&mana_shared_data_lock);
1097 
1098 	return ret;
1099 }
1100 
1101 /*
1102  * Init the data structures for use in primary and secondary processes.
1103  */
1104 static int
1105 mana_init_once(void)
1106 {
1107 	int ret;
1108 
1109 	ret = mana_init_shared_data();
1110 	if (ret)
1111 		return ret;
1112 
1113 	rte_spinlock_lock(&mana_shared_data->lock);
1114 
1115 	switch (rte_eal_process_type()) {
1116 	case RTE_PROC_PRIMARY:
1117 		if (mana_shared_data->init_done)
1118 			break;
1119 
1120 		ret = mana_mp_init_primary();
1121 		if (ret)
1122 			break;
1123 		DRV_LOG(DEBUG, "MP INIT PRIMARY");
1124 
1125 		mana_shared_data->init_done = 1;
1126 		break;
1127 
1128 	case RTE_PROC_SECONDARY:
1129 
1130 		if (mana_local_data.init_done)
1131 			break;
1132 
1133 		ret = mana_mp_init_secondary();
1134 		if (ret)
1135 			break;
1136 
1137 		DRV_LOG(DEBUG, "MP INIT SECONDARY");
1138 
1139 		mana_local_data.init_done = 1;
1140 		break;
1141 
1142 	default:
1143 		/* Impossible, internal error */
1144 		ret = -EPROTO;
1145 		break;
1146 	}
1147 
1148 	rte_spinlock_unlock(&mana_shared_data->lock);
1149 
1150 	return ret;
1151 }
1152 
1153 /*
1154  * Probe an IB port and create a rte_eth_dev for it. MAC matching is
1155  * done by the caller (mana_pci_probe_mac).
1156  * Return value:
1157  * 0: port successfully probed
1158  * negative value: error code
1159  */
1160 static int
1161 mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
1162 		uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
1163 {
1164 	struct mana_priv *priv = NULL;
1165 	struct rte_eth_dev *eth_dev = NULL;
1166 	struct ibv_parent_domain_init_attr attr = {0};
1167 	char address[64];
1168 	char name[RTE_ETH_NAME_MAX_LEN];
1169 	int ret;
1170 	struct ibv_context *ctx = NULL;
1171 
1172 	rte_ether_format_addr(address, sizeof(address), addr);
1173 	DRV_LOG(INFO, "device located port %u address %s", port, address);
1174 
1175 	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
1176 				  SOCKET_ID_ANY);
1177 	if (!priv)
1178 		return -ENOMEM;
1179 
1180 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
1181 
1182 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1183 		int fd;
1184 
1185 		eth_dev = rte_eth_dev_attach_secondary(name);
1186 		if (!eth_dev) {
1187 			DRV_LOG(ERR, "Can't attach to dev %s", name);
1188 			ret = -ENOMEM;
1189 			goto failed;
1190 		}
1191 
1192 		eth_dev->device = &pci_dev->device;
1193 		eth_dev->dev_ops = &mana_dev_secondary_ops;
1194 		ret = mana_proc_priv_init(eth_dev);
1195 		if (ret)
1196 			goto failed;
1197 		priv->process_priv = eth_dev->process_private;
1198 
1199 		/* Get the IB FD from the primary process */
1200 		fd = mana_mp_req_verbs_cmd_fd(eth_dev);
1201 		if (fd < 0) {
1202 			DRV_LOG(ERR, "Failed to get FD %d", fd);
1203 			ret = -ENODEV;
1204 			goto failed;
1205 		}
1206 
1207 		ret = mana_map_doorbell_secondary(eth_dev, fd);
1208 		if (ret) {
1209 			DRV_LOG(ERR, "Failed secondary map %d", fd);
1210 			goto failed;
1211 		}
1212 
1213 		/* fd is not used after mapping doorbell */
1214 		close(fd);
1215 
1216 		eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1217 		eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1218 
1219 		rte_spinlock_lock(&mana_shared_data->lock);
1220 		mana_shared_data->secondary_cnt++;
1221 		mana_local_data.secondary_cnt++;
1222 		rte_spinlock_unlock(&mana_shared_data->lock);
1223 
1224 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1225 		rte_eth_dev_probing_finish(eth_dev);
1226 
1227 		return 0;
1228 	}
1229 
1230 	ctx = ibv_open_device(ibdev);
1231 	if (!ctx) {
1232 		DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
1233 		ret = -ENODEV;
1234 		goto failed;
1235 	}
1236 
1237 	eth_dev = rte_eth_dev_allocate(name);
1238 	if (!eth_dev) {
1239 		ret = -ENOMEM;
1240 		goto failed;
1241 	}
1242 
1243 	eth_dev->data->mac_addrs =
1244 		rte_calloc("mana_mac", 1,
1245 			   sizeof(struct rte_ether_addr), 0);
1246 	if (!eth_dev->data->mac_addrs) {
1247 		ret = -ENOMEM;
1248 		goto failed;
1249 	}
1250 
1251 	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
1252 
1253 	priv->ib_pd = ibv_alloc_pd(ctx);
1254 	if (!priv->ib_pd) {
1255 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
1256 		ret = -ENOMEM;
1257 		goto failed;
1258 	}
1259 
1260 	/* Create a parent domain with the port number */
1261 	attr.pd = priv->ib_pd;
1262 	attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
1263 	attr.pd_context = (void *)(uintptr_t)port;
1264 	priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
1265 	if (!priv->ib_parent_pd) {
1266 		DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
1267 		ret = -ENOMEM;
1268 		goto failed;
1269 	}
1270 
1271 	priv->ib_ctx = ctx;
1272 	priv->port_id = eth_dev->data->port_id;
1273 	priv->dev_port = port;
1274 	eth_dev->data->dev_private = priv;
1275 	priv->dev_data = eth_dev->data;
1276 
1277 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
1278 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
1279 
1280 	priv->max_rx_desc =
1281 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1282 			dev_attr->orig_attr.max_cqe);
1283 	priv->max_tx_desc =
1284 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1285 			dev_attr->orig_attr.max_cqe);
1286 
1287 	priv->max_send_sge = dev_attr->orig_attr.max_sge;
1288 	priv->max_recv_sge = dev_attr->orig_attr.max_sge;
1289 
1290 	priv->max_mr = dev_attr->orig_attr.max_mr;
1291 	priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
1292 
1293 	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
1294 		name, priv->max_rx_queues, priv->max_rx_desc,
1295 		priv->max_send_sge);
1296 
1297 	rte_eth_copy_pci_info(eth_dev, pci_dev);
1298 
1299 	/* Create async interrupt handler */
1300 	ret = mana_intr_install(eth_dev, priv);
1301 	if (ret) {
1302 		DRV_LOG(ERR, "Failed to install intr handler");
1303 		goto failed;
1304 	}
1305 
1306 	rte_spinlock_lock(&mana_shared_data->lock);
1307 	mana_shared_data->primary_cnt++;
1308 	rte_spinlock_unlock(&mana_shared_data->lock);
1309 
1310 	eth_dev->device = &pci_dev->device;
1311 
1312 	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
1313 
1314 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1315 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1316 	eth_dev->dev_ops = &mana_dev_ops;
1317 
1318 	rte_eth_dev_probing_finish(eth_dev);
1319 
1320 	return 0;
1321 
1322 failed:
1323 	/* Free the resources for the failed port */
1324 	if (priv) {
1325 		if (priv->ib_parent_pd)
1326 			ibv_dealloc_pd(priv->ib_parent_pd);
1327 
1328 		if (priv->ib_pd)
1329 			ibv_dealloc_pd(priv->ib_pd);
1330 	}
1331 
1332 	if (eth_dev)
1333 		rte_eth_dev_release_port(eth_dev);
1334 
1335 	rte_free(priv);
1336 
1337 	if (ctx)
1338 		ibv_close_device(ctx);
1339 
1340 	return ret;
1341 }
1342 
1343 /*
1344  * Go through the IB device list to look for the IB ports matching the
1345  * given mac_addr, or all ports if mac_addr is NULL. For each match,
1346  * create a rte_eth_dev. Return value: number of successfully probed ports.
1347  */
1348 static int
1349 mana_pci_probe_mac(struct rte_pci_device *pci_dev,
1350 		   struct rte_ether_addr *mac_addr)
1351 {
1352 	struct ibv_device **ibv_list;
1353 	int ibv_idx;
1354 	struct ibv_context *ctx;
1355 	int num_devices;
1356 	int ret;
1357 	uint8_t port;
1358 	int count = 0;
1359 
1360 	ibv_list = ibv_get_device_list(&num_devices);
1361 	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
1362 		struct ibv_device *ibdev = ibv_list[ibv_idx];
1363 		struct rte_pci_addr pci_addr;
1364 		struct ibv_device_attr_ex dev_attr;
1365 
1366 		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
1367 			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
1368 
1369 		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
1370 			continue;
1371 
1372 		/* Ignore if this IB device is not this PCI device */
1373 		if (pci_dev->addr.domain != pci_addr.domain ||
1374 		    pci_dev->addr.bus != pci_addr.bus ||
1375 		    pci_dev->addr.devid != pci_addr.devid ||
1376 		    pci_dev->addr.function != pci_addr.function)
1377 			continue;
1378 
1379 		ctx = ibv_open_device(ibdev);
1380 		if (!ctx) {
1381 			DRV_LOG(ERR, "Failed to open IB device %s",
1382 				ibdev->name);
1383 			continue;
1384 		}
1385 		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
1386 		ibv_close_device(ctx);
1387 
1388 		if (ret) {
1389 			DRV_LOG(ERR, "Failed to query IB device %s",
1390 				ibdev->name);
1391 			continue;
1392 		}
1393 
1394 		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
1395 		     port++) {
1396 			struct rte_ether_addr addr;
1397 			ret = get_port_mac(ibdev, port, &addr);
1398 			if (ret)
1399 				continue;
1400 
1401 			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
1402 				continue;
1403 
1404 			ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
1405 			if (ret) {
1406 				DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
1407 			} else {
1408 				count++;
1409 				DRV_LOG(INFO, "Successfully probed on IB port %u", port);
1410 			}
1411 		}
1412 	}
1413 
1414 	ibv_free_device_list(ibv_list);
1415 	return count;
1416 }
1417 
1418 /*
1419  * Main callback function from PCI bus to probe a device.
1420  */
1421 static int
1422 mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1423 	       struct rte_pci_device *pci_dev)
1424 {
1425 	struct rte_devargs *args = pci_dev->device.devargs;
1426 	struct mana_conf conf = {0};
1427 	unsigned int i;
1428 	int ret;
1429 	int count = 0;
1430 
1431 	if (args && args->drv_str) {
1432 		ret = mana_parse_args(args, &conf);
1433 		if (ret) {
1434 			DRV_LOG(ERR, "Failed to parse parameters args = %s",
1435 				args->drv_str);
1436 			return ret;
1437 		}
1438 	}
1439 
1440 	ret = mana_init_once();
1441 	if (ret) {
1442 		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
1443 		return ret;
1444 	}
1445 
1446 	/* Probe the ports matching the given MAC addresses, or all ports if none were given */
1447 	if (conf.index) {
1448 		for (i = 0; i < conf.index; i++)
1449 			count += mana_pci_probe_mac(pci_dev,
1450 						    &conf.mac_array[i]);
1451 	} else {
1452 		count = mana_pci_probe_mac(pci_dev, NULL);
1453 	}
1454 
1455 	if (!count) {
1456 		rte_memzone_free(mana_shared_mz);
1457 		mana_shared_mz = NULL;
		/* Don't leave a dangling pointer to the freed memzone */
		mana_shared_data = NULL;
1458 		ret = -ENODEV;
1459 	}
1460 
1461 	return ret;
1462 }
1463 
1464 static int
1465 mana_dev_uninit(struct rte_eth_dev *dev)
1466 {
1467 	return mana_dev_close(dev);
1468 }
1469 
1470 /*
1471  * Callback from PCI to remove this device.
1472  */
1473 static int
1474 mana_pci_remove(struct rte_pci_device *pci_dev)
1475 {
1476 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1477 		rte_spinlock_lock(&mana_shared_data_lock);
1478 
1479 		rte_spinlock_lock(&mana_shared_data->lock);
1480 
1481 		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
1482 		mana_shared_data->primary_cnt--;
1483 		if (!mana_shared_data->primary_cnt) {
1484 			DRV_LOG(DEBUG, "mp uninit primary");
1485 			mana_mp_uninit_primary();
1486 		}
1487 
1488 		rte_spinlock_unlock(&mana_shared_data->lock);
1489 
1490 		/* Also free the shared memory if this is the last */
1491 		if (!mana_shared_data->primary_cnt) {
1492 			DRV_LOG(DEBUG, "free shared memzone data");
1493 			rte_memzone_free(mana_shared_mz);
1494 			mana_shared_mz = NULL;
1495 		}
1496 
1497 		rte_spinlock_unlock(&mana_shared_data_lock);
1498 	} else {
1499 		rte_spinlock_lock(&mana_shared_data_lock);
1500 
1501 		rte_spinlock_lock(&mana_shared_data->lock);
1502 		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
1503 		mana_shared_data->secondary_cnt--;
1504 		rte_spinlock_unlock(&mana_shared_data->lock);
1505 
1506 		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
1507 		mana_local_data.secondary_cnt--;
1508 		if (!mana_local_data.secondary_cnt) {
1509 			DRV_LOG(DEBUG, "mp uninit secondary");
1510 			mana_mp_uninit_secondary();
1511 		}
1512 
1513 		rte_spinlock_unlock(&mana_shared_data_lock);
1514 	}
1515 
1516 	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
1517 }
1518 
1519 static const struct rte_pci_id mana_pci_id_map[] = {
1520 	{
1521 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
1522 			       PCI_DEVICE_ID_MICROSOFT_MANA)
1523 	},
1524 	{
1525 		.vendor_id = 0
1526 	},
1527 };
1528 
1529 static struct rte_pci_driver mana_pci_driver = {
1530 	.id_table = mana_pci_id_map,
1531 	.probe = mana_pci_probe,
1532 	.remove = mana_pci_remove,
1533 	.drv_flags = RTE_PCI_DRV_INTR_RMV,
1534 };
1535 
1536 RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
1537 RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
1538 RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
1539 RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
1540 RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
1541 RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
1542