xref: /dpdk/drivers/net/mana/mana.c (revision 5d52418fa4b9a7f28eaedc1d88ec5cf330381c0e)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2022 Microsoft Corporation
3  */
4 
5 #include <unistd.h>
6 #include <dirent.h>
7 #include <fcntl.h>
8 #include <sys/mman.h>
9 #include <sys/ioctl.h>
10 #include <net/if.h>
11 
12 #include <ethdev_driver.h>
13 #include <ethdev_pci.h>
14 #include <rte_kvargs.h>
15 #include <rte_eal_paging.h>
16 
17 #include <infiniband/verbs.h>
18 #include <infiniband/manadv.h>
19 
20 #include <assert.h>
21 
22 #include "mana.h"
23 
24 /* Shared memory between primary/secondary processes, per driver */
25 /* Data to track primary/secondary usage */
26 struct mana_shared_data *mana_shared_data;
27 static struct mana_shared_data mana_local_data;
28 
29 /* The memory region for the above data */
30 static const struct rte_memzone *mana_shared_mz;
31 static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
32 
33 /* Spinlock for mana_shared_data */
34 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
35 
36 /* Allocate a buffer on the stack and fill it with the printf-formatted string. */
37 #define MANA_MKSTR(name, ...) \
38 	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
39 	char name[mkstr_size_##name + 1]; \
40 	\
41 	memset(name, 0, mkstr_size_##name + 1); \
42 	snprintf(name, sizeof(name), "" __VA_ARGS__)
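/*
 * Example (names are illustrative): MANA_MKSTR(path, "%s/address", dir)
 * declares a stack buffer "path" sized by a first snprintf() pass, then
 * writes the formatted string into it.
 */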
43 
44 int mana_logtype_driver;
45 int mana_logtype_init;
46 
47 /*
48  * Callback from rdma-core to allocate a buffer for a queue.
49  */
50 void *
51 mana_alloc_verbs_buf(size_t size, void *data)
52 {
53 	void *ret;
54 	size_t alignment = rte_mem_page_size();
55 	int socket = (int)(uintptr_t)data;
56 
57 	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
58 
59 	if (alignment == (size_t)-1) {
60 		DRV_LOG(ERR, "Failed to get mem page size");
61 		rte_errno = ENOMEM;
62 		return NULL;
63 	}
64 
65 	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
66 	if (!ret && size)
67 		rte_errno = ENOMEM;
68 	return ret;
69 }
70 
71 void
72 mana_free_verbs_buf(void *ptr, void *data __rte_unused)
73 {
74 	rte_free(ptr);
75 }
76 
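/*
 * Configure the device: RX and TX queue counts must be equal and a power
 * of 2. Also registers this PMD's buffer allocators with rdma-core.
 */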
77 static int
78 mana_dev_configure(struct rte_eth_dev *dev)
79 {
80 	struct mana_priv *priv = dev->data->dev_private;
81 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
82 
83 	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
84 		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
85 
86 	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
87 		DRV_LOG(ERR, "Only an equal number of RX/TX queues is supported");
88 		return -EINVAL;
89 	}
90 
91 	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
92 		DRV_LOG(ERR, "Number of RX/TX queues must be a power of 2");
93 		return -EINVAL;
94 	}
95 
96 	priv->num_queues = dev->data->nb_rx_queues;
97 
98 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
99 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
100 					.alloc = &mana_alloc_verbs_buf,
101 					.free = &mana_free_verbs_buf,
102 					.data = 0,
103 				}));
104 
105 	return 0;
106 }
107 
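/* Release the epoll fd and interrupt vector list used for RX interrupts */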
108 static void
109 rx_intr_vec_disable(struct mana_priv *priv)
110 {
111 	struct rte_intr_handle *intr_handle = priv->intr_handle;
112 
113 	rte_intr_free_epoll_fd(intr_handle);
114 	rte_intr_vec_list_free(intr_handle);
115 	rte_intr_nb_efd_set(intr_handle, 0);
116 }
117 
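/* Map each RX queue's completion channel fd to an interrupt vector */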
118 static int
119 rx_intr_vec_enable(struct mana_priv *priv)
120 {
121 	unsigned int i;
122 	unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
123 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
124 	struct rte_intr_handle *intr_handle = priv->intr_handle;
125 	int ret;
126 
127 	rx_intr_vec_disable(priv);
128 
129 	if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
130 		DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
131 		return -ENOMEM;
132 	}
133 
134 	for (i = 0; i < n; i++) {
135 		struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
136 
137 		ret = rte_intr_vec_list_index_set(intr_handle, i,
138 						  RTE_INTR_VEC_RXTX_OFFSET + i);
139 		if (ret) {
140 			DRV_LOG(ERR, "Failed to set intr vec %u", i);
141 			return ret;
142 		}
143 
144 		ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
145 		if (ret) {
146 			DRV_LOG(ERR, "Failed to set FD at intr %u", i);
147 			return ret;
148 		}
149 	}
150 
151 	return rte_intr_nb_efd_set(intr_handle, n);
152 }
153 
154 static void
155 rxq_intr_disable(struct mana_priv *priv)
156 {
157 	int err = rte_errno;
158 
159 	rx_intr_vec_disable(priv);
160 	rte_errno = err;
161 }
162 
163 static int
164 rxq_intr_enable(struct mana_priv *priv)
165 {
166 	const struct rte_eth_intr_conf *const intr_conf =
167 		&priv->dev_data->dev_conf.intr_conf;
168 
169 	if (!intr_conf->rxq)
170 		return 0;
171 
172 	return rx_intr_vec_enable(priv);
173 }
174 
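/*
 * Start the device: initialize the per-device MR btree, start the TX/RX
 * queues, install the real burst functions and enable the datapath on
 * secondary processes.
 */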
175 static int
176 mana_dev_start(struct rte_eth_dev *dev)
177 {
178 	int ret;
179 	struct mana_priv *priv = dev->data->dev_private;
180 
181 	rte_spinlock_init(&priv->mr_btree_lock);
182 	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
183 				 dev->device->numa_node);
184 	if (ret) {
185 		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
186 		return ret;
187 	}
188 
189 	ret = mana_start_tx_queues(dev);
190 	if (ret) {
191 		DRV_LOG(ERR, "failed to start tx queues %d", ret);
192 		goto failed_tx;
193 	}
194 
195 	ret = mana_start_rx_queues(dev);
196 	if (ret) {
197 		DRV_LOG(ERR, "failed to start rx queues %d", ret);
198 		goto failed_rx;
199 	}
200 
201 	rte_wmb();
202 
203 	dev->tx_pkt_burst = mana_tx_burst;
204 	dev->rx_pkt_burst = mana_rx_burst;
205 
206 	DRV_LOG(INFO, "TX/RX queues have started");
207 
208 	/* Enable datapath for secondary processes */
209 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
210 
211 	ret = rxq_intr_enable(priv);
212 	if (ret) {
213 		DRV_LOG(ERR, "Failed to enable RX interrupts");
214 		goto failed_intr;
215 	}
216 
217 	return 0;
218 
219 failed_intr:
220 	mana_stop_rx_queues(dev);
221 
222 failed_rx:
223 	mana_stop_tx_queues(dev);
224 
225 failed_tx:
226 	mana_mr_btree_free(&priv->mr_btree);
227 
228 	return ret;
229 }
230 
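/*
 * Stop the device: revert to the dummy burst functions, stop the datapath
 * on secondary processes and stop the TX/RX queues.
 */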
231 static int
232 mana_dev_stop(struct rte_eth_dev *dev)
233 {
234 	int ret;
235 	struct mana_priv *priv = dev->data->dev_private;
236 
237 	rxq_intr_disable(priv);
238 
239 	dev->tx_pkt_burst = mana_tx_burst_removed;
240 	dev->rx_pkt_burst = mana_rx_burst_removed;
241 
242 	/* Stop datapath on secondary processes */
243 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
244 
245 	rte_wmb();
246 
247 	ret = mana_stop_tx_queues(dev);
248 	if (ret) {
249 		DRV_LOG(ERR, "failed to stop tx queues");
250 		return ret;
251 	}
252 
253 	ret = mana_stop_rx_queues(dev);
254 	if (ret) {
255 		DRV_LOG(ERR, "failed to stop rx queues");
256 		return ret;
257 	}
258 
259 	return 0;
260 }
261 
262 static int mana_intr_uninstall(struct mana_priv *priv);
263 
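/*
 * Close the device: free all MRs, uninstall the interrupt handler and
 * close the IB device context.
 */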
264 static int
265 mana_dev_close(struct rte_eth_dev *dev)
266 {
267 	struct mana_priv *priv = dev->data->dev_private;
268 	int ret;
269 
270 	mana_remove_all_mr(priv);
271 
272 	ret = mana_intr_uninstall(priv);
273 	if (ret)
274 		return ret;
275 
276 	ret = ibv_close_device(priv->ib_ctx);
277 	if (ret) {
278 		ret = errno;
279 		return ret;
280 	}
281 
282 	return 0;
283 }
284 
285 static int
286 mana_dev_info_get(struct rte_eth_dev *dev,
287 		  struct rte_eth_dev_info *dev_info)
288 {
289 	struct mana_priv *priv = dev->data->dev_private;
290 
291 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
292 	dev_info->max_mtu = MANA_MAX_MTU;
293 
294 	/* RX params */
295 	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
296 	dev_info->max_rx_pktlen = MANA_MAX_MTU + RTE_ETHER_HDR_LEN;
297 
298 	dev_info->max_rx_queues = priv->max_rx_queues;
299 	dev_info->max_tx_queues = priv->max_tx_queues;
300 
301 	dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
302 	dev_info->max_hash_mac_addrs = 0;
303 
304 	dev_info->max_vfs = 1;
305 
306 	/* Offload params */
307 	dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
308 
309 	dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
310 
311 	/* RSS */
312 	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
313 	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
314 	dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
315 
316 	/* Thresholds */
317 	dev_info->default_rxconf = (struct rte_eth_rxconf){
318 		.rx_thresh = {
319 			.pthresh = 8,
320 			.hthresh = 8,
321 			.wthresh = 0,
322 		},
323 		.rx_free_thresh = 32,
324 		/* If no descriptors available, pkts are dropped by default */
325 		.rx_drop_en = 1,
326 	};
327 
328 	dev_info->default_txconf = (struct rte_eth_txconf){
329 		.tx_thresh = {
330 			.pthresh = 32,
331 			.hthresh = 0,
332 			.wthresh = 0,
333 		},
334 		.tx_rs_thresh = 32,
335 		.tx_free_thresh = 32,
336 	};
337 
338 	/* Buffer limits */
339 	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
340 	dev_info->rx_desc_lim.nb_max = priv->max_rx_desc;
341 	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
342 	dev_info->rx_desc_lim.nb_seg_max = priv->max_recv_sge;
343 	dev_info->rx_desc_lim.nb_mtu_seg_max = priv->max_recv_sge;
344 
345 	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
346 	dev_info->tx_desc_lim.nb_max = priv->max_tx_desc;
347 	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
348 	dev_info->tx_desc_lim.nb_seg_max = priv->max_send_sge;
349 	dev_info->tx_desc_lim.nb_mtu_seg_max = priv->max_send_sge;
350 
351 	/* Speed */
352 	dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
353 
354 	/* RX params */
355 	dev_info->default_rxportconf.burst_size = 1;
356 	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
357 	dev_info->default_rxportconf.nb_queues = 1;
358 
359 	/* TX params */
360 	dev_info->default_txportconf.burst_size = 1;
361 	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
362 	dev_info->default_txportconf.nb_queues = 1;
363 
364 	return 0;
365 }
366 
367 static void
368 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
369 		       struct rte_eth_txq_info *qinfo)
370 {
371 	struct mana_txq *txq = dev->data->tx_queues[queue_id];
372 
373 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
374 	qinfo->nb_desc = txq->num_desc;
375 }
376 
377 static void
378 mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
379 		       struct rte_eth_rxq_info *qinfo)
380 {
381 	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
382 
383 	qinfo->mp = rxq->mp;
384 	qinfo->nb_desc = rxq->num_desc;
385 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
386 }
387 
388 static const uint32_t *
389 mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
390 {
391 	static const uint32_t ptypes[] = {
392 		RTE_PTYPE_L2_ETHER,
393 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
394 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
395 		RTE_PTYPE_L4_FRAG,
396 		RTE_PTYPE_L4_TCP,
397 		RTE_PTYPE_L4_UDP,
398 		RTE_PTYPE_UNKNOWN
399 	};
400 
401 	return ptypes;
402 }
403 
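/* Update the RSS hash types and key; only allowed while the device is stopped */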
404 static int
405 mana_rss_hash_update(struct rte_eth_dev *dev,
406 		     struct rte_eth_rss_conf *rss_conf)
407 {
408 	struct mana_priv *priv = dev->data->dev_private;
409 
410 	/* Currently can only update RSS hash when device is stopped */
411 	if (dev->data->dev_started) {
412 		DRV_LOG(ERR, "Can't update RSS after device has started");
413 		return -ENODEV;
414 	}
415 
416 	if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
417 		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
418 			dev->data->port_id, rss_conf->rss_hf);
419 		return -EINVAL;
420 	}
421 
422 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
423 		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
424 			DRV_LOG(ERR, "Port %u RSS key must be %u bytes long",
425 				dev->data->port_id,
426 				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
427 			return -EINVAL;
428 		}
429 
430 		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
431 		priv->rss_conf.rss_key =
432 			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
433 				    RTE_CACHE_LINE_SIZE);
434 		if (!priv->rss_conf.rss_key)
435 			return -ENOMEM;
436 		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
437 		       rss_conf->rss_key_len);
438 	}
439 	priv->rss_conf.rss_hf = rss_conf->rss_hf;
440 
441 	return 0;
442 }
443 
444 static int
445 mana_rss_hash_conf_get(struct rte_eth_dev *dev,
446 		       struct rte_eth_rss_conf *rss_conf)
447 {
448 	struct mana_priv *priv = dev->data->dev_private;
449 
450 	if (!rss_conf)
451 		return -EINVAL;
452 
453 	if (rss_conf->rss_key &&
454 	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
455 		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
456 		       priv->rss_conf.rss_key_len);
457 	}
458 
459 	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
460 	rss_conf->rss_hf = priv->rss_conf.rss_hf;
461 
462 	return 0;
463 }
464 
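/* Allocate a TX queue with its descriptor ring, completion buffer and MR btree */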
465 static int
466 mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
467 			uint16_t nb_desc, unsigned int socket_id,
468 			const struct rte_eth_txconf *tx_conf __rte_unused)
469 
470 {
471 	struct mana_priv *priv = dev->data->dev_private;
472 	struct mana_txq *txq;
473 	int ret;
474 
475 	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
476 	if (!txq) {
477 		DRV_LOG(ERR, "failed to allocate txq");
478 		return -ENOMEM;
479 	}
480 
481 	txq->socket = socket_id;
482 
483 	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
484 					   sizeof(struct mana_txq_desc) *
485 						nb_desc,
486 					   RTE_CACHE_LINE_SIZE, socket_id);
487 	if (!txq->desc_ring) {
488 		DRV_LOG(ERR, "failed to allocate txq desc_ring");
489 		ret = -ENOMEM;
490 		goto fail;
491 	}
492 
493 	txq->gdma_comp_buf = rte_malloc_socket("mana_txq_comp",
494 			sizeof(*txq->gdma_comp_buf) * nb_desc,
495 			RTE_CACHE_LINE_SIZE, socket_id);
496 	if (!txq->gdma_comp_buf) {
497 		DRV_LOG(ERR, "failed to allocate txq comp");
498 		ret = -ENOMEM;
499 		goto fail;
500 	}
501 
502 	ret = mana_mr_btree_init(&txq->mr_btree,
503 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
504 	if (ret) {
505 		DRV_LOG(ERR, "Failed to init TXQ MR btree");
506 		goto fail;
507 	}
508 
509 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
510 		queue_idx, nb_desc, socket_id, txq->desc_ring);
511 
512 	txq->desc_ring_head = 0;
513 	txq->desc_ring_tail = 0;
514 	txq->priv = priv;
515 	txq->num_desc = nb_desc;
516 	dev->data->tx_queues[queue_idx] = txq;
517 
518 	return 0;
519 
520 fail:
521 	rte_free(txq->gdma_comp_buf);
522 	rte_free(txq->desc_ring);
523 	rte_free(txq);
524 	return ret;
525 }
526 
527 static void
528 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
529 {
530 	struct mana_txq *txq = dev->data->tx_queues[qid];
531 
532 	mana_mr_btree_free(&txq->mr_btree);
533 
534 	rte_free(txq->gdma_comp_buf);
535 	rte_free(txq->desc_ring);
536 	rte_free(txq);
537 }
538 
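/* Allocate an RX queue with its descriptor ring, completion buffer and MR btree */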
539 static int
540 mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
541 			uint16_t nb_desc, unsigned int socket_id,
542 			const struct rte_eth_rxconf *rx_conf __rte_unused,
543 			struct rte_mempool *mp)
544 {
545 	struct mana_priv *priv = dev->data->dev_private;
546 	struct mana_rxq *rxq;
547 	int ret;
548 
549 	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
550 	if (!rxq) {
551 		DRV_LOG(ERR, "failed to allocate rxq");
552 		return -ENOMEM;
553 	}
554 
555 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
556 		queue_idx, nb_desc, socket_id);
557 
558 	rxq->socket = socket_id;
559 
560 	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
561 					    sizeof(struct mana_rxq_desc) *
562 						nb_desc,
563 					    RTE_CACHE_LINE_SIZE, socket_id);
564 
565 	if (!rxq->desc_ring) {
566 		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
567 		ret = -ENOMEM;
568 		goto fail;
569 	}
570 
571 	rxq->desc_ring_head = 0;
572 	rxq->desc_ring_tail = 0;
573 
574 	rxq->gdma_comp_buf = rte_malloc_socket("mana_rxq_comp",
575 			sizeof(*rxq->gdma_comp_buf) * nb_desc,
576 			RTE_CACHE_LINE_SIZE, socket_id);
577 	if (!rxq->gdma_comp_buf) {
578 		DRV_LOG(ERR, "failed to allocate rxq comp");
579 		ret = -ENOMEM;
580 		goto fail;
581 	}
582 
583 	ret = mana_mr_btree_init(&rxq->mr_btree,
584 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
585 	if (ret) {
586 		DRV_LOG(ERR, "Failed to init RXQ MR btree");
587 		goto fail;
588 	}
589 
590 	rxq->priv = priv;
591 	rxq->num_desc = nb_desc;
592 	rxq->mp = mp;
593 	dev->data->rx_queues[queue_idx] = rxq;
594 
595 	return 0;
596 
597 fail:
598 	rte_free(rxq->gdma_comp_buf);
599 	rte_free(rxq->desc_ring);
600 	rte_free(rxq);
601 	return ret;
602 }
603 
604 static void
605 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
606 {
607 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
608 
609 	mana_mr_btree_free(&rxq->mr_btree);
610 
611 	rte_free(rxq->gdma_comp_buf);
612 	rte_free(rxq->desc_ring);
613 	rte_free(rxq);
614 }
615 
616 static int
617 mana_dev_link_update(struct rte_eth_dev *dev,
618 		     int wait_to_complete __rte_unused)
619 {
620 	struct rte_eth_link link;
621 
622 	/* MANA has no concept of carrier state, always reporting UP */
623 	link = (struct rte_eth_link) {
624 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
625 		.link_autoneg = RTE_ETH_LINK_FIXED,
626 		.link_speed = RTE_ETH_SPEED_NUM_100G,
627 		.link_status = RTE_ETH_LINK_UP,
628 	};
629 
630 	return rte_eth_linkstatus_set(dev, &link);
631 }
632 
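/* Aggregate per-queue counters into the port-level statistics */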
633 static int
634 mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
635 {
636 	unsigned int i;
637 
638 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
639 		struct mana_txq *txq = dev->data->tx_queues[i];
640 
641 		if (!txq)
642 			continue;
643 
644 		stats->opackets += txq->stats.packets;
645 		stats->obytes += txq->stats.bytes;
646 		stats->oerrors += txq->stats.errors;
647 
648 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
649 			stats->q_opackets[i] = txq->stats.packets;
650 			stats->q_obytes[i] = txq->stats.bytes;
651 		}
652 	}
653 
654 	stats->rx_nombuf = 0;
655 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
656 		struct mana_rxq *rxq = dev->data->rx_queues[i];
657 
658 		if (!rxq)
659 			continue;
660 
661 		stats->ipackets += rxq->stats.packets;
662 		stats->ibytes += rxq->stats.bytes;
663 		stats->ierrors += rxq->stats.errors;
664 
665 		/* There is no good way to get stats->imissed, so it is left unset */
666 
667 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
668 			stats->q_ipackets[i] = rxq->stats.packets;
669 			stats->q_ibytes[i] = rxq->stats.bytes;
670 		}
671 
672 		stats->rx_nombuf += rxq->stats.nombuf;
673 	}
674 
675 	return 0;
676 }
677 
678 static int
679 mana_dev_stats_reset(struct rte_eth_dev *dev)
680 {
681 	unsigned int i;
682 
683 	PMD_INIT_FUNC_TRACE();
684 
685 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
686 		struct mana_txq *txq = dev->data->tx_queues[i];
687 
688 		if (!txq)
689 			continue;
690 
691 		memset(&txq->stats, 0, sizeof(txq->stats));
692 	}
693 
694 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
695 		struct mana_rxq *rxq = dev->data->rx_queues[i];
696 
697 		if (!rxq)
698 			continue;
699 
700 		memset(&rxq->stats, 0, sizeof(rxq->stats));
701 	}
702 
703 	return 0;
704 }
705 
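/*
 * Find the kernel netdev name of this port by matching the port MAC address
 * against the entries under <ibdev_path>/device/net in sysfs.
 */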
706 static int
707 mana_get_ifname(const struct mana_priv *priv, char (*ifname)[IF_NAMESIZE])
708 {
709 	int ret;
710 	DIR *dir;
711 	struct dirent *dent;
712 
713 	MANA_MKSTR(dirpath, "%s/device/net", priv->ib_ctx->device->ibdev_path);
714 
715 	dir = opendir(dirpath);
716 	if (dir == NULL)
717 		return -ENODEV;
718 
719 	while ((dent = readdir(dir)) != NULL) {
720 		char *name = dent->d_name;
721 		FILE *file;
722 		struct rte_ether_addr addr;
723 		char *mac = NULL;
724 
725 		if ((name[0] == '.') &&
726 		    ((name[1] == '\0') ||
727 		     ((name[1] == '.') && (name[2] == '\0'))))
728 			continue;
729 
730 		MANA_MKSTR(path, "%s/%s/address", dirpath, name);
731 
732 		file = fopen(path, "r");
733 		if (!file) {
734 			ret = -ENODEV;
735 			break;
736 		}
737 
738 		ret = fscanf(file, "%ms", &mac);
739 		fclose(file);
740 
741 		if (ret <= 0) {
742 			ret = -EINVAL;
743 			break;
744 		}
745 
746 		ret = rte_ether_unformat_addr(mac, &addr);
747 		free(mac);
748 		if (ret)
749 			break;
750 
751 		if (rte_is_same_ether_addr(&addr, priv->dev_data->mac_addrs)) {
752 			strlcpy(*ifname, name, sizeof(*ifname));
753 			ret = 0;
754 			break;
755 		}
756 	}
757 
758 	closedir(dir);
759 	return ret;
760 }
761 
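/* Run a network interface ioctl on the kernel netdev backing this port */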
762 static int
763 mana_ifreq(const struct mana_priv *priv, int req, struct ifreq *ifr)
764 {
765 	int sock, ret;
766 
767 	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
768 	if (sock == -1)
769 		return -errno;
770 
771 	ret = mana_get_ifname(priv, &ifr->ifr_name);
772 	if (ret) {
773 		close(sock);
774 		return ret;
775 	}
776 
777 	if (ioctl(sock, req, ifr) == -1)
778 		ret = -errno;
779 
780 	close(sock);
781 
782 	return ret;
783 }
784 
785 static int
786 mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
787 {
788 	struct mana_priv *priv = dev->data->dev_private;
789 	struct ifreq request = { .ifr_mtu = mtu, };
790 
791 	return mana_ifreq(priv, SIOCSIFMTU, &request);
792 }
793 
794 static const struct eth_dev_ops mana_dev_ops = {
795 	.dev_configure		= mana_dev_configure,
796 	.dev_start		= mana_dev_start,
797 	.dev_stop		= mana_dev_stop,
798 	.dev_close		= mana_dev_close,
799 	.dev_infos_get		= mana_dev_info_get,
800 	.txq_info_get		= mana_dev_tx_queue_info,
801 	.rxq_info_get		= mana_dev_rx_queue_info,
802 	.dev_supported_ptypes_get = mana_supported_ptypes,
803 	.rss_hash_update	= mana_rss_hash_update,
804 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
805 	.tx_queue_setup		= mana_dev_tx_queue_setup,
806 	.tx_queue_release	= mana_dev_tx_queue_release,
807 	.rx_queue_setup		= mana_dev_rx_queue_setup,
808 	.rx_queue_release	= mana_dev_rx_queue_release,
809 	.rx_queue_intr_enable	= mana_rx_intr_enable,
810 	.rx_queue_intr_disable	= mana_rx_intr_disable,
811 	.link_update		= mana_dev_link_update,
812 	.stats_get		= mana_dev_stats_get,
813 	.stats_reset		= mana_dev_stats_reset,
814 	.mtu_set		= mana_mtu_set,
815 };
816 
817 static const struct eth_dev_ops mana_dev_secondary_ops = {
818 	.stats_get = mana_dev_stats_get,
819 	.stats_reset = mana_dev_stats_reset,
820 	.dev_infos_get = mana_dev_info_get,
821 };
822 
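/*
 * Dummy RX/TX burst functions installed while the datapath is not started;
 * they only issue a memory barrier and return no packets.
 */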
823 uint16_t
824 mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
825 		      struct rte_mbuf **pkts __rte_unused,
826 		      uint16_t pkts_n __rte_unused)
827 {
828 	rte_mb();
829 	return 0;
830 }
831 
832 uint16_t
833 mana_tx_burst_removed(void *dpdk_txq __rte_unused,
834 		      struct rte_mbuf **pkts __rte_unused,
835 		      uint16_t pkts_n __rte_unused)
836 {
837 	rte_mb();
838 	return 0;
839 }
840 
841 #define ETH_MANA_MAC_ARG "mac"
842 static const char * const mana_init_args[] = {
843 	ETH_MANA_MAC_ARG,
844 	NULL,
845 };
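/*
 * Hypothetical example of selecting ports by MAC on the EAL command line
 * (PCI and MAC addresses below are placeholders):
 *   -a 7870:00:00.0,mac=00:0d:3a:00:00:01,mac=00:0d:3a:00:00:02
 * Each "mac" value selects the IB port whose netdev has that address.
 */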
846 
847 /* Support parsing up to 8 MAC addresses from the EAL command line */
848 #define MAX_NUM_ADDRESS 8
849 struct mana_conf {
850 	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
851 	unsigned int index;
852 };
853 
854 static int
855 mana_arg_parse_callback(const char *key, const char *val, void *private)
856 {
857 	struct mana_conf *conf = (struct mana_conf *)private;
858 	int ret;
859 
860 	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
861 
862 	if (conf->index >= MAX_NUM_ADDRESS) {
863 		DRV_LOG(ERR, "Exceeded the max number of MAC addresses");
864 		return 1;
865 	}
866 
867 	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
868 	if (ret) {
869 		DRV_LOG(ERR, "Invalid MAC address %s", val);
870 		return ret;
871 	}
872 
873 	conf->index++;
874 
875 	return 0;
876 }
877 
878 static int
879 mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
880 {
881 	struct rte_kvargs *kvlist;
882 	unsigned int arg_count;
883 	int ret = 0;
884 
885 	kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
886 	if (!kvlist) {
887 		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
888 		return -EINVAL;
889 	}
890 
891 	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
892 	if (arg_count > MAX_NUM_ADDRESS) {
893 		ret = -EINVAL;
894 		goto free_kvlist;
895 	}
896 	ret = rte_kvargs_process(kvlist, mana_init_args[0],
897 				 mana_arg_parse_callback, conf);
898 	if (ret) {
899 		DRV_LOG(ERR, "error parsing args");
900 		goto free_kvlist;
901 	}
902 
903 free_kvlist:
904 	rte_kvargs_free(kvlist);
905 	return ret;
906 }
907 
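/* Look up the MAC address of the given IB port from the netdev entries in sysfs */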
908 static int
909 get_port_mac(struct ibv_device *device, unsigned int port,
910 	     struct rte_ether_addr *addr)
911 {
912 	FILE *file;
913 	int ret = 0;
914 	DIR *dir;
915 	struct dirent *dent;
916 	unsigned int dev_port;
917 
918 	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
919 
920 	dir = opendir(path);
921 	if (!dir)
922 		return -ENOENT;
923 
924 	while ((dent = readdir(dir))) {
925 		char *name = dent->d_name;
926 		char *mac = NULL;
927 
928 		MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
929 
930 		/* Ignore . and .. */
931 		if ((name[0] == '.') &&
932 		    ((name[1] == '\0') ||
933 		     ((name[1] == '.') && (name[2] == '\0'))))
934 			continue;
935 
936 		file = fopen(port_path, "r");
937 		if (!file)
938 			continue;
939 
940 		ret = fscanf(file, "%u", &dev_port);
941 		fclose(file);
942 
943 		if (ret != 1)
944 			continue;
945 
946 		/* Ethernet ports start at 0, IB ports start at 1 */
947 		if (dev_port == port - 1) {
948 			MANA_MKSTR(address_path, "%s/%s/address", path, name);
949 
950 			file = fopen(address_path, "r");
951 			if (!file)
952 				continue;
953 
954 			ret = fscanf(file, "%ms", &mac);
955 			fclose(file);
956 
			if (ret <= 0) {
				ret = -EINVAL;
				break;
			}
959 
960 			ret = rte_ether_unformat_addr(mac, addr);
961 			if (ret)
962 				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
963 
964 			free(mac);
965 			break;
966 		}
967 	}
968 
969 	closedir(dir);
970 	return ret;
971 }
972 
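/* Derive the PCI address of an IB device from its sysfs uevent file */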
973 static int
974 mana_ibv_device_to_pci_addr(const struct ibv_device *device,
975 			    struct rte_pci_addr *pci_addr)
976 {
977 	FILE *file;
978 	char *line = NULL;
979 	size_t len = 0;
980 
981 	MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
982 
983 	file = fopen(path, "r");
984 	if (!file)
985 		return -errno;
986 
987 	while (getline(&line, &len, file) != -1) {
988 		/* Extract information. */
989 		if (sscanf(line,
990 			   "PCI_SLOT_NAME="
991 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
992 			   &pci_addr->domain,
993 			   &pci_addr->bus,
994 			   &pci_addr->devid,
995 			   &pci_addr->function) == 4) {
996 			break;
997 		}
998 	}
999 
1000 	free(line);
1001 	fclose(file);
1002 	return 0;
1003 }
1004 
1005 /*
1006  * Interrupt handler from IB layer to notify this device is being removed.
1007  */
1008 static void
1009 mana_intr_handler(void *arg)
1010 {
1011 	struct mana_priv *priv = arg;
1012 	struct ibv_context *ctx = priv->ib_ctx;
1013 	struct ibv_async_event event;
1014 
1015 	/* Read and ack all messages from IB device */
1016 	while (true) {
1017 		if (ibv_get_async_event(ctx, &event))
1018 			break;
1019 
1020 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
1021 			struct rte_eth_dev *dev;
1022 
1023 			dev = &rte_eth_devices[priv->port_id];
1024 			if (dev->data->dev_conf.intr_conf.rmv)
1025 				rte_eth_dev_callback_process(dev,
1026 					RTE_ETH_EVENT_INTR_RMV, NULL);
1027 		}
1028 
1029 		ibv_ack_async_event(&event);
1030 	}
1031 }
1032 
1033 static int
1034 mana_intr_uninstall(struct mana_priv *priv)
1035 {
1036 	int ret;
1037 
1038 	ret = rte_intr_callback_unregister(priv->intr_handle,
1039 					   mana_intr_handler, priv);
1040 	if (ret <= 0) {
1041 		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
1042 		return ret;
1043 	}
1044 
1045 	rte_intr_instance_free(priv->intr_handle);
1046 
1047 	return 0;
1048 }
1049 
1050 int
1051 mana_fd_set_non_blocking(int fd)
1052 {
1053 	int ret = fcntl(fd, F_GETFL);
1054 
1055 	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
1056 		return 0;
1057 
1058 	rte_errno = errno;
1059 	return -rte_errno;
1060 }
1061 
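/*
 * Register the IB async event fd with the interrupt framework so device
 * removal events can be delivered to the application.
 */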
1062 static int
1063 mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
1064 {
1065 	int ret;
1066 	struct ibv_context *ctx = priv->ib_ctx;
1067 
1068 	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1069 	if (!priv->intr_handle) {
1070 		DRV_LOG(ERR, "Failed to allocate intr_handle");
1071 		rte_errno = ENOMEM;
1072 		return -ENOMEM;
1073 	}
1074 
1075 	ret = rte_intr_fd_set(priv->intr_handle, -1);
1076 	if (ret)
1077 		goto free_intr;
1078 
1079 	ret = mana_fd_set_non_blocking(ctx->async_fd);
1080 	if (ret) {
1081 		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
1082 		goto free_intr;
1083 	}
1084 
1085 	ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
1086 	if (ret)
1087 		goto free_intr;
1088 
1089 	ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
1090 	if (ret)
1091 		goto free_intr;
1092 
1093 	ret = rte_intr_callback_register(priv->intr_handle,
1094 					 mana_intr_handler, priv);
1095 	if (ret) {
1096 		DRV_LOG(ERR, "Failed to register intr callback");
1097 		rte_intr_fd_set(priv->intr_handle, -1);
1098 		goto free_intr;
1099 	}
1100 
1101 	eth_dev->intr_handle = priv->intr_handle;
1102 	return 0;
1103 
1104 free_intr:
1105 	rte_intr_instance_free(priv->intr_handle);
1106 	priv->intr_handle = NULL;
1107 
1108 	return ret;
1109 }
1110 
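/* Allocate per-process private data; it holds the process-local doorbell mapping */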
1111 static int
1112 mana_proc_priv_init(struct rte_eth_dev *dev)
1113 {
1114 	struct mana_process_priv *priv;
1115 
1116 	priv = rte_zmalloc_socket("mana_proc_priv",
1117 				  sizeof(struct mana_process_priv),
1118 				  RTE_CACHE_LINE_SIZE,
1119 				  dev->device->numa_node);
1120 	if (!priv)
1121 		return -ENOMEM;
1122 
1123 	dev->process_private = priv;
1124 	return 0;
1125 }
1126 
1127 /*
1128  * Map the doorbell page for the secondary process through IB device handle.
1129  */
1130 static int
1131 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
1132 {
1133 	struct mana_process_priv *priv = eth_dev->process_private;
1134 
1135 	void *addr;
1136 
1137 	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
1138 	if (addr == MAP_FAILED) {
1139 		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
1140 			eth_dev->data->port_id);
1141 		return -ENOMEM;
1142 	}
1143 
1144 	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
1145 
1146 	priv->db_page = addr;
1147 
1148 	return 0;
1149 }
1150 
1151 /* Initialize shared data for the driver (all devices) */
1152 static int
1153 mana_init_shared_data(void)
1154 {
1155 	int ret = 0;
1156 	const struct rte_memzone *secondary_mz;
1157 
1158 	rte_spinlock_lock(&mana_shared_data_lock);
1159 
1160 	/* Skip if shared data is already initialized */
1161 	if (mana_shared_data)
1162 		goto exit;
1163 
1164 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1165 		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
1166 						     sizeof(*mana_shared_data),
1167 						     SOCKET_ID_ANY, 0);
1168 		if (!mana_shared_mz) {
1169 			DRV_LOG(ERR, "Cannot allocate mana shared data");
1170 			ret = -rte_errno;
1171 			goto exit;
1172 		}
1173 
1174 		mana_shared_data = mana_shared_mz->addr;
1175 		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
1176 		rte_spinlock_init(&mana_shared_data->lock);
1177 	} else {
1178 		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
1179 		if (!secondary_mz) {
1180 			DRV_LOG(ERR, "Cannot attach mana shared data");
1181 			ret = -rte_errno;
1182 			goto exit;
1183 		}
1184 
1185 		mana_shared_data = secondary_mz->addr;
1186 		memset(&mana_local_data, 0, sizeof(mana_local_data));
1187 	}
1188 
1189 exit:
1190 	rte_spinlock_unlock(&mana_shared_data_lock);
1191 
1192 	return ret;
1193 }
1194 
1195 /*
1196  * Init the data structures for use in primary and secondary processes.
1197  */
1198 static int
1199 mana_init_once(void)
1200 {
1201 	int ret;
1202 
1203 	ret = mana_init_shared_data();
1204 	if (ret)
1205 		return ret;
1206 
1207 	rte_spinlock_lock(&mana_shared_data->lock);
1208 
1209 	switch (rte_eal_process_type()) {
1210 	case RTE_PROC_PRIMARY:
1211 		if (mana_shared_data->init_done)
1212 			break;
1213 
1214 		ret = mana_mp_init_primary();
1215 		if (ret)
1216 			break;
1217 		DRV_LOG(DEBUG, "MP INIT PRIMARY");
1218 
1219 		mana_shared_data->init_done = 1;
1220 		break;
1221 
1222 	case RTE_PROC_SECONDARY:
1223 
1224 		if (mana_local_data.init_done)
1225 			break;
1226 
1227 		ret = mana_mp_init_secondary();
1228 		if (ret)
1229 			break;
1230 
1231 		DRV_LOG(DEBUG, "MP INIT SECONDARY");
1232 
1233 		mana_local_data.init_done = 1;
1234 		break;
1235 
1236 	default:
1237 		/* Impossible, internal error */
1238 		ret = -EPROTO;
1239 		break;
1240 	}
1241 
1242 	rte_spinlock_unlock(&mana_shared_data->lock);
1243 
1244 	return ret;
1245 }
1246 
1247 /*
1248  * Probe an IB port
1249  * Return value:
1250  * positive value: successfully probed port
1251  * 0: port not matching specified MAC address
1252  * negative value: error code
1253  */
1254 static int
1255 mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
1256 		uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
1257 {
1258 	struct mana_priv *priv = NULL;
1259 	struct rte_eth_dev *eth_dev = NULL;
1260 	struct ibv_parent_domain_init_attr attr = {0};
1261 	char address[64];
1262 	char name[RTE_ETH_NAME_MAX_LEN];
1263 	int ret;
1264 	struct ibv_context *ctx = NULL;
1265 
1266 	rte_ether_format_addr(address, sizeof(address), addr);
1267 	DRV_LOG(INFO, "device located port %u address %s", port, address);
1268 
1269 	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
1270 				  SOCKET_ID_ANY);
1271 	if (!priv)
1272 		return -ENOMEM;
1273 
1274 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
1275 
1276 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1277 		int fd;
1278 
1279 		eth_dev = rte_eth_dev_attach_secondary(name);
1280 		if (!eth_dev) {
1281 			DRV_LOG(ERR, "Can't attach to dev %s", name);
1282 			ret = -ENOMEM;
1283 			goto failed;
1284 		}
1285 
1286 		eth_dev->device = &pci_dev->device;
1287 		eth_dev->dev_ops = &mana_dev_secondary_ops;
1288 		ret = mana_proc_priv_init(eth_dev);
1289 		if (ret)
1290 			goto failed;
1291 		priv->process_priv = eth_dev->process_private;
1292 
1293 		/* Get the IB FD from the primary process */
1294 		fd = mana_mp_req_verbs_cmd_fd(eth_dev);
1295 		if (fd < 0) {
1296 			DRV_LOG(ERR, "Failed to get FD %d", fd);
1297 			ret = -ENODEV;
1298 			goto failed;
1299 		}
1300 
1301 		ret = mana_map_doorbell_secondary(eth_dev, fd);
1302 		if (ret) {
1303 			DRV_LOG(ERR, "Failed secondary map %d", fd);
1304 			goto failed;
1305 		}
1306 
1307 		/* fd is not used after mapping doorbell */
1308 		close(fd);
1309 
1310 		eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1311 		eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1312 
1313 		rte_spinlock_lock(&mana_shared_data->lock);
1314 		mana_shared_data->secondary_cnt++;
1315 		mana_local_data.secondary_cnt++;
1316 		rte_spinlock_unlock(&mana_shared_data->lock);
1317 
1318 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1319 		rte_eth_dev_probing_finish(eth_dev);
1320 
1321 		return 0;
1322 	}
1323 
1324 	ctx = ibv_open_device(ibdev);
1325 	if (!ctx) {
1326 		DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
1327 		ret = -ENODEV;
1328 		goto failed;
1329 	}
1330 
1331 	eth_dev = rte_eth_dev_allocate(name);
1332 	if (!eth_dev) {
1333 		ret = -ENOMEM;
1334 		goto failed;
1335 	}
1336 
1337 	eth_dev->data->mac_addrs =
1338 		rte_calloc("mana_mac", 1,
1339 			   sizeof(struct rte_ether_addr), 0);
1340 	if (!eth_dev->data->mac_addrs) {
1341 		ret = -ENOMEM;
1342 		goto failed;
1343 	}
1344 
1345 	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
1346 
1347 	priv->ib_pd = ibv_alloc_pd(ctx);
1348 	if (!priv->ib_pd) {
1349 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
1350 		ret = -ENOMEM;
1351 		goto failed;
1352 	}
1353 
1354 	/* Create a parent domain with the port number */
1355 	attr.pd = priv->ib_pd;
1356 	attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
1357 	attr.pd_context = (void *)(uintptr_t)port;
1358 	priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
1359 	if (!priv->ib_parent_pd) {
1360 		DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
1361 		ret = -ENOMEM;
1362 		goto failed;
1363 	}
1364 
1365 	priv->ib_ctx = ctx;
1366 	priv->port_id = eth_dev->data->port_id;
1367 	priv->dev_port = port;
1368 	eth_dev->data->dev_private = priv;
1369 	priv->dev_data = eth_dev->data;
1370 
1371 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
1372 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
1373 
1374 	priv->max_rx_desc =
1375 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1376 			dev_attr->orig_attr.max_cqe);
1377 	priv->max_tx_desc =
1378 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1379 			dev_attr->orig_attr.max_cqe);
1380 
1381 	priv->max_send_sge = dev_attr->orig_attr.max_sge;
1382 	priv->max_recv_sge = dev_attr->orig_attr.max_sge;
1383 
1384 	priv->max_mr = dev_attr->orig_attr.max_mr;
1385 	priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
1386 
1387 	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d",
1388 		name, priv->max_rx_queues, priv->max_rx_desc,
1389 		priv->max_send_sge);
1390 
1391 	rte_eth_copy_pci_info(eth_dev, pci_dev);
1392 
1393 	/* Create async interrupt handler */
1394 	ret = mana_intr_install(eth_dev, priv);
1395 	if (ret) {
1396 		DRV_LOG(ERR, "Failed to install intr handler");
1397 		goto failed;
1398 	}
1399 
1400 	rte_spinlock_lock(&mana_shared_data->lock);
1401 	mana_shared_data->primary_cnt++;
1402 	rte_spinlock_unlock(&mana_shared_data->lock);
1403 
1404 	eth_dev->device = &pci_dev->device;
1405 
1406 	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
1407 
1408 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1409 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1410 	eth_dev->dev_ops = &mana_dev_ops;
1411 
1412 	rte_eth_dev_probing_finish(eth_dev);
1413 
1414 	return 0;
1415 
1416 failed:
1417 	/* Free the resources allocated for the failed port */
1418 	if (priv) {
1419 		if (priv->ib_parent_pd)
1420 			ibv_dealloc_pd(priv->ib_parent_pd);
1421 
1422 		if (priv->ib_pd)
1423 			ibv_dealloc_pd(priv->ib_pd);
1424 	}
1425 
1426 	if (eth_dev)
1427 		rte_eth_dev_release_port(eth_dev);
1428 
1429 	rte_free(priv);
1430 
1431 	if (ctx)
1432 		ibv_close_device(ctx);
1433 
1434 	return ret;
1435 }
1436 
1437 /*
1438  * Goes through the IB device list to look for the IB port matching the
1439  * mac_addr. If found, create a rte_eth_dev for it.
1440  * Return value: number of successfully probed devices
1441  */
1442 static int
1443 mana_pci_probe_mac(struct rte_pci_device *pci_dev,
1444 		   struct rte_ether_addr *mac_addr)
1445 {
1446 	struct ibv_device **ibv_list;
1447 	int ibv_idx;
1448 	struct ibv_context *ctx;
1449 	int num_devices;
1450 	int ret;
1451 	uint8_t port;
1452 	int count = 0;
1453 
1454 	ibv_list = ibv_get_device_list(&num_devices);
1455 	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
1456 		struct ibv_device *ibdev = ibv_list[ibv_idx];
1457 		struct rte_pci_addr pci_addr;
1458 		struct ibv_device_attr_ex dev_attr;
1459 
1460 		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
1461 			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
1462 
1463 		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
1464 			continue;
1465 
1466 		/* Ignore if this IB device is not this PCI device */
1467 		if (pci_dev->addr.domain != pci_addr.domain ||
1468 		    pci_dev->addr.bus != pci_addr.bus ||
1469 		    pci_dev->addr.devid != pci_addr.devid ||
1470 		    pci_dev->addr.function != pci_addr.function)
1471 			continue;
1472 
1473 		ctx = ibv_open_device(ibdev);
1474 		if (!ctx) {
1475 			DRV_LOG(ERR, "Failed to open IB device %s",
1476 				ibdev->name);
1477 			continue;
1478 		}
1479 		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
1480 		ibv_close_device(ctx);
1481 
1482 		if (ret) {
1483 			DRV_LOG(ERR, "Failed to query IB device %s",
1484 				ibdev->name);
1485 			continue;
1486 		}
1487 
1488 		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
1489 		     port++) {
1490 			struct rte_ether_addr addr;
1491 			ret = get_port_mac(ibdev, port, &addr);
1492 			if (ret)
1493 				continue;
1494 
1495 			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
1496 				continue;
1497 
1498 			ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
1499 			if (ret) {
1500 				DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
1501 			} else {
1502 				count++;
1503 				DRV_LOG(INFO, "Successfully probed on IB port %u", port);
1504 			}
1505 		}
1506 	}
1507 
1508 	ibv_free_device_list(ibv_list);
1509 	return count;
1510 }
1511 
1512 /*
1513  * Main callback function from PCI bus to probe a device.
1514  */
1515 static int
1516 mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1517 	       struct rte_pci_device *pci_dev)
1518 {
1519 	struct rte_devargs *args = pci_dev->device.devargs;
1520 	struct mana_conf conf = {0};
1521 	unsigned int i;
1522 	int ret;
1523 	int count = 0;
1524 
1525 	if (args && args->drv_str) {
1526 		ret = mana_parse_args(args, &conf);
1527 		if (ret) {
1528 			DRV_LOG(ERR, "Failed to parse parameters args = %s",
1529 				args->drv_str);
1530 			return ret;
1531 		}
1532 	}
1533 
1534 	ret = mana_init_once();
1535 	if (ret) {
1536 		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
1537 		return ret;
1538 	}
1539 
1540 	/* Probe on the MAC addresses given in devargs; if none, probe on all ports */
1541 	if (conf.index) {
1542 		for (i = 0; i < conf.index; i++)
1543 			count += mana_pci_probe_mac(pci_dev,
1544 						    &conf.mac_array[i]);
1545 	} else {
1546 		count = mana_pci_probe_mac(pci_dev, NULL);
1547 	}
1548 
1549 	if (!count) {
1550 		rte_memzone_free(mana_shared_mz);
1551 		mana_shared_mz = NULL;
1552 		ret = -ENODEV;
1553 	}
1554 
1555 	return ret;
1556 }
1557 
1558 static int
1559 mana_dev_uninit(struct rte_eth_dev *dev)
1560 {
1561 	return mana_dev_close(dev);
1562 }
1563 
1564 /*
1565  * Callback from PCI to remove this device.
1566  */
1567 static int
1568 mana_pci_remove(struct rte_pci_device *pci_dev)
1569 {
1570 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1571 		rte_spinlock_lock(&mana_shared_data_lock);
1572 
1573 		rte_spinlock_lock(&mana_shared_data->lock);
1574 
1575 		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
1576 		mana_shared_data->primary_cnt--;
1577 		if (!mana_shared_data->primary_cnt) {
1578 			DRV_LOG(DEBUG, "mp uninit primary");
1579 			mana_mp_uninit_primary();
1580 		}
1581 
1582 		rte_spinlock_unlock(&mana_shared_data->lock);
1583 
1584 		/* Also free the shared memory if this is the last */
1585 		if (!mana_shared_data->primary_cnt) {
1586 			DRV_LOG(DEBUG, "free shared memzone data");
1587 			rte_memzone_free(mana_shared_mz);
1588 			mana_shared_mz = NULL;
1589 		}
1590 
1591 		rte_spinlock_unlock(&mana_shared_data_lock);
1592 	} else {
1593 		rte_spinlock_lock(&mana_shared_data_lock);
1594 
1595 		rte_spinlock_lock(&mana_shared_data->lock);
1596 		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
1597 		mana_shared_data->secondary_cnt--;
1598 		rte_spinlock_unlock(&mana_shared_data->lock);
1599 
1600 		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
1601 		mana_local_data.secondary_cnt--;
1602 		if (!mana_local_data.secondary_cnt) {
1603 			DRV_LOG(DEBUG, "mp uninit secondary");
1604 			mana_mp_uninit_secondary();
1605 		}
1606 
1607 		rte_spinlock_unlock(&mana_shared_data_lock);
1608 	}
1609 
1610 	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
1611 }
1612 
1613 static const struct rte_pci_id mana_pci_id_map[] = {
1614 	{
1615 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
1616 			       PCI_DEVICE_ID_MICROSOFT_MANA)
1617 	},
1618 	{
1619 		.vendor_id = 0
1620 	},
1621 };
1622 
1623 static struct rte_pci_driver mana_pci_driver = {
1624 	.id_table = mana_pci_id_map,
1625 	.probe = mana_pci_probe,
1626 	.remove = mana_pci_remove,
1627 	.drv_flags = RTE_PCI_DRV_INTR_RMV,
1628 };
1629 
1630 RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
1631 RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
1632 RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
1633 RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
1634 RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
1635 RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
1636