xref: /dpdk/drivers/net/mana/mana.c (revision eb704df7e27df838ba7ec9bcd034bf0aaee405cd)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2022 Microsoft Corporation
3  */
4 
5 #include <unistd.h>
6 #include <dirent.h>
7 #include <fcntl.h>
8 #include <sys/mman.h>
9 #include <sys/ioctl.h>
10 #include <net/if.h>
11 
12 #include <ethdev_driver.h>
13 #include <ethdev_pci.h>
14 #include <rte_kvargs.h>
15 #include <rte_eal_paging.h>
16 #include <rte_pci.h>
17 
18 #include <infiniband/verbs.h>
19 #include <infiniband/manadv.h>
20 
21 #include <assert.h>
22 
23 #include "mana.h"
24 
25 /* Shared memory between primary/secondary processes, per driver */
26 /* Data to track primary/secondary usage */
27 struct mana_shared_data *mana_shared_data;
28 static struct mana_shared_data mana_local_data;
29 
30 /* The memory region for the above data */
31 static const struct rte_memzone *mana_shared_mz;
32 static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
33 
34 /* Spinlock for mana_shared_data */
35 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
36 
37 /* Allocate a buffer on the stack and fill it with a printf format string. */
38 #define MANA_MKSTR(name, ...) \
39 	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
40 	char name[mkstr_size_##name + 1]; \
41 	\
42 	memset(name, 0, mkstr_size_##name + 1); \
43 	snprintf(name, sizeof(name), "" __VA_ARGS__)
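
/*
 * Usage sketch (illustrative only): MANA_MKSTR(path, "%s/net", dir) declares
 * a stack (VLA) buffer named "path" sized exactly for the formatted string
 * and fills it. Because it declares variables, it can only appear where
 * declarations are allowed, and the buffer must not outlive the enclosing
 * block.
 */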
44 
45 int mana_logtype_driver;
46 int mana_logtype_init;
47 
48 /*
49  * Callback from rdma-core to allocate a buffer for a queue.
50  */
51 void *
52 mana_alloc_verbs_buf(size_t size, void *data)
53 {
54 	void *ret;
55 	size_t alignment = rte_mem_page_size();
56 	int socket = (int)(uintptr_t)data;
57 
58 	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
59 
60 	if (alignment == (size_t)-1) {
61 		DRV_LOG(ERR, "Failed to get mem page size");
62 		rte_errno = ENOMEM;
63 		return NULL;
64 	}
65 
66 	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
67 	if (!ret && size)
68 		rte_errno = ENOMEM;
69 	return ret;
70 }
71 
72 void
73 mana_free_verbs_buf(void *ptr, void *data __rte_unused)
74 {
75 	rte_free(ptr);
76 }
77 
78 static int
79 mana_dev_configure(struct rte_eth_dev *dev)
80 {
81 	struct mana_priv *priv = dev->data->dev_private;
82 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
83 
84 	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
85 		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
86 
87 	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
88 		DRV_LOG(ERR, "Only an equal number of RX/TX queues is supported");
89 		return -EINVAL;
90 	}
91 
92 	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
93 		DRV_LOG(ERR, "Number of RX/TX queues must be a power of 2");
94 		return -EINVAL;
95 	}
96 
97 	priv->num_queues = dev->data->nb_rx_queues;
98 
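	/*
	 * Route rdma-core's internal queue buffer allocations through the
	 * DPDK allocator (mana_alloc_verbs_buf/mana_free_verbs_buf) so the
	 * buffers come from DPDK-managed memory.
	 */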
99 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
100 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
101 					.alloc = &mana_alloc_verbs_buf,
102 					.free = &mana_free_verbs_buf,
103 					.data = 0,
104 				}));
105 
106 	return 0;
107 }
108 
109 static void
110 rx_intr_vec_disable(struct mana_priv *priv)
111 {
112 	struct rte_intr_handle *intr_handle = priv->intr_handle;
113 
114 	rte_intr_free_epoll_fd(intr_handle);
115 	rte_intr_vec_list_free(intr_handle);
116 	rte_intr_nb_efd_set(intr_handle, 0);
117 }
118 
119 static int
120 rx_intr_vec_enable(struct mana_priv *priv)
121 {
122 	unsigned int i;
123 	unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
124 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
125 	struct rte_intr_handle *intr_handle = priv->intr_handle;
126 	int ret;
127 
128 	rx_intr_vec_disable(priv);
129 
130 	if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
131 		DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
132 		return -ENOMEM;
133 	}
134 
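	/*
	 * Map each RX queue's completion-channel fd to an interrupt vector
	 * (offset by RTE_INTR_VEC_RXTX_OFFSET) so the ethdev RX interrupt
	 * API can poll the right fd for each queue.
	 */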
135 	for (i = 0; i < n; i++) {
136 		struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
137 
138 		ret = rte_intr_vec_list_index_set(intr_handle, i,
139 						  RTE_INTR_VEC_RXTX_OFFSET + i);
140 		if (ret) {
141 			DRV_LOG(ERR, "Failed to set intr vec %u", i);
142 			return ret;
143 		}
144 
145 		ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
146 		if (ret) {
147 			DRV_LOG(ERR, "Failed to set FD at intr %u", i);
148 			return ret;
149 		}
150 	}
151 
152 	return rte_intr_nb_efd_set(intr_handle, n);
153 }
154 
155 static void
156 rxq_intr_disable(struct mana_priv *priv)
157 {
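	/* Preserve rte_errno across the cleanup below */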
158 	int err = rte_errno;
159 
160 	rx_intr_vec_disable(priv);
161 	rte_errno = err;
162 }
163 
164 static int
165 rxq_intr_enable(struct mana_priv *priv)
166 {
167 	const struct rte_eth_intr_conf *const intr_conf =
168 		&priv->dev_data->dev_conf.intr_conf;
169 
170 	if (!intr_conf->rxq)
171 		return 0;
172 
173 	return rx_intr_vec_enable(priv);
174 }
175 
176 static int
177 mana_dev_start(struct rte_eth_dev *dev)
178 {
179 	int ret;
180 	struct mana_priv *priv = dev->data->dev_private;
181 
182 	rte_spinlock_init(&priv->mr_btree_lock);
183 	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
184 				 dev->device->numa_node);
185 	if (ret) {
186 		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
187 		return ret;
188 	}
189 
190 	ret = mana_start_tx_queues(dev);
191 	if (ret) {
192 		DRV_LOG(ERR, "failed to start tx queues %d", ret);
193 		goto failed_tx;
194 	}
195 
196 	ret = mana_start_rx_queues(dev);
197 	if (ret) {
198 		DRV_LOG(ERR, "failed to start rx queues %d", ret);
199 		goto failed_rx;
200 	}
201 
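	/*
	 * Make sure the queue state set up above is globally visible before
	 * the real burst functions are published and secondaries are started.
	 */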
202 	rte_wmb();
203 
204 	dev->tx_pkt_burst = mana_tx_burst;
205 	dev->rx_pkt_burst = mana_rx_burst;
206 
207 	DRV_LOG(INFO, "TX/RX queues have started");
208 
209 	/* Enable datapath for secondary processes */
210 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
211 
212 	ret = rxq_intr_enable(priv);
213 	if (ret) {
214 		DRV_LOG(ERR, "Failed to enable RX interrupts");
215 		goto failed_intr;
216 	}
217 
218 	return 0;
219 
220 failed_intr:
221 	mana_stop_rx_queues(dev);
222 
223 failed_rx:
224 	mana_stop_tx_queues(dev);
225 
226 failed_tx:
227 	mana_mr_btree_free(&priv->mr_btree);
228 
229 	return ret;
230 }
231 
232 static int
233 mana_dev_stop(struct rte_eth_dev *dev)
234 {
235 	int ret;
236 	struct mana_priv *priv = dev->data->dev_private;
237 
238 	rxq_intr_disable(priv);
239 
240 	dev->tx_pkt_burst = mana_tx_burst_removed;
241 	dev->rx_pkt_burst = mana_rx_burst_removed;
242 
243 	/* Stop datapath on secondary processes */
244 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
245 
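	/*
	 * Make sure the dummy burst functions are visible everywhere before
	 * the queues are torn down below.
	 */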
246 	rte_wmb();
247 
248 	ret = mana_stop_tx_queues(dev);
249 	if (ret) {
250 		DRV_LOG(ERR, "failed to stop tx queues");
251 		return ret;
252 	}
253 
254 	ret = mana_stop_rx_queues(dev);
255 	if (ret) {
256 		DRV_LOG(ERR, "failed to stop rx queues");
257 		return ret;
258 	}
259 
260 	return 0;
261 }
262 
263 static int mana_intr_uninstall(struct mana_priv *priv);
264 
265 static int
266 mana_dev_close(struct rte_eth_dev *dev)
267 {
268 	struct mana_priv *priv = dev->data->dev_private;
269 	int ret;
270 
271 	mana_remove_all_mr(priv);
272 
273 	ret = mana_intr_uninstall(priv);
274 	if (ret)
275 		return ret;
276 
277 	ret = ibv_close_device(priv->ib_ctx);
278 	if (ret) {
279 		ret = errno;
280 		return ret;
281 	}
282 
283 	return 0;
284 }
285 
286 static int
287 mana_dev_info_get(struct rte_eth_dev *dev,
288 		  struct rte_eth_dev_info *dev_info)
289 {
290 	struct mana_priv *priv = dev->data->dev_private;
291 
292 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
293 	dev_info->max_mtu = MANA_MAX_MTU;
294 
295 	/* RX params */
296 	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
297 	dev_info->max_rx_pktlen = MANA_MAX_MTU + RTE_ETHER_HDR_LEN;
298 
299 	dev_info->max_rx_queues = RTE_MIN(priv->max_rx_queues, UINT16_MAX);
300 	dev_info->max_tx_queues = RTE_MIN(priv->max_tx_queues, UINT16_MAX);
301 
302 	dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
303 	dev_info->max_hash_mac_addrs = 0;
304 
305 	dev_info->max_vfs = 1;
306 
307 	/* Offload params */
308 	dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
309 
310 	dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
311 
312 	/* RSS */
313 	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
314 	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
315 	dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
316 
317 	/* Thresholds */
318 	dev_info->default_rxconf = (struct rte_eth_rxconf){
319 		.rx_thresh = {
320 			.pthresh = 8,
321 			.hthresh = 8,
322 			.wthresh = 0,
323 		},
324 		.rx_free_thresh = 32,
325 		/* If no descriptors available, pkts are dropped by default */
326 		.rx_drop_en = 1,
327 	};
328 
329 	dev_info->default_txconf = (struct rte_eth_txconf){
330 		.tx_thresh = {
331 			.pthresh = 32,
332 			.hthresh = 0,
333 			.wthresh = 0,
334 		},
335 		.tx_rs_thresh = 32,
336 		.tx_free_thresh = 32,
337 	};
338 
339 	/* Buffer limits */
340 	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
341 	dev_info->rx_desc_lim.nb_max = RTE_MIN(priv->max_rx_desc, UINT16_MAX);
342 	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
343 	dev_info->rx_desc_lim.nb_seg_max =
344 		RTE_MIN(priv->max_recv_sge, UINT16_MAX);
345 	dev_info->rx_desc_lim.nb_mtu_seg_max =
346 		RTE_MIN(priv->max_recv_sge, UINT16_MAX);
347 
348 	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
349 	dev_info->tx_desc_lim.nb_max = RTE_MIN(priv->max_tx_desc, UINT16_MAX);
350 	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
351 	dev_info->tx_desc_lim.nb_seg_max =
352 		RTE_MIN(priv->max_send_sge, UINT16_MAX);
353 	dev_info->tx_desc_lim.nb_mtu_seg_max =
354 		RTE_MIN(priv->max_send_sge, UINT16_MAX);
355 
356 	/* Speed */
357 	dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
358 
359 	/* RX params */
360 	dev_info->default_rxportconf.burst_size = 1;
361 	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
362 	dev_info->default_rxportconf.nb_queues = 1;
363 
364 	/* TX params */
365 	dev_info->default_txportconf.burst_size = 1;
366 	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
367 	dev_info->default_txportconf.nb_queues = 1;
368 
369 	return 0;
370 }
371 
372 static void
373 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
374 		       struct rte_eth_txq_info *qinfo)
375 {
376 	struct mana_txq *txq = dev->data->tx_queues[queue_id];
377 
378 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
379 	qinfo->nb_desc = txq->num_desc;
380 }
381 
382 static void
383 mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
384 		       struct rte_eth_rxq_info *qinfo)
385 {
386 	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
387 
388 	qinfo->mp = rxq->mp;
389 	qinfo->nb_desc = rxq->num_desc;
390 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
391 }
392 
393 static const uint32_t *
394 mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused)
395 {
396 	static const uint32_t ptypes[] = {
397 		RTE_PTYPE_L2_ETHER,
398 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
399 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
400 		RTE_PTYPE_L4_FRAG,
401 		RTE_PTYPE_L4_TCP,
402 		RTE_PTYPE_L4_UDP,
403 		RTE_PTYPE_UNKNOWN
404 	};
405 
406 	return ptypes;
407 }
408 
409 static int
410 mana_rss_hash_update(struct rte_eth_dev *dev,
411 		     struct rte_eth_rss_conf *rss_conf)
412 {
413 	struct mana_priv *priv = dev->data->dev_private;
414 
415 	/* Currently can only update RSS hash when device is stopped */
416 	if (dev->data->dev_started) {
417 		DRV_LOG(ERR, "Can't update RSS after device has started");
418 		return -ENODEV;
419 	}
420 
421 	if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
422 		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
423 			dev->data->port_id, rss_conf->rss_hf);
424 		return -EINVAL;
425 	}
426 
427 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
428 		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
429 			DRV_LOG(ERR, "Port %u key len must be %u long",
430 				dev->data->port_id,
431 				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
432 			return -EINVAL;
433 		}
434 
435 		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
436 		priv->rss_conf.rss_key =
437 			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
438 				    RTE_CACHE_LINE_SIZE);
439 		if (!priv->rss_conf.rss_key)
440 			return -ENOMEM;
441 		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
442 		       rss_conf->rss_key_len);
443 	}
444 	priv->rss_conf.rss_hf = rss_conf->rss_hf;
445 
446 	return 0;
447 }
448 
449 static int
450 mana_rss_hash_conf_get(struct rte_eth_dev *dev,
451 		       struct rte_eth_rss_conf *rss_conf)
452 {
453 	struct mana_priv *priv = dev->data->dev_private;
454 
455 	if (!rss_conf)
456 		return -EINVAL;
457 
458 	if (rss_conf->rss_key &&
459 	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
460 		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
461 		       priv->rss_conf.rss_key_len);
462 	}
463 
464 	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
465 	rss_conf->rss_hf = priv->rss_conf.rss_hf;
466 
467 	return 0;
468 }
469 
470 static int
471 mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
472 			uint16_t nb_desc, unsigned int socket_id,
473 			const struct rte_eth_txconf *tx_conf __rte_unused)
474 
475 {
476 	struct mana_priv *priv = dev->data->dev_private;
477 	struct mana_txq *txq;
478 	int ret;
479 
480 	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
481 	if (!txq) {
482 		DRV_LOG(ERR, "failed to allocate txq");
483 		return -ENOMEM;
484 	}
485 
486 	txq->socket = socket_id;
487 
488 	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
489 					   sizeof(struct mana_txq_desc) *
490 						nb_desc,
491 					   RTE_CACHE_LINE_SIZE, socket_id);
492 	if (!txq->desc_ring) {
493 		DRV_LOG(ERR, "failed to allocate txq desc_ring");
494 		ret = -ENOMEM;
495 		goto fail;
496 	}
497 
498 	txq->gdma_comp_buf = rte_malloc_socket("mana_txq_comp",
499 			sizeof(*txq->gdma_comp_buf) * nb_desc,
500 			RTE_CACHE_LINE_SIZE, socket_id);
501 	if (!txq->gdma_comp_buf) {
502 		DRV_LOG(ERR, "failed to allocate txq comp");
503 		ret = -ENOMEM;
504 		goto fail;
505 	}
506 
507 	ret = mana_mr_btree_init(&txq->mr_btree,
508 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
509 	if (ret) {
510 		DRV_LOG(ERR, "Failed to init TXQ MR btree");
511 		goto fail;
512 	}
513 
514 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
515 		queue_idx, nb_desc, socket_id, txq->desc_ring);
516 
517 	txq->desc_ring_head = 0;
518 	txq->desc_ring_tail = 0;
519 	txq->priv = priv;
520 	txq->num_desc = nb_desc;
521 	dev->data->tx_queues[queue_idx] = txq;
522 
523 	return 0;
524 
525 fail:
526 	rte_free(txq->gdma_comp_buf);
527 	rte_free(txq->desc_ring);
528 	rte_free(txq);
529 	return ret;
530 }
531 
532 static void
533 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
534 {
535 	struct mana_txq *txq = dev->data->tx_queues[qid];
536 
537 	mana_mr_btree_free(&txq->mr_btree);
538 
539 	rte_free(txq->gdma_comp_buf);
540 	rte_free(txq->desc_ring);
541 	rte_free(txq);
542 }
543 
544 static int
545 mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
546 			uint16_t nb_desc, unsigned int socket_id,
547 			const struct rte_eth_rxconf *rx_conf __rte_unused,
548 			struct rte_mempool *mp)
549 {
550 	struct mana_priv *priv = dev->data->dev_private;
551 	struct mana_rxq *rxq;
552 	int ret;
553 
554 	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
555 	if (!rxq) {
556 		DRV_LOG(ERR, "failed to allocate rxq");
557 		return -ENOMEM;
558 	}
559 
560 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
561 		queue_idx, nb_desc, socket_id);
562 
563 	rxq->socket = socket_id;
564 
565 	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
566 					    sizeof(struct mana_rxq_desc) *
567 						nb_desc,
568 					    RTE_CACHE_LINE_SIZE, socket_id);
569 
570 	if (!rxq->desc_ring) {
571 		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
572 		ret = -ENOMEM;
573 		goto fail;
574 	}
575 
576 	rxq->desc_ring_head = 0;
577 	rxq->desc_ring_tail = 0;
578 
579 	rxq->gdma_comp_buf = rte_malloc_socket("mana_rxq_comp",
580 			sizeof(*rxq->gdma_comp_buf) * nb_desc,
581 			RTE_CACHE_LINE_SIZE, socket_id);
582 	if (!rxq->gdma_comp_buf) {
583 		DRV_LOG(ERR, "failed to allocate rxq comp");
584 		ret = -ENOMEM;
585 		goto fail;
586 	}
587 
588 	ret = mana_mr_btree_init(&rxq->mr_btree,
589 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
590 	if (ret) {
591 		DRV_LOG(ERR, "Failed to init RXQ MR btree");
592 		goto fail;
593 	}
594 
595 	rxq->priv = priv;
596 	rxq->num_desc = nb_desc;
597 	rxq->mp = mp;
598 	dev->data->rx_queues[queue_idx] = rxq;
599 
600 	return 0;
601 
602 fail:
603 	rte_free(rxq->gdma_comp_buf);
604 	rte_free(rxq->desc_ring);
605 	rte_free(rxq);
606 	return ret;
607 }
608 
609 static void
610 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
611 {
612 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
613 
614 	mana_mr_btree_free(&rxq->mr_btree);
615 
616 	rte_free(rxq->gdma_comp_buf);
617 	rte_free(rxq->desc_ring);
618 	rte_free(rxq);
619 }
620 
621 static int
622 mana_dev_link_update(struct rte_eth_dev *dev,
623 		     int wait_to_complete __rte_unused)
624 {
625 	struct rte_eth_link link;
626 
627 	/* MANA has no concept of carrier state, so the link is always reported UP */
628 	link = (struct rte_eth_link) {
629 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
630 		.link_autoneg = RTE_ETH_LINK_FIXED,
631 		.link_speed = RTE_ETH_SPEED_NUM_100G,
632 		.link_status = RTE_ETH_LINK_UP,
633 	};
634 
635 	return rte_eth_linkstatus_set(dev, &link);
636 }
637 
638 static int
639 mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
640 {
641 	unsigned int i;
642 
643 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
644 		struct mana_txq *txq = dev->data->tx_queues[i];
645 
646 		if (!txq)
647 			continue;
648 
649 		stats->opackets += txq->stats.packets;
650 		stats->obytes += txq->stats.bytes;
651 		stats->oerrors += txq->stats.errors;
652 
653 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
654 			stats->q_opackets[i] = txq->stats.packets;
655 			stats->q_obytes[i] = txq->stats.bytes;
656 		}
657 	}
658 
659 	stats->rx_nombuf = 0;
660 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
661 		struct mana_rxq *rxq = dev->data->rx_queues[i];
662 
663 		if (!rxq)
664 			continue;
665 
666 		stats->ipackets += rxq->stats.packets;
667 		stats->ibytes += rxq->stats.bytes;
668 		stats->ierrors += rxq->stats.errors;
669 
670 		/* There is no good way to get stats->imissed, so it is left unset */
671 
672 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
673 			stats->q_ipackets[i] = rxq->stats.packets;
674 			stats->q_ibytes[i] = rxq->stats.bytes;
675 		}
676 
677 		stats->rx_nombuf += rxq->stats.nombuf;
678 	}
679 
680 	return 0;
681 }
682 
683 static int
684 mana_dev_stats_reset(struct rte_eth_dev *dev)
685 {
686 	unsigned int i;
687 
688 	PMD_INIT_FUNC_TRACE();
689 
690 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
691 		struct mana_txq *txq = dev->data->tx_queues[i];
692 
693 		if (!txq)
694 			continue;
695 
696 		memset(&txq->stats, 0, sizeof(txq->stats));
697 	}
698 
699 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
700 		struct mana_rxq *rxq = dev->data->rx_queues[i];
701 
702 		if (!rxq)
703 			continue;
704 
705 		memset(&rxq->stats, 0, sizeof(rxq->stats));
706 	}
707 
708 	return 0;
709 }
710 
711 static int
712 mana_get_ifname(const struct mana_priv *priv, char (*ifname)[IF_NAMESIZE])
713 {
714 	int ret;
715 	DIR *dir;
716 	struct dirent *dent;
717 
718 	MANA_MKSTR(dirpath, "%s/device/net", priv->ib_ctx->device->ibdev_path);
719 
720 	dir = opendir(dirpath);
721 	if (dir == NULL)
722 		return -ENODEV;
723 
724 	while ((dent = readdir(dir)) != NULL) {
725 		char *name = dent->d_name;
726 		FILE *file;
727 		struct rte_ether_addr addr;
728 		char *mac = NULL;
729 
730 		if ((name[0] == '.') &&
731 		    ((name[1] == '\0') ||
732 		     ((name[1] == '.') && (name[2] == '\0'))))
733 			continue;
734 
735 		MANA_MKSTR(path, "%s/%s/address", dirpath, name);
736 
737 		file = fopen(path, "r");
738 		if (!file) {
739 			ret = -ENODEV;
740 			break;
741 		}
742 
743 		ret = fscanf(file, "%ms", &mac);
744 		fclose(file);
745 
746 		if (ret <= 0) {
747 			ret = -EINVAL;
748 			break;
749 		}
750 
751 		ret = rte_ether_unformat_addr(mac, &addr);
752 		free(mac);
753 		if (ret)
754 			break;
755 
756 		if (rte_is_same_ether_addr(&addr, priv->dev_data->mac_addrs)) {
757 			strlcpy(*ifname, name, sizeof(*ifname));
758 			ret = 0;
759 			break;
760 		}
761 	}
762 
763 	closedir(dir);
764 	return ret;
765 }
766 
767 static int
768 mana_ifreq(const struct mana_priv *priv, int req, struct ifreq *ifr)
769 {
770 	int sock, ret;
771 
772 	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
773 	if (sock == -1)
774 		return -errno;
775 
776 	ret = mana_get_ifname(priv, &ifr->ifr_name);
777 	if (ret) {
778 		close(sock);
779 		return ret;
780 	}
781 
782 	if (ioctl(sock, req, ifr) == -1)
783 		ret = -errno;
784 
785 	close(sock);
786 
787 	return ret;
788 }
789 
790 static int
791 mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
792 {
793 	struct mana_priv *priv = dev->data->dev_private;
794 	struct ifreq request = { .ifr_mtu = mtu, };
795 
796 	return mana_ifreq(priv, SIOCSIFMTU, &request);
797 }
798 
799 static const struct eth_dev_ops mana_dev_ops = {
800 	.dev_configure		= mana_dev_configure,
801 	.dev_start		= mana_dev_start,
802 	.dev_stop		= mana_dev_stop,
803 	.dev_close		= mana_dev_close,
804 	.dev_infos_get		= mana_dev_info_get,
805 	.txq_info_get		= mana_dev_tx_queue_info,
806 	.rxq_info_get		= mana_dev_rx_queue_info,
807 	.dev_supported_ptypes_get = mana_supported_ptypes,
808 	.rss_hash_update	= mana_rss_hash_update,
809 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
810 	.tx_queue_setup		= mana_dev_tx_queue_setup,
811 	.tx_queue_release	= mana_dev_tx_queue_release,
812 	.rx_queue_setup		= mana_dev_rx_queue_setup,
813 	.rx_queue_release	= mana_dev_rx_queue_release,
814 	.rx_queue_intr_enable	= mana_rx_intr_enable,
815 	.rx_queue_intr_disable	= mana_rx_intr_disable,
816 	.link_update		= mana_dev_link_update,
817 	.stats_get		= mana_dev_stats_get,
818 	.stats_reset		= mana_dev_stats_reset,
819 	.mtu_set		= mana_mtu_set,
820 };
821 
822 static const struct eth_dev_ops mana_dev_secondary_ops = {
823 	.stats_get = mana_dev_stats_get,
824 	.stats_reset = mana_dev_stats_reset,
825 	.dev_infos_get = mana_dev_info_get,
826 };
827 
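/*
 * Dummy burst functions installed while the data path is stopped or not yet
 * started (including in secondary processes); they only issue a memory
 * barrier and report zero packets.
 */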
828 uint16_t
829 mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
830 		      struct rte_mbuf **pkts __rte_unused,
831 		      uint16_t pkts_n __rte_unused)
832 {
833 	rte_mb();
834 	return 0;
835 }
836 
837 uint16_t
838 mana_tx_burst_removed(void *dpdk_txq __rte_unused,
839 		      struct rte_mbuf **pkts __rte_unused,
840 		      uint16_t pkts_n __rte_unused)
841 {
842 	rte_mb();
843 	return 0;
844 }
845 
846 #define ETH_MANA_MAC_ARG "mac"
847 static const char * const mana_init_args[] = {
848 	ETH_MANA_MAC_ARG,
849 	NULL,
850 };
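
/*
 * Illustrative devargs usage (the PCI address below is a placeholder):
 *   -a 7870:00:00.0,mac=00:11:22:33:44:55,mac=00:11:22:33:44:66
 * probes only the IB ports whose MAC address matches one of the given values.
 */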
851 
852 /* Support parsing up to 8 MAC addresses from the EAL command line */
853 #define MAX_NUM_ADDRESS 8
854 struct mana_conf {
855 	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
856 	unsigned int index;
857 };
858 
859 static int
860 mana_arg_parse_callback(const char *key, const char *val, void *private)
861 {
862 	struct mana_conf *conf = (struct mana_conf *)private;
863 	int ret;
864 
865 	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
866 
867 	if (conf->index >= MAX_NUM_ADDRESS) {
868 		DRV_LOG(ERR, "Exceeding max MAC address");
869 		return 1;
870 	}
871 
872 	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
873 	if (ret) {
874 		DRV_LOG(ERR, "Invalid MAC address %s", val);
875 		return ret;
876 	}
877 
878 	conf->index++;
879 
880 	return 0;
881 }
882 
883 static int
884 mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
885 {
886 	struct rte_kvargs *kvlist;
887 	unsigned int arg_count;
888 	int ret = 0;
889 
890 	kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
891 	if (!kvlist) {
892 		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
893 		return -EINVAL;
894 	}
895 
896 	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
897 	if (arg_count > MAX_NUM_ADDRESS) {
898 		ret = -EINVAL;
899 		goto free_kvlist;
900 	}
901 	ret = rte_kvargs_process(kvlist, mana_init_args[0],
902 				 mana_arg_parse_callback, conf);
903 	if (ret) {
904 		DRV_LOG(ERR, "error parsing args");
905 		goto free_kvlist;
906 	}
907 
908 free_kvlist:
909 	rte_kvargs_free(kvlist);
910 	return ret;
911 }
912 
913 static int
914 get_port_mac(struct ibv_device *device, unsigned int port,
915 	     struct rte_ether_addr *addr)
916 {
917 	FILE *file;
918 	int ret = 0;
919 	DIR *dir;
920 	struct dirent *dent;
921 	unsigned int dev_port;
922 
923 	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
924 
925 	dir = opendir(path);
926 	if (!dir)
927 		return -ENOENT;
928 
929 	while ((dent = readdir(dir))) {
930 		char *name = dent->d_name;
931 		char *mac = NULL;
932 
933 		MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
934 
935 		/* Ignore . and .. */
936 		if ((name[0] == '.') &&
937 		    ((name[1] == '\0') ||
938 		     ((name[1] == '.') && (name[2] == '\0'))))
939 			continue;
940 
941 		file = fopen(port_path, "r");
942 		if (!file)
943 			continue;
944 
945 		ret = fscanf(file, "%u", &dev_port);
946 		fclose(file);
947 
948 		if (ret != 1)
949 			continue;
950 
951 		/* Ethernet ports start at 0, IB ports start at 1 */
952 		if (dev_port == port - 1) {
953 			MANA_MKSTR(address_path, "%s/%s/address", path, name);
954 
955 			file = fopen(address_path, "r");
956 			if (!file)
957 				continue;
958 
959 			ret = fscanf(file, "%ms", &mac);
960 			fclose(file);
961 
962 			if (ret < 0)
963 				break;
964 
965 			ret = rte_ether_unformat_addr(mac, addr);
966 			if (ret)
967 				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
968 
969 			free(mac);
970 			break;
971 		}
972 	}
973 
974 	closedir(dir);
975 	return ret;
976 }
977 
978 static int
979 mana_ibv_device_to_pci_addr(const struct ibv_device *device,
980 			    struct rte_pci_addr *pci_addr)
981 {
982 	FILE *file;
983 	char *line = NULL;
984 	size_t len = 0;
985 
986 	MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
987 
988 	file = fopen(path, "r");
989 	if (!file)
990 		return -errno;
991 
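	/*
	 * Scan the uevent file for a line of the form
	 * "PCI_SLOT_NAME=0000:00:02.0" (value shown for illustration only).
	 */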
992 	while (getline(&line, &len, file) != -1) {
993 		/* Extract information. */
994 		if (sscanf(line,
995 			   "PCI_SLOT_NAME="
996 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
997 			   &pci_addr->domain,
998 			   &pci_addr->bus,
999 			   &pci_addr->devid,
1000 			   &pci_addr->function) == 4) {
1001 			break;
1002 		}
1003 	}
1004 
1005 	free(line);
1006 	fclose(file);
1007 	return 0;
1008 }
1009 
1010 /*
1011  * Interrupt handler from IB layer to notify this device is being removed.
1012  */
1013 static void
1014 mana_intr_handler(void *arg)
1015 {
1016 	struct mana_priv *priv = arg;
1017 	struct ibv_context *ctx = priv->ib_ctx;
1018 	struct ibv_async_event event;
1019 
1020 	/* Read and ack all messages from IB device */
1021 	while (true) {
1022 		if (ibv_get_async_event(ctx, &event))
1023 			break;
1024 
1025 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
1026 			struct rte_eth_dev *dev;
1027 
1028 			dev = &rte_eth_devices[priv->port_id];
1029 			if (dev->data->dev_conf.intr_conf.rmv)
1030 				rte_eth_dev_callback_process(dev,
1031 					RTE_ETH_EVENT_INTR_RMV, NULL);
1032 		}
1033 
1034 		ibv_ack_async_event(&event);
1035 	}
1036 }
1037 
1038 static int
1039 mana_intr_uninstall(struct mana_priv *priv)
1040 {
1041 	int ret;
1042 
1043 	ret = rte_intr_callback_unregister(priv->intr_handle,
1044 					   mana_intr_handler, priv);
1045 	if (ret <= 0) {
1046 		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
1047 		return ret;
1048 	}
1049 
1050 	rte_intr_instance_free(priv->intr_handle);
1051 
1052 	return 0;
1053 }
1054 
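/*
 * Put an fd (e.g. the verbs async_fd) into non-blocking mode so that event
 * draining loops never block the calling thread.
 */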
1055 int
1056 mana_fd_set_non_blocking(int fd)
1057 {
1058 	int ret = fcntl(fd, F_GETFL);
1059 
1060 	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
1061 		return 0;
1062 
1063 	rte_errno = errno;
1064 	return -rte_errno;
1065 }
1066 
1067 static int
1068 mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
1069 {
1070 	int ret;
1071 	struct ibv_context *ctx = priv->ib_ctx;
1072 
1073 	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1074 	if (!priv->intr_handle) {
1075 		DRV_LOG(ERR, "Failed to allocate intr_handle");
1076 		rte_errno = ENOMEM;
1077 		return -ENOMEM;
1078 	}
1079 
1080 	ret = rte_intr_fd_set(priv->intr_handle, -1);
1081 	if (ret)
1082 		goto free_intr;
1083 
1084 	ret = mana_fd_set_non_blocking(ctx->async_fd);
1085 	if (ret) {
1086 		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
1087 		goto free_intr;
1088 	}
1089 
1090 	ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
1091 	if (ret)
1092 		goto free_intr;
1093 
1094 	ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
1095 	if (ret)
1096 		goto free_intr;
1097 
1098 	ret = rte_intr_callback_register(priv->intr_handle,
1099 					 mana_intr_handler, priv);
1100 	if (ret) {
1101 		DRV_LOG(ERR, "Failed to register intr callback");
1102 		rte_intr_fd_set(priv->intr_handle, -1);
1103 		goto free_intr;
1104 	}
1105 
1106 	eth_dev->intr_handle = priv->intr_handle;
1107 	return 0;
1108 
1109 free_intr:
1110 	rte_intr_instance_free(priv->intr_handle);
1111 	priv->intr_handle = NULL;
1112 
1113 	return ret;
1114 }
1115 
1116 static int
1117 mana_proc_priv_init(struct rte_eth_dev *dev)
1118 {
1119 	struct mana_process_priv *priv;
1120 
1121 	priv = rte_zmalloc_socket("mana_proc_priv",
1122 				  sizeof(struct mana_process_priv),
1123 				  RTE_CACHE_LINE_SIZE,
1124 				  dev->device->numa_node);
1125 	if (!priv)
1126 		return -ENOMEM;
1127 
1128 	dev->process_private = priv;
1129 	return 0;
1130 }
1131 
1132 /*
1133  * Map the doorbell page for the secondary process through IB device handle.
1134  */
1135 static int
1136 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
1137 {
1138 	struct mana_process_priv *priv = eth_dev->process_private;
1139 
1140 	void *addr;
1141 
1142 	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
1143 	if (addr == MAP_FAILED) {
1144 		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
1145 			eth_dev->data->port_id);
1146 		return -ENOMEM;
1147 	}
1148 
1149 	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
1150 
1151 	priv->db_page = addr;
1152 
1153 	return 0;
1154 }
1155 
1156 /* Initialize shared data for the driver (all devices) */
1157 static int
1158 mana_init_shared_data(void)
1159 {
1160 	int ret = 0;
1161 	const struct rte_memzone *secondary_mz;
1162 
1163 	rte_spinlock_lock(&mana_shared_data_lock);
1164 
1165 	/* Skip if shared data is already initialized */
1166 	if (mana_shared_data)
1167 		goto exit;
1168 
1169 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1170 		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
1171 						     sizeof(*mana_shared_data),
1172 						     SOCKET_ID_ANY, 0);
1173 		if (!mana_shared_mz) {
1174 			DRV_LOG(ERR, "Cannot allocate mana shared data");
1175 			ret = -rte_errno;
1176 			goto exit;
1177 		}
1178 
1179 		mana_shared_data = mana_shared_mz->addr;
1180 		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
1181 		rte_spinlock_init(&mana_shared_data->lock);
1182 	} else {
1183 		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
1184 		if (!secondary_mz) {
1185 			DRV_LOG(ERR, "Cannot attach mana shared data");
1186 			ret = -rte_errno;
1187 			goto exit;
1188 		}
1189 
1190 		mana_shared_data = secondary_mz->addr;
1191 		memset(&mana_local_data, 0, sizeof(mana_local_data));
1192 	}
1193 
1194 exit:
1195 	rte_spinlock_unlock(&mana_shared_data_lock);
1196 
1197 	return ret;
1198 }
1199 
1200 /*
1201  * Init the data structures for use in primary and secondary processes.
1202  */
1203 static int
1204 mana_init_once(void)
1205 {
1206 	int ret;
1207 
1208 	ret = mana_init_shared_data();
1209 	if (ret)
1210 		return ret;
1211 
1212 	rte_spinlock_lock(&mana_shared_data->lock);
1213 
1214 	switch (rte_eal_process_type()) {
1215 	case RTE_PROC_PRIMARY:
1216 		if (mana_shared_data->init_done)
1217 			break;
1218 
1219 		ret = mana_mp_init_primary();
1220 		if (ret)
1221 			break;
1222 		DRV_LOG(ERR, "MP INIT PRIMARY");
1223 
1224 		mana_shared_data->init_done = 1;
1225 		break;
1226 
1227 	case RTE_PROC_SECONDARY:
1228 
1229 		if (mana_local_data.init_done)
1230 			break;
1231 
1232 		ret = mana_mp_init_secondary();
1233 		if (ret)
1234 			break;
1235 
1236 		DRV_LOG(ERR, "MP INIT SECONDARY");
1237 
1238 		mana_local_data.init_done = 1;
1239 		break;
1240 
1241 	default:
1242 		/* Impossible, internal error */
1243 		ret = -EPROTO;
1244 		break;
1245 	}
1246 
1247 	rte_spinlock_unlock(&mana_shared_data->lock);
1248 
1249 	return ret;
1250 }
1251 
1252 /*
1253  * Probe an IB port
1254  * Return value:
1255  * positive value: successfully probed port
1256  * 0: port not matching specified MAC address
1257  * negative value: error code
1258  */
1259 static int
1260 mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
1261 		uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
1262 {
1263 	struct mana_priv *priv = NULL;
1264 	struct rte_eth_dev *eth_dev = NULL;
1265 	struct ibv_parent_domain_init_attr attr = {0};
1266 	char address[64];
1267 	char name[RTE_ETH_NAME_MAX_LEN];
1268 	int ret;
1269 	struct ibv_context *ctx = NULL;
1270 
1271 	rte_ether_format_addr(address, sizeof(address), addr);
1272 	DRV_LOG(INFO, "device located port %u address %s", port, address);
1273 
1274 	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
1275 				  SOCKET_ID_ANY);
1276 	if (!priv)
1277 		return -ENOMEM;
1278 
1279 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
1280 
1281 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1282 		int fd;
1283 
1284 		eth_dev = rte_eth_dev_attach_secondary(name);
1285 		if (!eth_dev) {
1286 			DRV_LOG(ERR, "Can't attach to dev %s", name);
1287 			ret = -ENOMEM;
1288 			goto failed;
1289 		}
1290 
1291 		eth_dev->device = &pci_dev->device;
1292 		eth_dev->dev_ops = &mana_dev_secondary_ops;
1293 		ret = mana_proc_priv_init(eth_dev);
1294 		if (ret)
1295 			goto failed;
1296 		priv->process_priv = eth_dev->process_private;
1297 
1298 		/* Get the IB FD from the primary process */
1299 		fd = mana_mp_req_verbs_cmd_fd(eth_dev);
1300 		if (fd < 0) {
1301 			DRV_LOG(ERR, "Failed to get FD %d", fd);
1302 			ret = -ENODEV;
1303 			goto failed;
1304 		}
1305 
1306 		ret = mana_map_doorbell_secondary(eth_dev, fd);
1307 		if (ret) {
1308 			DRV_LOG(ERR, "Failed secondary map %d", fd);
1309 			goto failed;
1310 		}
1311 
1312 		/* fd is not used after mapping the doorbell */
1313 		close(fd);
1314 
1315 		eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1316 		eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1317 
1318 		rte_spinlock_lock(&mana_shared_data->lock);
1319 		mana_shared_data->secondary_cnt++;
1320 		mana_local_data.secondary_cnt++;
1321 		rte_spinlock_unlock(&mana_shared_data->lock);
1322 
1323 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1324 		rte_eth_dev_probing_finish(eth_dev);
1325 
1326 		return 0;
1327 	}
1328 
1329 	ctx = ibv_open_device(ibdev);
1330 	if (!ctx) {
1331 		DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
1332 		ret = -ENODEV;
1333 		goto failed;
1334 	}
1335 
1336 	eth_dev = rte_eth_dev_allocate(name);
1337 	if (!eth_dev) {
1338 		ret = -ENOMEM;
1339 		goto failed;
1340 	}
1341 
1342 	eth_dev->data->mac_addrs =
1343 		rte_calloc("mana_mac", 1,
1344 			   sizeof(struct rte_ether_addr), 0);
1345 	if (!eth_dev->data->mac_addrs) {
1346 		ret = -ENOMEM;
1347 		goto failed;
1348 	}
1349 
1350 	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
1351 
1352 	priv->ib_pd = ibv_alloc_pd(ctx);
1353 	if (!priv->ib_pd) {
1354 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
1355 		ret = -ENOMEM;
1356 		goto failed;
1357 	}
1358 
1359 	/* Create a parent domain with the port number */
1360 	attr.pd = priv->ib_pd;
1361 	attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
1362 	attr.pd_context = (void *)(uintptr_t)port;
1363 	priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
1364 	if (!priv->ib_parent_pd) {
1365 		DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
1366 		ret = -ENOMEM;
1367 		goto failed;
1368 	}
1369 
1370 	priv->ib_ctx = ctx;
1371 	priv->port_id = eth_dev->data->port_id;
1372 	priv->dev_port = port;
1373 	eth_dev->data->dev_private = priv;
1374 	priv->dev_data = eth_dev->data;
1375 
1376 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
1377 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
1378 
1379 	priv->max_rx_desc =
1380 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1381 			dev_attr->orig_attr.max_cqe);
1382 	priv->max_tx_desc =
1383 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1384 			dev_attr->orig_attr.max_cqe);
1385 
1386 	priv->max_send_sge = dev_attr->orig_attr.max_sge;
1387 	priv->max_recv_sge = dev_attr->orig_attr.max_sge;
1388 
1389 	priv->max_mr = dev_attr->orig_attr.max_mr;
1390 	priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
1391 
1392 	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d mr %" PRIu64,
1393 		name, priv->max_rx_queues, priv->max_rx_desc,
1394 		priv->max_send_sge, priv->max_mr_size);
1395 
1396 	rte_eth_copy_pci_info(eth_dev, pci_dev);
1397 
1398 	/* Create async interrupt handler */
1399 	ret = mana_intr_install(eth_dev, priv);
1400 	if (ret) {
1401 		DRV_LOG(ERR, "Failed to install intr handler");
1402 		goto failed;
1403 	}
1404 
1405 	rte_spinlock_lock(&mana_shared_data->lock);
1406 	mana_shared_data->primary_cnt++;
1407 	rte_spinlock_unlock(&mana_shared_data->lock);
1408 
1409 	eth_dev->device = &pci_dev->device;
1410 
1411 	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
1412 
1413 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1414 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1415 	eth_dev->dev_ops = &mana_dev_ops;
1416 
1417 	rte_eth_dev_probing_finish(eth_dev);
1418 
1419 	return 0;
1420 
1421 failed:
1422 	/* Free the resource for the port failed */
1423 	if (priv) {
1424 		if (priv->ib_parent_pd)
1425 			ibv_dealloc_pd(priv->ib_parent_pd);
1426 
1427 		if (priv->ib_pd)
1428 			ibv_dealloc_pd(priv->ib_pd);
1429 	}
1430 
1431 	if (eth_dev)
1432 		rte_eth_dev_release_port(eth_dev);
1433 
1434 	rte_free(priv);
1435 
1436 	if (ctx)
1437 		ibv_close_device(ctx);
1438 
1439 	return ret;
1440 }
1441 
1442 /*
1443  * Goes through the IB device list to look for the IB port matching the
1444  * mac_addr. If found, create a rte_eth_dev for it.
1445  * Return value: number of successfully probed devices
1446  */
1447 static int
1448 mana_pci_probe_mac(struct rte_pci_device *pci_dev,
1449 		   struct rte_ether_addr *mac_addr)
1450 {
1451 	struct ibv_device **ibv_list;
1452 	int ibv_idx;
1453 	struct ibv_context *ctx;
1454 	int num_devices;
1455 	int ret;
1456 	uint8_t port;
1457 	int count = 0;
1458 
1459 	ibv_list = ibv_get_device_list(&num_devices);
1460 	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
1461 		struct ibv_device *ibdev = ibv_list[ibv_idx];
1462 		struct rte_pci_addr pci_addr;
1463 		struct ibv_device_attr_ex dev_attr;
1464 
1465 		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
1466 			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
1467 
1468 		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
1469 			continue;
1470 
1471 		/* Ignore if this IB device is not this PCI device */
1472 		if (rte_pci_addr_cmp(&pci_dev->addr, &pci_addr) != 0)
1473 			continue;
1474 
1475 		ctx = ibv_open_device(ibdev);
1476 		if (!ctx) {
1477 			DRV_LOG(ERR, "Failed to open IB device %s",
1478 				ibdev->name);
1479 			continue;
1480 		}
1481 		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
1482 		ibv_close_device(ctx);
1483 
1484 		if (ret) {
1485 			DRV_LOG(ERR, "Failed to query IB device %s",
1486 				ibdev->name);
1487 			continue;
1488 		}
1489 
1490 		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
1491 		     port++) {
1492 			struct rte_ether_addr addr;
1493 			ret = get_port_mac(ibdev, port, &addr);
1494 			if (ret)
1495 				continue;
1496 
1497 			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
1498 				continue;
1499 
1500 			ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
1501 			if (ret) {
1502 				DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
1503 			} else {
1504 				count++;
1505 				DRV_LOG(INFO, "Successfully probed on IB port %u", port);
1506 			}
1507 		}
1508 	}
1509 
1510 	ibv_free_device_list(ibv_list);
1511 	return count;
1512 }
1513 
1514 /*
1515  * Main callback function from PCI bus to probe a device.
1516  */
1517 static int
1518 mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1519 	       struct rte_pci_device *pci_dev)
1520 {
1521 	struct rte_devargs *args = pci_dev->device.devargs;
1522 	struct mana_conf conf = {0};
1523 	unsigned int i;
1524 	int ret;
1525 	int count = 0;
1526 
1527 	if (args && args->drv_str) {
1528 		ret = mana_parse_args(args, &conf);
1529 		if (ret) {
1530 			DRV_LOG(ERR, "Failed to parse parameters args = %s",
1531 				args->drv_str);
1532 			return ret;
1533 		}
1534 	}
1535 
1536 	ret = mana_init_once();
1537 	if (ret) {
1538 		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
1539 		return ret;
1540 	}
1541 
1542 	/* Probe only the ports matching the given MAC addresses; if none were given, probe all ports */
1543 	if (conf.index) {
1544 		for (i = 0; i < conf.index; i++)
1545 			count += mana_pci_probe_mac(pci_dev,
1546 						    &conf.mac_array[i]);
1547 	} else {
1548 		count = mana_pci_probe_mac(pci_dev, NULL);
1549 	}
1550 
1551 	if (!count) {
1552 		rte_memzone_free(mana_shared_mz);
1553 		mana_shared_mz = NULL;
1554 		ret = -ENODEV;
1555 	}
1556 
1557 	return ret;
1558 }
1559 
1560 static int
1561 mana_dev_uninit(struct rte_eth_dev *dev)
1562 {
1563 	return mana_dev_close(dev);
1564 }
1565 
1566 /*
1567  * Callback from PCI to remove this device.
1568  */
1569 static int
1570 mana_pci_remove(struct rte_pci_device *pci_dev)
1571 {
1572 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1573 		rte_spinlock_lock(&mana_shared_data_lock);
1574 
1575 		rte_spinlock_lock(&mana_shared_data->lock);
1576 
1577 		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
1578 		mana_shared_data->primary_cnt--;
1579 		if (!mana_shared_data->primary_cnt) {
1580 			DRV_LOG(DEBUG, "mp uninit primary");
1581 			mana_mp_uninit_primary();
1582 		}
1583 
1584 		rte_spinlock_unlock(&mana_shared_data->lock);
1585 
1586 		/* Also free the shared memory if this is the last primary process */
1587 		if (!mana_shared_data->primary_cnt) {
1588 			DRV_LOG(DEBUG, "free shared memzone data");
1589 			rte_memzone_free(mana_shared_mz);
1590 			mana_shared_mz = NULL;
1591 		}
1592 
1593 		rte_spinlock_unlock(&mana_shared_data_lock);
1594 	} else {
1595 		rte_spinlock_lock(&mana_shared_data_lock);
1596 
1597 		rte_spinlock_lock(&mana_shared_data->lock);
1598 		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
1599 		mana_shared_data->secondary_cnt--;
1600 		rte_spinlock_unlock(&mana_shared_data->lock);
1601 
1602 		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
1603 		mana_local_data.secondary_cnt--;
1604 		if (!mana_local_data.secondary_cnt) {
1605 			DRV_LOG(DEBUG, "mp uninit secondary");
1606 			mana_mp_uninit_secondary();
1607 		}
1608 
1609 		rte_spinlock_unlock(&mana_shared_data_lock);
1610 	}
1611 
1612 	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
1613 }
1614 
1615 static const struct rte_pci_id mana_pci_id_map[] = {
1616 	{
1617 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
1618 			       PCI_DEVICE_ID_MICROSOFT_MANA)
1619 	},
1620 	{
1621 		.vendor_id = 0
1622 	},
1623 };
1624 
1625 static struct rte_pci_driver mana_pci_driver = {
1626 	.id_table = mana_pci_id_map,
1627 	.probe = mana_pci_probe,
1628 	.remove = mana_pci_remove,
1629 	.drv_flags = RTE_PCI_DRV_INTR_RMV,
1630 };
1631 
1632 RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
1633 RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
1634 RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
1635 RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
1636 RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
1637 RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
1638