xref: /dpdk/drivers/net/mana/mana.c (revision c6552d9a8deffa448de2d5e2e726f50508c1efd2)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2022 Microsoft Corporation
3  */
4 
5 #include <unistd.h>
6 #include <dirent.h>
7 #include <fcntl.h>
8 #include <sys/mman.h>
9 #include <sys/ioctl.h>
10 #include <net/if.h>
11 
12 #include <ethdev_driver.h>
13 #include <ethdev_pci.h>
14 #include <rte_kvargs.h>
15 #include <rte_eal_paging.h>
16 #include <rte_pci.h>
17 
18 #include <infiniband/verbs.h>
19 #include <infiniband/manadv.h>
20 
21 #include <assert.h>
22 
23 #include "mana.h"
24 
25 /* Shared memory between primary/secondary processes, per driver */
26 /* Data to track primary/secondary usage */
27 struct mana_shared_data *mana_shared_data;
28 static struct mana_shared_data mana_local_data;
29 
30 /* The memory region for the above data */
31 static const struct rte_memzone *mana_shared_mz;
32 static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
33 
34 /* Spinlock for mana_shared_data */
35 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
36 
37 /* Allocate a buffer on the stack and fill it with the printf-formatted string. */
38 #define MANA_MKSTR(name, ...) \
39 	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
40 	char name[mkstr_size_##name + 1]; \
41 	\
42 	memset(name, 0, mkstr_size_##name + 1); \
43 	snprintf(name, sizeof(name), "" __VA_ARGS__)
44 
45 int mana_logtype_driver;
46 int mana_logtype_init;
47 
48 /*
49  * Callback from rdma-core to allocate a buffer for a queue.
50  */
51 void *
52 mana_alloc_verbs_buf(size_t size, void *data)
53 {
54 	void *ret;
55 	size_t alignment = rte_mem_page_size();
56 	int socket = (int)(uintptr_t)data;
57 
58 	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
59 
60 	if (alignment == (size_t)-1) {
61 		DRV_LOG(ERR, "Failed to get mem page size");
62 		rte_errno = ENOMEM;
63 		return NULL;
64 	}
65 
66 	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
67 	if (!ret && size)
68 		rte_errno = ENOMEM;
69 	return ret;
70 }
71 
72 void
73 mana_free_verbs_buf(void *ptr, void *data __rte_unused)
74 {
75 	rte_free(ptr);
76 }
77 
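/*
 * Check the queue configuration (equal Rx/Tx counts, power of 2) and
 * register the rdma-core buffer allocators so queue memory comes from DPDK.
 */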
78 static int
79 mana_dev_configure(struct rte_eth_dev *dev)
80 {
81 	struct mana_priv *priv = dev->data->dev_private;
82 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
83 
84 	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
85 		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
86 
87 	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
88 		DRV_LOG(ERR, "Only an equal number of Rx/Tx queues is supported");
89 		return -EINVAL;
90 	}
91 
92 	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
93 		DRV_LOG(ERR, "Number of Rx/Tx queues must be a power of 2");
94 		return -EINVAL;
95 	}
96 
97 	priv->num_queues = dev->data->nb_rx_queues;
98 
99 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
100 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
101 					.alloc = &mana_alloc_verbs_buf,
102 					.free = &mana_free_verbs_buf,
103 					.data = 0,
104 				}));
105 
106 	return 0;
107 }
108 
109 static void
110 rx_intr_vec_disable(struct mana_priv *priv)
111 {
112 	struct rte_intr_handle *intr_handle = priv->intr_handle;
113 
114 	rte_intr_free_epoll_fd(intr_handle);
115 	rte_intr_vec_list_free(intr_handle);
116 	rte_intr_nb_efd_set(intr_handle, 0);
117 }
118 
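/*
 * Allocate the interrupt vector list and map each Rx queue's completion
 * channel fd to an event fd entry.
 */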
119 static int
120 rx_intr_vec_enable(struct mana_priv *priv)
121 {
122 	unsigned int i;
123 	unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
124 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
125 	struct rte_intr_handle *intr_handle = priv->intr_handle;
126 	int ret;
127 
128 	rx_intr_vec_disable(priv);
129 
130 	if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
131 		DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
132 		return -ENOMEM;
133 	}
134 
135 	for (i = 0; i < n; i++) {
136 		struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
137 
138 		ret = rte_intr_vec_list_index_set(intr_handle, i,
139 						  RTE_INTR_VEC_RXTX_OFFSET + i);
140 		if (ret) {
141 			DRV_LOG(ERR, "Failed to set intr vec %u", i);
142 			return ret;
143 		}
144 
145 		ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
146 		if (ret) {
147 			DRV_LOG(ERR, "Failed to set FD at intr %u", i);
148 			return ret;
149 		}
150 	}
151 
152 	return rte_intr_nb_efd_set(intr_handle, n);
153 }
154 
155 static void
156 rxq_intr_disable(struct mana_priv *priv)
157 {
158 	int err = rte_errno;
159 
160 	rx_intr_vec_disable(priv);
161 	rte_errno = err;
162 }
163 
164 static int
165 rxq_intr_enable(struct mana_priv *priv)
166 {
167 	const struct rte_eth_intr_conf *const intr_conf =
168 		&priv->dev_data->dev_conf.intr_conf;
169 
170 	if (!intr_conf->rxq)
171 		return 0;
172 
173 	return rx_intr_vec_enable(priv);
174 }
175 
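/*
 * Start the device: set up the MR btree, start Tx/Rx queues, install the
 * real burst functions and enable Rx interrupts if configured.
 */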
176 static int
177 mana_dev_start(struct rte_eth_dev *dev)
178 {
179 	int ret;
180 	struct mana_priv *priv = dev->data->dev_private;
181 
182 	rte_spinlock_init(&priv->mr_btree_lock);
183 	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
184 				 dev->device->numa_node);
185 	if (ret) {
186 		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
187 		return ret;
188 	}
189 
190 	ret = mana_start_tx_queues(dev);
191 	if (ret) {
192 		DRV_LOG(ERR, "failed to start tx queues %d", ret);
193 		goto failed_tx;
194 	}
195 
196 	ret = mana_start_rx_queues(dev);
197 	if (ret) {
198 		DRV_LOG(ERR, "failed to start rx queues %d", ret);
199 		goto failed_rx;
200 	}
201 
202 	rte_wmb();
203 
204 	dev->tx_pkt_burst = mana_tx_burst;
205 	dev->rx_pkt_burst = mana_rx_burst;
206 
207 	DRV_LOG(INFO, "TX/RX queues have started");
208 
209 	/* Enable datapath for secondary processes */
210 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
211 
212 	ret = rxq_intr_enable(priv);
213 	if (ret) {
214 		DRV_LOG(ERR, "Failed to enable RX interrupts");
215 		goto failed_intr;
216 	}
217 
218 	return 0;
219 
220 failed_intr:
221 	mana_stop_rx_queues(dev);
222 
223 failed_rx:
224 	mana_stop_tx_queues(dev);
225 
226 failed_tx:
227 	mana_mr_btree_free(&priv->mr_btree);
228 
229 	return ret;
230 }
231 
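/*
 * Stop the device: switch to the "removed" burst functions, notify
 * secondary processes, then stop Tx/Rx queues.
 */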
232 static int
233 mana_dev_stop(struct rte_eth_dev *dev)
234 {
235 	int ret;
236 	struct mana_priv *priv = dev->data->dev_private;
237 
238 	rxq_intr_disable(priv);
239 
240 	dev->tx_pkt_burst = mana_tx_burst_removed;
241 	dev->rx_pkt_burst = mana_rx_burst_removed;
242 
243 	/* Stop datapath on secondary processes */
244 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
245 
246 	rte_wmb();
247 
248 	ret = mana_stop_tx_queues(dev);
249 	if (ret) {
250 		DRV_LOG(ERR, "failed to stop tx queues");
251 		return ret;
252 	}
253 
254 	ret = mana_stop_rx_queues(dev);
255 	if (ret) {
256 		DRV_LOG(ERR, "failed to stop rx queues");
257 		return ret;
258 	}
259 
260 	return 0;
261 }
262 
263 static int mana_intr_uninstall(struct mana_priv *priv);
264 
265 static int
266 mana_dev_close(struct rte_eth_dev *dev)
267 {
268 	struct mana_priv *priv = dev->data->dev_private;
269 	int ret;
270 
271 	mana_remove_all_mr(priv);
272 
273 	ret = mana_intr_uninstall(priv);
274 	if (ret)
275 		return ret;
276 
277 	ret = ibv_close_device(priv->ib_ctx);
278 	if (ret) {
279 		ret = -errno;
280 		return ret;
281 	}
282 
283 	return 0;
284 }
285 
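/* Report device capabilities, limits and default queue parameters */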
286 static int
287 mana_dev_info_get(struct rte_eth_dev *dev,
288 		  struct rte_eth_dev_info *dev_info)
289 {
290 	struct mana_priv *priv = dev->data->dev_private;
291 
292 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
293 	dev_info->max_mtu = MANA_MAX_MTU;
294 
295 	/* RX params */
296 	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
297 	dev_info->max_rx_pktlen = MANA_MAX_MTU + RTE_ETHER_HDR_LEN;
298 
299 	dev_info->max_rx_queues = RTE_MIN(priv->max_rx_queues, UINT16_MAX);
300 	dev_info->max_tx_queues = RTE_MIN(priv->max_tx_queues, UINT16_MAX);
301 
302 	dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
303 	dev_info->max_hash_mac_addrs = 0;
304 
305 	dev_info->max_vfs = 1;
306 
307 	/* Offload params */
308 	dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
309 
310 	dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
311 
312 	/* RSS */
313 	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
314 	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
315 	dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
316 
317 	/* Thresholds */
318 	dev_info->default_rxconf = (struct rte_eth_rxconf){
319 		.rx_thresh = {
320 			.pthresh = 8,
321 			.hthresh = 8,
322 			.wthresh = 0,
323 		},
324 		.rx_free_thresh = 32,
325 		/* If no descriptors available, pkts are dropped by default */
326 		.rx_drop_en = 1,
327 	};
328 
329 	dev_info->default_txconf = (struct rte_eth_txconf){
330 		.tx_thresh = {
331 			.pthresh = 32,
332 			.hthresh = 0,
333 			.wthresh = 0,
334 		},
335 		.tx_rs_thresh = 32,
336 		.tx_free_thresh = 32,
337 	};
338 
339 	/* Buffer limits */
340 	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
341 	dev_info->rx_desc_lim.nb_max = RTE_MIN(priv->max_rx_desc, UINT16_MAX);
342 	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
343 	dev_info->rx_desc_lim.nb_seg_max =
344 		RTE_MIN(priv->max_recv_sge, UINT16_MAX);
345 	dev_info->rx_desc_lim.nb_mtu_seg_max =
346 		RTE_MIN(priv->max_recv_sge, UINT16_MAX);
347 
348 	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
349 	dev_info->tx_desc_lim.nb_max = RTE_MIN(priv->max_tx_desc, UINT16_MAX);
350 	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
351 	dev_info->tx_desc_lim.nb_seg_max =
352 		RTE_MIN(priv->max_send_sge, UINT16_MAX);
353 	dev_info->tx_desc_lim.nb_mtu_seg_max =
354 		RTE_MIN(priv->max_send_sge, UINT16_MAX);
355 
356 	/* Speed */
357 	dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
358 
359 	/* RX params */
360 	dev_info->default_rxportconf.burst_size = 1;
361 	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
362 	dev_info->default_rxportconf.nb_queues = 1;
363 
364 	/* TX params */
365 	dev_info->default_txportconf.burst_size = 1;
366 	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
367 	dev_info->default_txportconf.nb_queues = 1;
368 
369 	return 0;
370 }
371 
372 static void
373 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
374 		       struct rte_eth_txq_info *qinfo)
375 {
376 	struct mana_txq *txq = dev->data->tx_queues[queue_id];
377 
378 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
379 	qinfo->nb_desc = txq->num_desc;
380 }
381 
382 static void
383 mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
384 		       struct rte_eth_rxq_info *qinfo)
385 {
386 	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
387 
388 	qinfo->mp = rxq->mp;
389 	qinfo->nb_desc = rxq->num_desc;
390 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
391 }
392 
393 static const uint32_t *
394 mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused,
395 		      size_t *no_of_elements)
396 {
397 	static const uint32_t ptypes[] = {
398 		RTE_PTYPE_L2_ETHER,
399 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
400 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
401 		RTE_PTYPE_L4_FRAG,
402 		RTE_PTYPE_L4_TCP,
403 		RTE_PTYPE_L4_UDP,
404 	};
405 
406 	*no_of_elements = RTE_DIM(ptypes);
407 	return ptypes;
408 }
409 
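/* Update the RSS hash types and, optionally, the Toeplitz hash key */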
410 static int
411 mana_rss_hash_update(struct rte_eth_dev *dev,
412 		     struct rte_eth_rss_conf *rss_conf)
413 {
414 	struct mana_priv *priv = dev->data->dev_private;
415 
416 	/* Currently can only update RSS hash when device is stopped */
417 	if (dev->data->dev_started) {
418 		DRV_LOG(ERR, "Can't update RSS after device has started");
419 		return -ENODEV;
420 	}
421 
422 	if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
423 		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
424 			dev->data->port_id, rss_conf->rss_hf);
425 		return -EINVAL;
426 	}
427 
428 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
429 		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
430 			DRV_LOG(ERR, "Port %u key len must be %u long",
431 				dev->data->port_id,
432 				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
433 			return -EINVAL;
434 		}
435 
		/* Free any key installed by a previous call to avoid leaking it */
		rte_free(priv->rss_conf.rss_key);
436 		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
437 		priv->rss_conf.rss_key =
438 			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
439 				    RTE_CACHE_LINE_SIZE);
440 		if (!priv->rss_conf.rss_key)
441 			return -ENOMEM;
442 		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
443 		       rss_conf->rss_key_len);
444 	}
445 	priv->rss_conf.rss_hf = rss_conf->rss_hf;
446 
447 	return 0;
448 }
449 
450 static int
451 mana_rss_hash_conf_get(struct rte_eth_dev *dev,
452 		       struct rte_eth_rss_conf *rss_conf)
453 {
454 	struct mana_priv *priv = dev->data->dev_private;
455 
456 	if (!rss_conf)
457 		return -EINVAL;
458 
459 	if (rss_conf->rss_key &&
460 	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
461 		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
462 		       priv->rss_conf.rss_key_len);
463 	}
464 
465 	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
466 	rss_conf->rss_hf = priv->rss_conf.rss_hf;
467 
468 	return 0;
469 }
470 
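/*
 * Allocate a Tx queue with its shadow descriptor ring, GDMA completion
 * buffer and per-queue MR btree on the requested socket.
 */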
471 static int
472 mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
473 			uint16_t nb_desc, unsigned int socket_id,
474 			const struct rte_eth_txconf *tx_conf __rte_unused)
475 
476 {
477 	struct mana_priv *priv = dev->data->dev_private;
478 	struct mana_txq *txq;
479 	int ret;
480 
481 	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
482 	if (!txq) {
483 		DRV_LOG(ERR, "failed to allocate txq");
484 		return -ENOMEM;
485 	}
486 
487 	txq->socket = socket_id;
488 
489 	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
490 					   sizeof(struct mana_txq_desc) *
491 						nb_desc,
492 					   RTE_CACHE_LINE_SIZE, socket_id);
493 	if (!txq->desc_ring) {
494 		DRV_LOG(ERR, "failed to allocate txq desc_ring");
495 		ret = -ENOMEM;
496 		goto fail;
497 	}
498 
499 	txq->gdma_comp_buf = rte_malloc_socket("mana_txq_comp",
500 			sizeof(*txq->gdma_comp_buf) * nb_desc,
501 			RTE_CACHE_LINE_SIZE, socket_id);
502 	if (!txq->gdma_comp_buf) {
503 		DRV_LOG(ERR, "failed to allocate txq comp");
504 		ret = -ENOMEM;
505 		goto fail;
506 	}
507 
508 	ret = mana_mr_btree_init(&txq->mr_btree,
509 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
510 	if (ret) {
511 		DRV_LOG(ERR, "Failed to init TXQ MR btree");
512 		goto fail;
513 	}
514 
515 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
516 		queue_idx, nb_desc, socket_id, txq->desc_ring);
517 
518 	txq->desc_ring_head = 0;
519 	txq->desc_ring_tail = 0;
520 	txq->priv = priv;
521 	txq->num_desc = nb_desc;
522 	dev->data->tx_queues[queue_idx] = txq;
523 
524 	return 0;
525 
526 fail:
527 	rte_free(txq->gdma_comp_buf);
528 	rte_free(txq->desc_ring);
529 	rte_free(txq);
530 	return ret;
531 }
532 
533 static void
534 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
535 {
536 	struct mana_txq *txq = dev->data->tx_queues[qid];
537 
538 	mana_mr_btree_free(&txq->mr_btree);
539 
540 	rte_free(txq->gdma_comp_buf);
541 	rte_free(txq->desc_ring);
542 	rte_free(txq);
543 }
544 
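/*
 * Allocate an Rx queue with its shadow descriptor ring, GDMA completion
 * buffer and per-queue MR btree on the requested socket.
 */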
545 static int
546 mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
547 			uint16_t nb_desc, unsigned int socket_id,
548 			const struct rte_eth_rxconf *rx_conf __rte_unused,
549 			struct rte_mempool *mp)
550 {
551 	struct mana_priv *priv = dev->data->dev_private;
552 	struct mana_rxq *rxq;
553 	int ret;
554 
555 	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
556 	if (!rxq) {
557 		DRV_LOG(ERR, "failed to allocate rxq");
558 		return -ENOMEM;
559 	}
560 
561 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
562 		queue_idx, nb_desc, socket_id);
563 
564 	rxq->socket = socket_id;
565 
566 	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
567 					    sizeof(struct mana_rxq_desc) *
568 						nb_desc,
569 					    RTE_CACHE_LINE_SIZE, socket_id);
570 
571 	if (!rxq->desc_ring) {
572 		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
573 		ret = -ENOMEM;
574 		goto fail;
575 	}
576 
577 	rxq->desc_ring_head = 0;
578 	rxq->desc_ring_tail = 0;
579 
580 	rxq->gdma_comp_buf = rte_malloc_socket("mana_rxq_comp",
581 			sizeof(*rxq->gdma_comp_buf) * nb_desc,
582 			RTE_CACHE_LINE_SIZE, socket_id);
583 	if (!rxq->gdma_comp_buf) {
584 		DRV_LOG(ERR, "failed to allocate rxq comp");
585 		ret = -ENOMEM;
586 		goto fail;
587 	}
588 
589 	ret = mana_mr_btree_init(&rxq->mr_btree,
590 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
591 	if (ret) {
592 		DRV_LOG(ERR, "Failed to init RXQ MR btree");
593 		goto fail;
594 	}
595 
596 	rxq->priv = priv;
597 	rxq->num_desc = nb_desc;
598 	rxq->mp = mp;
599 	dev->data->rx_queues[queue_idx] = rxq;
600 
601 	return 0;
602 
603 fail:
604 	rte_free(rxq->gdma_comp_buf);
605 	rte_free(rxq->desc_ring);
606 	rte_free(rxq);
607 	return ret;
608 }
609 
610 static void
611 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
612 {
613 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
614 
615 	mana_mr_btree_free(&rxq->mr_btree);
616 
617 	rte_free(rxq->gdma_comp_buf);
618 	rte_free(rxq->desc_ring);
619 	rte_free(rxq);
620 }
621 
622 static int
623 mana_dev_link_update(struct rte_eth_dev *dev,
624 		     int wait_to_complete __rte_unused)
625 {
626 	struct rte_eth_link link;
627 
628 	/* MANA has no concept of carrier state, always reporting UP */
629 	link = (struct rte_eth_link) {
630 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
631 		.link_autoneg = RTE_ETH_LINK_FIXED,
632 		.link_speed = RTE_ETH_SPEED_NUM_100G,
633 		.link_status = RTE_ETH_LINK_UP,
634 	};
635 
636 	return rte_eth_linkstatus_set(dev, &link);
637 }
638 
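/* Aggregate per-queue software counters into the ethdev stats */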
639 static int
640 mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
641 {
642 	unsigned int i;
643 
644 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
645 		struct mana_txq *txq = dev->data->tx_queues[i];
646 
647 		if (!txq)
648 			continue;
649 
650 		stats->opackets += txq->stats.packets;
651 		stats->obytes += txq->stats.bytes;
652 		stats->oerrors += txq->stats.errors;
653 
654 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
655 			stats->q_opackets[i] = txq->stats.packets;
656 			stats->q_obytes[i] = txq->stats.bytes;
657 		}
658 	}
659 
660 	stats->rx_nombuf = 0;
661 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
662 		struct mana_rxq *rxq = dev->data->rx_queues[i];
663 
664 		if (!rxq)
665 			continue;
666 
667 		stats->ipackets += rxq->stats.packets;
668 		stats->ibytes += rxq->stats.bytes;
669 		stats->ierrors += rxq->stats.errors;
670 
671 		/* There is no good way to get stats->imissed, so it is left unset */
672 
673 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
674 			stats->q_ipackets[i] = rxq->stats.packets;
675 			stats->q_ibytes[i] = rxq->stats.bytes;
676 		}
677 
678 		stats->rx_nombuf += rxq->stats.nombuf;
679 	}
680 
681 	return 0;
682 }
683 
684 static int
685 mana_dev_stats_reset(struct rte_eth_dev *dev)
686 {
687 	unsigned int i;
688 
689 	PMD_INIT_FUNC_TRACE();
690 
691 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
692 		struct mana_txq *txq = dev->data->tx_queues[i];
693 
694 		if (!txq)
695 			continue;
696 
697 		memset(&txq->stats, 0, sizeof(txq->stats));
698 	}
699 
700 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
701 		struct mana_rxq *rxq = dev->data->rx_queues[i];
702 
703 		if (!rxq)
704 			continue;
705 
706 		memset(&rxq->stats, 0, sizeof(rxq->stats));
707 	}
708 
709 	return 0;
710 }
711 
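/*
 * Find the kernel netdev name for this port by scanning the IB device's
 * sysfs net directory for an interface with a matching MAC address.
 */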
712 static int
713 mana_get_ifname(const struct mana_priv *priv, char (*ifname)[IF_NAMESIZE])
714 {
715 	int ret;
716 	DIR *dir;
717 	struct dirent *dent;
718 
719 	MANA_MKSTR(dirpath, "%s/device/net", priv->ib_ctx->device->ibdev_path);
720 
721 	dir = opendir(dirpath);
722 	if (dir == NULL)
723 		return -ENODEV;
724 
725 	while ((dent = readdir(dir)) != NULL) {
726 		char *name = dent->d_name;
727 		FILE *file;
728 		struct rte_ether_addr addr;
729 		char *mac = NULL;
730 
731 		if ((name[0] == '.') &&
732 		    ((name[1] == '\0') ||
733 		     ((name[1] == '.') && (name[2] == '\0'))))
734 			continue;
735 
736 		MANA_MKSTR(path, "%s/%s/address", dirpath, name);
737 
738 		file = fopen(path, "r");
739 		if (!file) {
740 			ret = -ENODEV;
741 			break;
742 		}
743 
744 		ret = fscanf(file, "%ms", &mac);
745 		fclose(file);
746 
747 		if (ret <= 0) {
748 			ret = -EINVAL;
749 			break;
750 		}
751 
752 		ret = rte_ether_unformat_addr(mac, &addr);
753 		free(mac);
754 		if (ret)
755 			break;
756 
757 		if (rte_is_same_ether_addr(&addr, priv->dev_data->mac_addrs)) {
758 			strlcpy(*ifname, name, sizeof(*ifname));
759 			ret = 0;
760 			break;
761 		}
762 	}
763 
764 	closedir(dir);
765 	return ret;
766 }
767 
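/* Run a network interface ioctl on the kernel netdev backing this port */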
768 static int
769 mana_ifreq(const struct mana_priv *priv, int req, struct ifreq *ifr)
770 {
771 	int sock, ret;
772 
773 	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
774 	if (sock == -1)
775 		return -errno;
776 
777 	ret = mana_get_ifname(priv, &ifr->ifr_name);
778 	if (ret) {
779 		close(sock);
780 		return ret;
781 	}
782 
783 	if (ioctl(sock, req, ifr) == -1)
784 		ret = -errno;
785 
786 	close(sock);
787 
788 	return ret;
789 }
790 
791 static int
792 mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
793 {
794 	struct mana_priv *priv = dev->data->dev_private;
795 	struct ifreq request = { .ifr_mtu = mtu, };
796 
797 	return mana_ifreq(priv, SIOCSIFMTU, &request);
798 }
799 
800 static const struct eth_dev_ops mana_dev_ops = {
801 	.dev_configure		= mana_dev_configure,
802 	.dev_start		= mana_dev_start,
803 	.dev_stop		= mana_dev_stop,
804 	.dev_close		= mana_dev_close,
805 	.dev_infos_get		= mana_dev_info_get,
806 	.txq_info_get		= mana_dev_tx_queue_info,
807 	.rxq_info_get		= mana_dev_rx_queue_info,
808 	.dev_supported_ptypes_get = mana_supported_ptypes,
809 	.rss_hash_update	= mana_rss_hash_update,
810 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
811 	.tx_queue_setup		= mana_dev_tx_queue_setup,
812 	.tx_queue_release	= mana_dev_tx_queue_release,
813 	.rx_queue_setup		= mana_dev_rx_queue_setup,
814 	.rx_queue_release	= mana_dev_rx_queue_release,
815 	.rx_queue_intr_enable	= mana_rx_intr_enable,
816 	.rx_queue_intr_disable	= mana_rx_intr_disable,
817 	.link_update		= mana_dev_link_update,
818 	.stats_get		= mana_dev_stats_get,
819 	.stats_reset		= mana_dev_stats_reset,
820 	.mtu_set		= mana_mtu_set,
821 };
822 
823 static const struct eth_dev_ops mana_dev_secondary_ops = {
824 	.stats_get = mana_dev_stats_get,
825 	.stats_reset = mana_dev_stats_reset,
826 	.dev_infos_get = mana_dev_info_get,
827 };
828 
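/*
 * Placeholder burst functions installed while the datapath is stopped;
 * they do no work and return zero packets.
 */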
829 uint16_t
830 mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
831 		      struct rte_mbuf **pkts __rte_unused,
832 		      uint16_t pkts_n __rte_unused)
833 {
834 	rte_mb();
835 	return 0;
836 }
837 
838 uint16_t
839 mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
840 		      struct rte_mbuf **pkts __rte_unused,
841 		      uint16_t pkts_n __rte_unused)
842 {
843 	rte_mb();
844 	return 0;
845 }
846 
847 #define ETH_MANA_MAC_ARG "mac"
848 static const char * const mana_init_args[] = {
849 	ETH_MANA_MAC_ARG,
850 	NULL,
851 };
852 
853 /* Support parsing up to 8 MAC addresses from the EAL command line */
854 #define MAX_NUM_ADDRESS 8
855 struct mana_conf {
856 	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
857 	unsigned int index;
858 };
859 
860 static int
861 mana_arg_parse_callback(const char *key, const char *val, void *private)
862 {
863 	struct mana_conf *conf = (struct mana_conf *)private;
864 	int ret;
865 
866 	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
867 
868 	if (conf->index >= MAX_NUM_ADDRESS) {
869 		DRV_LOG(ERR, "Exceeded the maximum number of MAC addresses");
870 		return 1;
871 	}
872 
873 	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
874 	if (ret) {
875 		DRV_LOG(ERR, "Invalid MAC address %s", val);
876 		return ret;
877 	}
878 
879 	conf->index++;
880 
881 	return 0;
882 }
883 
884 static int
885 mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
886 {
887 	struct rte_kvargs *kvlist;
888 	unsigned int arg_count;
889 	int ret = 0;
890 
891 	kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
892 	if (!kvlist) {
893 		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
894 		return -EINVAL;
895 	}
896 
897 	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
898 	if (arg_count > MAX_NUM_ADDRESS) {
899 		ret = -EINVAL;
900 		goto free_kvlist;
901 	}
902 	ret = rte_kvargs_process(kvlist, mana_init_args[0],
903 				 mana_arg_parse_callback, conf);
904 	if (ret) {
905 		DRV_LOG(ERR, "error parsing args");
906 		goto free_kvlist;
907 	}
908 
909 free_kvlist:
910 	rte_kvargs_free(kvlist);
911 	return ret;
912 }
913 
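/*
 * Read the MAC address of the netdev backing the given IB port by matching
 * dev_port entries under the device's sysfs net directory.
 */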
914 static int
915 get_port_mac(struct ibv_device *device, unsigned int port,
916 	     struct rte_ether_addr *addr)
917 {
918 	FILE *file;
919 	int ret = 0;
920 	DIR *dir;
921 	struct dirent *dent;
922 	unsigned int dev_port;
923 
924 	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
925 
926 	dir = opendir(path);
927 	if (!dir)
928 		return -ENOENT;
929 
930 	while ((dent = readdir(dir))) {
931 		char *name = dent->d_name;
932 		char *mac = NULL;
933 
934 		MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
935 
936 		/* Ignore . and .. */
937 		if ((name[0] == '.') &&
938 		    ((name[1] == '\0') ||
939 		     ((name[1] == '.') && (name[2] == '\0'))))
940 			continue;
941 
942 		file = fopen(port_path, "r");
943 		if (!file)
944 			continue;
945 
946 		ret = fscanf(file, "%u", &dev_port);
947 		fclose(file);
948 
949 		if (ret != 1)
950 			continue;
951 
952 		/* Ethernet ports start at 0, IB ports start at 1 */
953 		if (dev_port == port - 1) {
954 			MANA_MKSTR(address_path, "%s/%s/address", path, name);
955 
956 			file = fopen(address_path, "r");
957 			if (!file)
958 				continue;
959 
960 			ret = fscanf(file, "%ms", &mac);
961 			fclose(file);
962 
963 			if (ret < 0)
964 				break;
965 
966 			ret = rte_ether_unformat_addr(mac, addr);
967 			if (ret)
968 				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
969 
970 			free(mac);
971 			break;
972 		}
973 	}
974 
975 	closedir(dir);
976 	return ret;
977 }
978 
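/* Parse PCI_SLOT_NAME from the device's uevent file to get its PCI address */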
979 static int
980 mana_ibv_device_to_pci_addr(const struct ibv_device *device,
981 			    struct rte_pci_addr *pci_addr)
982 {
983 	FILE *file;
984 	char *line = NULL;
985 	size_t len = 0;
986 
987 	MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
988 
989 	file = fopen(path, "r");
990 	if (!file)
991 		return -errno;
992 
993 	while (getline(&line, &len, file) != -1) {
994 		/* Extract information. */
995 		if (sscanf(line,
996 			   "PCI_SLOT_NAME="
997 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
998 			   &pci_addr->domain,
999 			   &pci_addr->bus,
1000 			   &pci_addr->devid,
1001 			   &pci_addr->function) == 4) {
1002 			break;
1003 		}
1004 	}
1005 
1006 	free(line);
1007 	fclose(file);
1008 	return 0;
1009 }
1010 
1011 /*
1012  * Interrupt handler from the IB layer to notify that this device is being removed.
1013  */
1014 static void
1015 mana_intr_handler(void *arg)
1016 {
1017 	struct mana_priv *priv = arg;
1018 	struct ibv_context *ctx = priv->ib_ctx;
1019 	struct ibv_async_event event;
1020 
1021 	/* Read and ack all messages from IB device */
1022 	while (true) {
1023 		if (ibv_get_async_event(ctx, &event))
1024 			break;
1025 
1026 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
1027 			struct rte_eth_dev *dev;
1028 
1029 			dev = &rte_eth_devices[priv->port_id];
1030 			if (dev->data->dev_conf.intr_conf.rmv)
1031 				rte_eth_dev_callback_process(dev,
1032 					RTE_ETH_EVENT_INTR_RMV, NULL);
1033 		}
1034 
1035 		ibv_ack_async_event(&event);
1036 	}
1037 }
1038 
1039 static int
1040 mana_intr_uninstall(struct mana_priv *priv)
1041 {
1042 	int ret;
1043 
1044 	ret = rte_intr_callback_unregister(priv->intr_handle,
1045 					   mana_intr_handler, priv);
1046 	if (ret <= 0) {
1047 		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
1048 		return ret;
1049 	}
1050 
1051 	rte_intr_instance_free(priv->intr_handle);
1052 
1053 	return 0;
1054 }
1055 
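/* Set O_NONBLOCK on a file descriptor while preserving its other flags */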
1056 int
1057 mana_fd_set_non_blocking(int fd)
1058 {
1059 	int ret = fcntl(fd, F_GETFL);
1060 
1061 	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
1062 		return 0;
1063 
1064 	rte_errno = errno;
1065 	return -rte_errno;
1066 }
1067 
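/*
 * Register the IB async event fd with the EAL interrupt thread so that
 * device removal events can be reported to the application.
 */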
1068 static int
1069 mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
1070 {
1071 	int ret;
1072 	struct ibv_context *ctx = priv->ib_ctx;
1073 
1074 	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1075 	if (!priv->intr_handle) {
1076 		DRV_LOG(ERR, "Failed to allocate intr_handle");
1077 		rte_errno = ENOMEM;
1078 		return -ENOMEM;
1079 	}
1080 
1081 	ret = rte_intr_fd_set(priv->intr_handle, -1);
1082 	if (ret)
1083 		goto free_intr;
1084 
1085 	ret = mana_fd_set_non_blocking(ctx->async_fd);
1086 	if (ret) {
1087 		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
1088 		goto free_intr;
1089 	}
1090 
1091 	ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
1092 	if (ret)
1093 		goto free_intr;
1094 
1095 	ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
1096 	if (ret)
1097 		goto free_intr;
1098 
1099 	ret = rte_intr_callback_register(priv->intr_handle,
1100 					 mana_intr_handler, priv);
1101 	if (ret) {
1102 		DRV_LOG(ERR, "Failed to register intr callback");
1103 		rte_intr_fd_set(priv->intr_handle, -1);
1104 		goto free_intr;
1105 	}
1106 
1107 	eth_dev->intr_handle = priv->intr_handle;
1108 	return 0;
1109 
1110 free_intr:
1111 	rte_intr_instance_free(priv->intr_handle);
1112 	priv->intr_handle = NULL;
1113 
1114 	return ret;
1115 }
1116 
1117 static int
1118 mana_proc_priv_init(struct rte_eth_dev *dev)
1119 {
1120 	struct mana_process_priv *priv;
1121 
1122 	priv = rte_zmalloc_socket("mana_proc_priv",
1123 				  sizeof(struct mana_process_priv),
1124 				  RTE_CACHE_LINE_SIZE,
1125 				  dev->device->numa_node);
1126 	if (!priv)
1127 		return -ENOMEM;
1128 
1129 	dev->process_private = priv;
1130 	return 0;
1131 }
1132 
1133 /*
1134  * Map the doorbell page for the secondary process through the IB device handle.
1135  */
1136 static int
1137 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
1138 {
1139 	struct mana_process_priv *priv = eth_dev->process_private;
1140 
1141 	void *addr;
1142 
1143 	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
1144 	if (addr == MAP_FAILED) {
1145 		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
1146 			eth_dev->data->port_id);
1147 		return -ENOMEM;
1148 	}
1149 
1150 	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
1151 
1152 	priv->db_page = addr;
1153 
1154 	return 0;
1155 }
1156 
1157 /* Initialize shared data for the driver (all devices) */
1158 static int
1159 mana_init_shared_data(void)
1160 {
1161 	int ret = 0;
1162 	const struct rte_memzone *secondary_mz;
1163 
1164 	rte_spinlock_lock(&mana_shared_data_lock);
1165 
1166 	/* Skip if shared data is already initialized */
1167 	if (mana_shared_data)
1168 		goto exit;
1169 
1170 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1171 		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
1172 						     sizeof(*mana_shared_data),
1173 						     SOCKET_ID_ANY, 0);
1174 		if (!mana_shared_mz) {
1175 			DRV_LOG(ERR, "Cannot allocate mana shared data");
1176 			ret = -rte_errno;
1177 			goto exit;
1178 		}
1179 
1180 		mana_shared_data = mana_shared_mz->addr;
1181 		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
1182 		rte_spinlock_init(&mana_shared_data->lock);
1183 	} else {
1184 		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
1185 		if (!secondary_mz) {
1186 			DRV_LOG(ERR, "Cannot attach mana shared data");
1187 			ret = -rte_errno;
1188 			goto exit;
1189 		}
1190 
1191 		mana_shared_data = secondary_mz->addr;
1192 		memset(&mana_local_data, 0, sizeof(mana_local_data));
1193 	}
1194 
1195 exit:
1196 	rte_spinlock_unlock(&mana_shared_data_lock);
1197 
1198 	return ret;
1199 }
1200 
1201 /*
1202  * Init the data structures for use in primary and secondary processes.
1203  */
1204 static int
1205 mana_init_once(void)
1206 {
1207 	int ret;
1208 
1209 	ret = mana_init_shared_data();
1210 	if (ret)
1211 		return ret;
1212 
1213 	rte_spinlock_lock(&mana_shared_data->lock);
1214 
1215 	switch (rte_eal_process_type()) {
1216 	case RTE_PROC_PRIMARY:
1217 		if (mana_shared_data->init_done)
1218 			break;
1219 
1220 		ret = mana_mp_init_primary();
1221 		if (ret)
1222 			break;
1223 		DRV_LOG(DEBUG, "MP INIT PRIMARY");
1224 
1225 		mana_shared_data->init_done = 1;
1226 		break;
1227 
1228 	case RTE_PROC_SECONDARY:
1229 
1230 		if (mana_local_data.init_done)
1231 			break;
1232 
1233 		ret = mana_mp_init_secondary();
1234 		if (ret)
1235 			break;
1236 
1237 		DRV_LOG(DEBUG, "MP INIT SECONDARY");
1238 
1239 		mana_local_data.init_done = 1;
1240 		break;
1241 
1242 	default:
1243 		/* Impossible, internal error */
1244 		ret = -EPROTO;
1245 		break;
1246 	}
1247 
1248 	rte_spinlock_unlock(&mana_shared_data->lock);
1249 
1250 	return ret;
1251 }
1252 
1253 /*
1254  * Probe an IB port
1255  * Return value:
1256  * 0: successfully probed port
1258  * negative value: error code
1259  */
1260 static int
1261 mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
1262 		uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
1263 {
1264 	struct mana_priv *priv = NULL;
1265 	struct rte_eth_dev *eth_dev = NULL;
1266 	struct ibv_parent_domain_init_attr attr = {0};
1267 	char address[64];
1268 	char name[RTE_ETH_NAME_MAX_LEN];
1269 	int ret;
1270 	struct ibv_context *ctx = NULL;
1271 
1272 	rte_ether_format_addr(address, sizeof(address), addr);
1273 	DRV_LOG(INFO, "device located port %u address %s", port, address);
1274 
1275 	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
1276 				  SOCKET_ID_ANY);
1277 	if (!priv)
1278 		return -ENOMEM;
1279 
1280 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
1281 
1282 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1283 		int fd;
1284 
1285 		eth_dev = rte_eth_dev_attach_secondary(name);
1286 		if (!eth_dev) {
1287 			DRV_LOG(ERR, "Can't attach to dev %s", name);
1288 			ret = -ENOMEM;
1289 			goto failed;
1290 		}
1291 
1292 		eth_dev->device = &pci_dev->device;
1293 		eth_dev->dev_ops = &mana_dev_secondary_ops;
1294 		ret = mana_proc_priv_init(eth_dev);
1295 		if (ret)
1296 			goto failed;
1297 		priv->process_priv = eth_dev->process_private;
1298 
1299 		/* Get the IB FD from the primary process */
1300 		fd = mana_mp_req_verbs_cmd_fd(eth_dev);
1301 		if (fd < 0) {
1302 			DRV_LOG(ERR, "Failed to get FD %d", fd);
1303 			ret = -ENODEV;
1304 			goto failed;
1305 		}
1306 
1307 		ret = mana_map_doorbell_secondary(eth_dev, fd);
1308 		if (ret) {
1309 			DRV_LOG(ERR, "Failed secondary map %d", fd);
1310 			goto failed;
1311 		}
1312 
1313 		/* fd is not used after mapping doorbell */
1314 		close(fd);
1315 
1316 		eth_dev->tx_pkt_burst = mana_tx_burst;
1317 		eth_dev->rx_pkt_burst = mana_rx_burst;
1318 
1319 		rte_spinlock_lock(&mana_shared_data->lock);
1320 		mana_shared_data->secondary_cnt++;
1321 		mana_local_data.secondary_cnt++;
1322 		rte_spinlock_unlock(&mana_shared_data->lock);
1323 
1324 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1325 		rte_eth_dev_probing_finish(eth_dev);
1326 
1327 		return 0;
1328 	}
1329 
1330 	ctx = ibv_open_device(ibdev);
1331 	if (!ctx) {
1332 		DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
1333 		ret = -ENODEV;
1334 		goto failed;
1335 	}
1336 
1337 	eth_dev = rte_eth_dev_allocate(name);
1338 	if (!eth_dev) {
1339 		ret = -ENOMEM;
1340 		goto failed;
1341 	}
1342 
1343 	eth_dev->data->mac_addrs =
1344 		rte_calloc("mana_mac", 1,
1345 			   sizeof(struct rte_ether_addr), 0);
1346 	if (!eth_dev->data->mac_addrs) {
1347 		ret = -ENOMEM;
1348 		goto failed;
1349 	}
1350 
1351 	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
1352 
1353 	priv->ib_pd = ibv_alloc_pd(ctx);
1354 	if (!priv->ib_pd) {
1355 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
1356 		ret = -ENOMEM;
1357 		goto failed;
1358 	}
1359 
1360 	/* Create a parent domain with the port number */
1361 	attr.pd = priv->ib_pd;
1362 	attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
1363 	attr.pd_context = (void *)(uintptr_t)port;
1364 	priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
1365 	if (!priv->ib_parent_pd) {
1366 		DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
1367 		ret = -ENOMEM;
1368 		goto failed;
1369 	}
1370 
1371 	priv->ib_ctx = ctx;
1372 	priv->port_id = eth_dev->data->port_id;
1373 	priv->dev_port = port;
1374 	eth_dev->data->dev_private = priv;
1375 	priv->dev_data = eth_dev->data;
1376 
1377 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
1378 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
1379 
1380 	priv->max_rx_desc =
1381 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1382 			dev_attr->orig_attr.max_cqe);
1383 	priv->max_tx_desc =
1384 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1385 			dev_attr->orig_attr.max_cqe);
1386 
1387 	priv->max_send_sge = dev_attr->orig_attr.max_sge;
1388 	priv->max_recv_sge = dev_attr->orig_attr.max_sge;
1389 
1390 	priv->max_mr = dev_attr->orig_attr.max_mr;
1391 	priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
1392 
1393 	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d mr %" PRIu64,
1394 		name, priv->max_rx_queues, priv->max_rx_desc,
1395 		priv->max_send_sge, priv->max_mr_size);
1396 
1397 	rte_eth_copy_pci_info(eth_dev, pci_dev);
1398 
1399 	/* Create async interrupt handler */
1400 	ret = mana_intr_install(eth_dev, priv);
1401 	if (ret) {
1402 		DRV_LOG(ERR, "Failed to install intr handler");
1403 		goto failed;
1404 	}
1405 
1406 	rte_spinlock_lock(&mana_shared_data->lock);
1407 	mana_shared_data->primary_cnt++;
1408 	rte_spinlock_unlock(&mana_shared_data->lock);
1409 
1410 	eth_dev->device = &pci_dev->device;
1411 
1412 	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
1413 
1414 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1415 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1416 	eth_dev->dev_ops = &mana_dev_ops;
1417 
1418 	rte_eth_dev_probing_finish(eth_dev);
1419 
1420 	return 0;
1421 
1422 failed:
1423 	/* Free the resources allocated for the failed port */
1424 	if (priv) {
1425 		if (priv->ib_parent_pd)
1426 			ibv_dealloc_pd(priv->ib_parent_pd);
1427 
1428 		if (priv->ib_pd)
1429 			ibv_dealloc_pd(priv->ib_pd);
1430 	}
1431 
1432 	if (eth_dev)
1433 		rte_eth_dev_release_port(eth_dev);
1434 
1435 	rte_free(priv);
1436 
1437 	if (ctx)
1438 		ibv_close_device(ctx);
1439 
1440 	return ret;
1441 }
1442 
1443 /*
1444  * Go through the IB device list to look for the IB ports matching the
1445  * mac_addr (all ports if mac_addr is NULL). Create a rte_eth_dev for each match.
1446  * Return value: number of successfully probed devices
1447  */
1448 static int
1449 mana_pci_probe_mac(struct rte_pci_device *pci_dev,
1450 		   struct rte_ether_addr *mac_addr)
1451 {
1452 	struct ibv_device **ibv_list;
1453 	int ibv_idx;
1454 	struct ibv_context *ctx;
1455 	int num_devices;
1456 	int ret;
1457 	uint8_t port;
1458 	int count = 0;
1459 
1460 	ibv_list = ibv_get_device_list(&num_devices);
1461 	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
1462 		struct ibv_device *ibdev = ibv_list[ibv_idx];
1463 		struct rte_pci_addr pci_addr;
1464 		struct ibv_device_attr_ex dev_attr;
1465 
1466 		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
1467 			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
1468 
1469 		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
1470 			continue;
1471 
1472 		/* Ignore if this IB device is not this PCI device */
1473 		if (rte_pci_addr_cmp(&pci_dev->addr, &pci_addr) != 0)
1474 			continue;
1475 
1476 		ctx = ibv_open_device(ibdev);
1477 		if (!ctx) {
1478 			DRV_LOG(ERR, "Failed to open IB device %s",
1479 				ibdev->name);
1480 			continue;
1481 		}
1482 		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
1483 		ibv_close_device(ctx);
1484 
1485 		if (ret) {
1486 			DRV_LOG(ERR, "Failed to query IB device %s",
1487 				ibdev->name);
1488 			continue;
1489 		}
1490 
1491 		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
1492 		     port++) {
1493 			struct rte_ether_addr addr;
1494 			ret = get_port_mac(ibdev, port, &addr);
1495 			if (ret)
1496 				continue;
1497 
1498 			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
1499 				continue;
1500 
1501 			ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
1502 			if (ret) {
1503 				DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
1504 			} else {
1505 				count++;
1506 				DRV_LOG(INFO, "Successfully probed on IB port %u", port);
1507 			}
1508 		}
1509 	}
1510 
1511 	ibv_free_device_list(ibv_list);
1512 	return count;
1513 }
1514 
1515 /*
1516  * Main callback function from PCI bus to probe a device.
1517  */
1518 static int
1519 mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1520 	       struct rte_pci_device *pci_dev)
1521 {
1522 	struct rte_devargs *args = pci_dev->device.devargs;
1523 	struct mana_conf conf = {0};
1524 	unsigned int i;
1525 	int ret;
1526 	int count = 0;
1527 
1528 	if (args && args->drv_str) {
1529 		ret = mana_parse_args(args, &conf);
1530 		if (ret) {
1531 			DRV_LOG(ERR, "Failed to parse parameters args = %s",
1532 				args->drv_str);
1533 			return ret;
1534 		}
1535 	}
1536 
1537 	ret = mana_init_once();
1538 	if (ret) {
1539 		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
1540 		return ret;
1541 	}
1542 
1543 	/* Probe the ports matching the devargs MAC addresses, or all ports if none were given */
1544 	if (conf.index) {
1545 		for (i = 0; i < conf.index; i++)
1546 			count += mana_pci_probe_mac(pci_dev,
1547 						    &conf.mac_array[i]);
1548 	} else {
1549 		count = mana_pci_probe_mac(pci_dev, NULL);
1550 	}
1551 
1552 	if (!count) {
1553 		rte_memzone_free(mana_shared_mz);
1554 		mana_shared_mz = NULL;
1555 		ret = -ENODEV;
1556 	}
1557 
1558 	return ret;
1559 }
1560 
1561 static int
1562 mana_dev_uninit(struct rte_eth_dev *dev)
1563 {
1564 	return mana_dev_close(dev);
1565 }
1566 
1567 /*
1568  * Callback from PCI to remove this device.
1569  */
1570 static int
1571 mana_pci_remove(struct rte_pci_device *pci_dev)
1572 {
1573 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1574 		rte_spinlock_lock(&mana_shared_data_lock);
1575 
1576 		rte_spinlock_lock(&mana_shared_data->lock);
1577 
1578 		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
1579 		mana_shared_data->primary_cnt--;
1580 		if (!mana_shared_data->primary_cnt) {
1581 			DRV_LOG(DEBUG, "mp uninit primary");
1582 			mana_mp_uninit_primary();
1583 		}
1584 
1585 		rte_spinlock_unlock(&mana_shared_data->lock);
1586 
1587 		/* Also free the shared memory if this was the last primary process */
1588 		if (!mana_shared_data->primary_cnt) {
1589 			DRV_LOG(DEBUG, "free shared memzone data");
1590 			rte_memzone_free(mana_shared_mz);
1591 			mana_shared_mz = NULL;
1592 		}
1593 
1594 		rte_spinlock_unlock(&mana_shared_data_lock);
1595 	} else {
1596 		rte_spinlock_lock(&mana_shared_data_lock);
1597 
1598 		rte_spinlock_lock(&mana_shared_data->lock);
1599 		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
1600 		mana_shared_data->secondary_cnt--;
1601 		rte_spinlock_unlock(&mana_shared_data->lock);
1602 
1603 		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
1604 		mana_local_data.secondary_cnt--;
1605 		if (!mana_local_data.secondary_cnt) {
1606 			DRV_LOG(DEBUG, "mp uninit secondary");
1607 			mana_mp_uninit_secondary();
1608 		}
1609 
1610 		rte_spinlock_unlock(&mana_shared_data_lock);
1611 	}
1612 
1613 	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
1614 }
1615 
1616 static const struct rte_pci_id mana_pci_id_map[] = {
1617 	{
1618 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
1619 			       PCI_DEVICE_ID_MICROSOFT_MANA)
1620 	},
1621 	{
1622 		.vendor_id = 0
1623 	},
1624 };
1625 
1626 static struct rte_pci_driver mana_pci_driver = {
1627 	.id_table = mana_pci_id_map,
1628 	.probe = mana_pci_probe,
1629 	.remove = mana_pci_remove,
1630 	.drv_flags = RTE_PCI_DRV_INTR_RMV,
1631 };
1632 
1633 RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
1634 RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
1635 RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
1636 RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
1637 RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
1638 RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
1639