1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright 2022 Microsoft Corporation
3  */
4 
5 #include <unistd.h>
6 #include <dirent.h>
7 #include <fcntl.h>
8 #include <sys/mman.h>
9 #include <sys/ioctl.h>
10 #include <net/if.h>
11 
12 #include <ethdev_driver.h>
13 #include <ethdev_pci.h>
14 #include <rte_kvargs.h>
15 #include <rte_eal_paging.h>
16 #include <rte_pci.h>
17 
18 #include <infiniband/verbs.h>
19 #include <infiniband/manadv.h>
20 
21 #include <assert.h>
22 
23 #include "mana.h"
24 
25 /* Shared memory between primary/secondary processes, per driver */
26 /* Data to track primary/secondary usage */
27 struct mana_shared_data *mana_shared_data;
28 static struct mana_shared_data mana_local_data;
29 
30 /* The memory region for the above data */
31 static const struct rte_memzone *mana_shared_mz;
32 static const char *MZ_MANA_SHARED_DATA = "mana_shared_data";
33 
34 /* Spinlock for mana_shared_data */
35 static rte_spinlock_t mana_shared_data_lock = RTE_SPINLOCK_INITIALIZER;
36 
37 /* Allocate a buffer on the stack sized to fit the printf-formatted string and fill it in. */
38 #define MANA_MKSTR(name, ...) \
39 	int mkstr_size_##name = snprintf(NULL, 0, "" __VA_ARGS__); \
40 	char name[mkstr_size_##name + 1]; \
41 	\
42 	memset(name, 0, mkstr_size_##name + 1); \
43 	snprintf(name, sizeof(name), "" __VA_ARGS__)
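
/*
 * Example (illustrative): MANA_MKSTR(path, "%s/device/net", ibdev_path)
 * declares a stack buffer named "path" just large enough to hold the
 * formatted string and fills it in.
 */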
44 
45 int mana_logtype_driver;
46 int mana_logtype_init;
47 
48 /*
49  * Callback from rdma-core to allocate a buffer for a queue.
50  */
51 void *
52 mana_alloc_verbs_buf(size_t size, void *data)
53 {
54 	void *ret;
55 	size_t alignment = rte_mem_page_size();
56 	int socket = (int)(uintptr_t)data;
57 
58 	DRV_LOG(DEBUG, "size=%zu socket=%d", size, socket);
59 
60 	if (alignment == (size_t)-1) {
61 		DRV_LOG(ERR, "Failed to get mem page size");
62 		rte_errno = ENOMEM;
63 		return NULL;
64 	}
65 
66 	ret = rte_zmalloc_socket("mana_verb_buf", size, alignment, socket);
67 	if (!ret && size)
68 		rte_errno = ENOMEM;
69 	return ret;
70 }
71 
72 void
73 mana_free_verbs_buf(void *ptr, void *data __rte_unused)
74 {
75 	rte_free(ptr);
76 }
77 
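/*
 * Check the requested queue configuration (equal, power-of-2 RX/TX queue
 * counts), record the RSS/VLAN settings and register the rdma-core buffer
 * allocators so queue memory is allocated through DPDK.
 */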
78 static int
79 mana_dev_configure(struct rte_eth_dev *dev)
80 {
81 	struct mana_priv *priv = dev->data->dev_private;
82 	struct rte_eth_conf *dev_conf = &dev->data->dev_conf;
83 
84 	if (dev_conf->rxmode.mq_mode & RTE_ETH_MQ_RX_RSS_FLAG)
85 		dev_conf->rxmode.offloads |= RTE_ETH_RX_OFFLOAD_RSS_HASH;
86 
87 	if (dev->data->nb_rx_queues != dev->data->nb_tx_queues) {
88 		DRV_LOG(ERR, "Only an equal number of RX/TX queues is supported");
89 		return -EINVAL;
90 	}
91 
92 	if (!rte_is_power_of_2(dev->data->nb_rx_queues)) {
93 		DRV_LOG(ERR, "Number of TX/RX queues must be a power of 2");
94 		return -EINVAL;
95 	}
96 
97 	priv->vlan_strip = !!(dev_conf->rxmode.offloads &
98 			      RTE_ETH_RX_OFFLOAD_VLAN_STRIP);
99 
100 	priv->num_queues = dev->data->nb_rx_queues;
101 
102 	manadv_set_context_attr(priv->ib_ctx, MANADV_CTX_ATTR_BUF_ALLOCATORS,
103 				(void *)((uintptr_t)&(struct manadv_ctx_allocators){
104 					.alloc = &mana_alloc_verbs_buf,
105 					.free = &mana_free_verbs_buf,
106 					.data = 0,
107 				}));
108 
109 	return 0;
110 }
111 
112 static void
113 rx_intr_vec_disable(struct mana_priv *priv)
114 {
115 	struct rte_intr_handle *intr_handle = priv->intr_handle;
116 
117 	rte_intr_free_epoll_fd(intr_handle);
118 	rte_intr_vec_list_free(intr_handle);
119 	rte_intr_nb_efd_set(intr_handle, 0);
120 }
121 
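/* Map each RX queue's event channel fd to an ethdev interrupt vector. */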
122 static int
123 rx_intr_vec_enable(struct mana_priv *priv)
124 {
125 	unsigned int i;
126 	unsigned int rxqs_n = priv->dev_data->nb_rx_queues;
127 	unsigned int n = RTE_MIN(rxqs_n, (uint32_t)RTE_MAX_RXTX_INTR_VEC_ID);
128 	struct rte_intr_handle *intr_handle = priv->intr_handle;
129 	int ret;
130 
131 	rx_intr_vec_disable(priv);
132 
133 	if (rte_intr_vec_list_alloc(intr_handle, NULL, n)) {
134 		DRV_LOG(ERR, "Failed to allocate memory for interrupt vector");
135 		return -ENOMEM;
136 	}
137 
138 	for (i = 0; i < n; i++) {
139 		struct mana_rxq *rxq = priv->dev_data->rx_queues[i];
140 
141 		ret = rte_intr_vec_list_index_set(intr_handle, i,
142 						  RTE_INTR_VEC_RXTX_OFFSET + i);
143 		if (ret) {
144 			DRV_LOG(ERR, "Failed to set intr vec %u", i);
145 			return ret;
146 		}
147 
148 		ret = rte_intr_efds_index_set(intr_handle, i, rxq->channel->fd);
149 		if (ret) {
150 			DRV_LOG(ERR, "Failed to set FD at intr %u", i);
151 			return ret;
152 		}
153 	}
154 
155 	return rte_intr_nb_efd_set(intr_handle, n);
156 }
157 
158 static void
159 rxq_intr_disable(struct mana_priv *priv)
160 {
161 	int err = rte_errno;
162 
163 	rx_intr_vec_disable(priv);
164 	rte_errno = err;
165 }
166 
167 static int
168 rxq_intr_enable(struct mana_priv *priv)
169 {
170 	const struct rte_eth_intr_conf *const intr_conf =
171 		&priv->dev_data->dev_conf.intr_conf;
172 
173 	if (!intr_conf->rxq)
174 		return 0;
175 
176 	return rx_intr_vec_enable(priv);
177 }
178 
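/*
 * Start the device: set up the per-device MR btree, start the TX/RX queues,
 * switch to the real burst functions and enable RX interrupts if requested.
 */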
179 static int
180 mana_dev_start(struct rte_eth_dev *dev)
181 {
182 	int ret;
183 	struct mana_priv *priv = dev->data->dev_private;
184 
185 	rte_spinlock_init(&priv->mr_btree_lock);
186 	ret = mana_mr_btree_init(&priv->mr_btree, MANA_MR_BTREE_CACHE_N,
187 				 dev->device->numa_node);
188 	if (ret) {
189 		DRV_LOG(ERR, "Failed to init device MR btree %d", ret);
190 		return ret;
191 	}
192 
193 	ret = mana_start_tx_queues(dev);
194 	if (ret) {
195 		DRV_LOG(ERR, "failed to start tx queues %d", ret);
196 		goto failed_tx;
197 	}
198 
199 	ret = mana_start_rx_queues(dev);
200 	if (ret) {
201 		DRV_LOG(ERR, "failed to start rx queues %d", ret);
202 		goto failed_rx;
203 	}
204 
205 	rte_wmb();
206 
207 	dev->tx_pkt_burst = mana_tx_burst;
208 	dev->rx_pkt_burst = mana_rx_burst;
209 
210 	DRV_LOG(INFO, "TX/RX queues have started");
211 
212 	/* Enable datapath for secondary processes */
213 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_START_RXTX);
214 
215 	ret = rxq_intr_enable(priv);
216 	if (ret) {
217 		DRV_LOG(ERR, "Failed to enable RX interrupts");
218 		goto failed_intr;
219 	}
220 
221 	return 0;
222 
223 failed_intr:
224 	mana_stop_rx_queues(dev);
225 
226 failed_rx:
227 	mana_stop_tx_queues(dev);
228 
229 failed_tx:
230 	mana_mr_btree_free(&priv->mr_btree);
231 
232 	return ret;
233 }
234 
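/* Stop the datapath on all processes and tear down the TX/RX queues. */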
235 static int
236 mana_dev_stop(struct rte_eth_dev *dev)
237 {
238 	int ret;
239 	struct mana_priv *priv = dev->data->dev_private;
240 
241 	rxq_intr_disable(priv);
242 
243 	dev->tx_pkt_burst = mana_tx_burst_removed;
244 	dev->rx_pkt_burst = mana_rx_burst_removed;
245 
246 	/* Stop datapath on secondary processes */
247 	mana_mp_req_on_rxtx(dev, MANA_MP_REQ_STOP_RXTX);
248 
249 	rte_wmb();
250 
251 	ret = mana_stop_tx_queues(dev);
252 	if (ret) {
253 		DRV_LOG(ERR, "failed to stop tx queues");
254 		return ret;
255 	}
256 
257 	ret = mana_stop_rx_queues(dev);
258 	if (ret) {
259 		DRV_LOG(ERR, "failed to stop rx queues");
260 		return ret;
261 	}
262 
263 	return 0;
264 }
265 
266 static int mana_intr_uninstall(struct mana_priv *priv);
267 
268 static int
269 mana_dev_close(struct rte_eth_dev *dev)
270 {
271 	struct mana_priv *priv = dev->data->dev_private;
272 	int ret;
273 
274 	mana_remove_all_mr(priv);
275 
276 	ret = mana_intr_uninstall(priv);
277 	if (ret)
278 		return ret;
279 
280 	ret = ibv_close_device(priv->ib_ctx);
281 	if (ret) {
282 		ret = errno;
283 		return ret;
284 	}
285 
286 	return 0;
287 }
288 
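/* Report device capabilities, limits and default configuration to ethdev. */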
289 static int
290 mana_dev_info_get(struct rte_eth_dev *dev,
291 		  struct rte_eth_dev_info *dev_info)
292 {
293 	struct mana_priv *priv = dev->data->dev_private;
294 
295 	dev_info->min_mtu = RTE_ETHER_MIN_MTU;
296 	dev_info->max_mtu = MANA_MAX_MTU;
297 
298 	/* RX params */
299 	dev_info->min_rx_bufsize = MIN_RX_BUF_SIZE;
300 	dev_info->max_rx_pktlen = MANA_MAX_MTU + RTE_ETHER_HDR_LEN;
301 
302 	dev_info->max_rx_queues = RTE_MIN(priv->max_rx_queues, UINT16_MAX);
303 	dev_info->max_tx_queues = RTE_MIN(priv->max_tx_queues, UINT16_MAX);
304 
305 	dev_info->max_mac_addrs = MANA_MAX_MAC_ADDR;
306 	dev_info->max_hash_mac_addrs = 0;
307 
308 	dev_info->max_vfs = 1;
309 
310 	/* Offload params */
311 	dev_info->rx_offload_capa = MANA_DEV_RX_OFFLOAD_SUPPORT;
312 
313 	dev_info->tx_offload_capa = MANA_DEV_TX_OFFLOAD_SUPPORT;
314 
315 	/* RSS */
316 	dev_info->reta_size = INDIRECTION_TABLE_NUM_ELEMENTS;
317 	dev_info->hash_key_size = TOEPLITZ_HASH_KEY_SIZE_IN_BYTES;
318 	dev_info->flow_type_rss_offloads = MANA_ETH_RSS_SUPPORT;
319 
320 	/* Thresholds */
321 	dev_info->default_rxconf = (struct rte_eth_rxconf){
322 		.rx_thresh = {
323 			.pthresh = 8,
324 			.hthresh = 8,
325 			.wthresh = 0,
326 		},
327 		.rx_free_thresh = 32,
328 		/* If no descriptors available, pkts are dropped by default */
329 		.rx_drop_en = 1,
330 	};
331 
332 	dev_info->default_txconf = (struct rte_eth_txconf){
333 		.tx_thresh = {
334 			.pthresh = 32,
335 			.hthresh = 0,
336 			.wthresh = 0,
337 		},
338 		.tx_rs_thresh = 32,
339 		.tx_free_thresh = 32,
340 	};
341 
342 	/* Buffer limits */
343 	dev_info->rx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
344 	dev_info->rx_desc_lim.nb_max = RTE_MIN(priv->max_rx_desc, UINT16_MAX);
345 	dev_info->rx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
346 	dev_info->rx_desc_lim.nb_seg_max =
347 		RTE_MIN(priv->max_recv_sge, UINT16_MAX);
348 	dev_info->rx_desc_lim.nb_mtu_seg_max =
349 		RTE_MIN(priv->max_recv_sge, UINT16_MAX);
350 
351 	dev_info->tx_desc_lim.nb_min = MIN_BUFFERS_PER_QUEUE;
352 	dev_info->tx_desc_lim.nb_max = RTE_MIN(priv->max_tx_desc, UINT16_MAX);
353 	dev_info->tx_desc_lim.nb_align = MIN_BUFFERS_PER_QUEUE;
354 	dev_info->tx_desc_lim.nb_seg_max =
355 		RTE_MIN(priv->max_send_sge, UINT16_MAX);
356 	dev_info->tx_desc_lim.nb_mtu_seg_max =
357 		RTE_MIN(priv->max_send_sge, UINT16_MAX);
358 
359 	/* Speed */
360 	dev_info->speed_capa = RTE_ETH_LINK_SPEED_100G;
361 
362 	/* RX params */
363 	dev_info->default_rxportconf.burst_size = 1;
364 	dev_info->default_rxportconf.ring_size = MAX_RECEIVE_BUFFERS_PER_QUEUE;
365 	dev_info->default_rxportconf.nb_queues = 1;
366 
367 	/* TX params */
368 	dev_info->default_txportconf.burst_size = 1;
369 	dev_info->default_txportconf.ring_size = MAX_SEND_BUFFERS_PER_QUEUE;
370 	dev_info->default_txportconf.nb_queues = 1;
371 
372 	return 0;
373 }
374 
375 static void
376 mana_dev_tx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
377 		       struct rte_eth_txq_info *qinfo)
378 {
379 	struct mana_txq *txq = dev->data->tx_queues[queue_id];
380 
381 	qinfo->conf.offloads = dev->data->dev_conf.txmode.offloads;
382 	qinfo->nb_desc = txq->num_desc;
383 }
384 
385 static void
386 mana_dev_rx_queue_info(struct rte_eth_dev *dev, uint16_t queue_id,
387 		       struct rte_eth_rxq_info *qinfo)
388 {
389 	struct mana_rxq *rxq = dev->data->rx_queues[queue_id];
390 
391 	qinfo->mp = rxq->mp;
392 	qinfo->nb_desc = rxq->num_desc;
393 	qinfo->conf.offloads = dev->data->dev_conf.rxmode.offloads;
394 }
395 
396 static const uint32_t *
397 mana_supported_ptypes(struct rte_eth_dev *dev __rte_unused,
398 		      size_t *no_of_elements)
399 {
400 	static const uint32_t ptypes[] = {
401 		RTE_PTYPE_L2_ETHER,
402 		RTE_PTYPE_L3_IPV4_EXT_UNKNOWN,
403 		RTE_PTYPE_L3_IPV6_EXT_UNKNOWN,
404 		RTE_PTYPE_L4_FRAG,
405 		RTE_PTYPE_L4_TCP,
406 		RTE_PTYPE_L4_UDP,
407 	};
408 
409 	*no_of_elements = RTE_DIM(ptypes);
410 	return ptypes;
411 }
412 
413 static int
414 mana_rss_hash_update(struct rte_eth_dev *dev,
415 		     struct rte_eth_rss_conf *rss_conf)
416 {
417 	struct mana_priv *priv = dev->data->dev_private;
418 
419 	/* Currently can only update RSS hash when device is stopped */
420 	if (dev->data->dev_started) {
421 		DRV_LOG(ERR, "Can't update RSS after device has started");
422 		return -ENODEV;
423 	}
424 
425 	if (rss_conf->rss_hf & ~MANA_ETH_RSS_SUPPORT) {
426 		DRV_LOG(ERR, "Port %u invalid RSS HF 0x%" PRIx64,
427 			dev->data->port_id, rss_conf->rss_hf);
428 		return -EINVAL;
429 	}
430 
431 	if (rss_conf->rss_key && rss_conf->rss_key_len) {
432 		if (rss_conf->rss_key_len != TOEPLITZ_HASH_KEY_SIZE_IN_BYTES) {
433 			DRV_LOG(ERR, "Port %u key len must be %u long",
434 				dev->data->port_id,
435 				TOEPLITZ_HASH_KEY_SIZE_IN_BYTES);
436 			return -EINVAL;
437 		}
438 
439 		priv->rss_conf.rss_key_len = rss_conf->rss_key_len;
440 		priv->rss_conf.rss_key =
441 			rte_zmalloc("mana_rss", rss_conf->rss_key_len,
442 				    RTE_CACHE_LINE_SIZE);
443 		if (!priv->rss_conf.rss_key)
444 			return -ENOMEM;
445 		memcpy(priv->rss_conf.rss_key, rss_conf->rss_key,
446 		       rss_conf->rss_key_len);
447 	}
448 	priv->rss_conf.rss_hf = rss_conf->rss_hf;
449 
450 	return 0;
451 }
452 
453 static int
454 mana_rss_hash_conf_get(struct rte_eth_dev *dev,
455 		       struct rte_eth_rss_conf *rss_conf)
456 {
457 	struct mana_priv *priv = dev->data->dev_private;
458 
459 	if (!rss_conf)
460 		return -EINVAL;
461 
462 	if (rss_conf->rss_key &&
463 	    rss_conf->rss_key_len >= priv->rss_conf.rss_key_len) {
464 		memcpy(rss_conf->rss_key, priv->rss_conf.rss_key,
465 		       priv->rss_conf.rss_key_len);
466 	}
467 
468 	rss_conf->rss_key_len = priv->rss_conf.rss_key_len;
469 	rss_conf->rss_hf = priv->rss_conf.rss_hf;
470 
471 	return 0;
472 }
473 
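/*
 * Allocate the TX queue and its software rings (descriptor ring, GDMA
 * completion buffer, per-queue MR btree). Hardware queues are created
 * later in mana_dev_start().
 */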
474 static int
475 mana_dev_tx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
476 			uint16_t nb_desc, unsigned int socket_id,
477 			const struct rte_eth_txconf *tx_conf __rte_unused)
478 
479 {
480 	struct mana_priv *priv = dev->data->dev_private;
481 	struct mana_txq *txq;
482 	int ret;
483 
484 	txq = rte_zmalloc_socket("mana_txq", sizeof(*txq), 0, socket_id);
485 	if (!txq) {
486 		DRV_LOG(ERR, "failed to allocate txq");
487 		return -ENOMEM;
488 	}
489 
490 	txq->socket = socket_id;
491 
492 	txq->desc_ring = rte_malloc_socket("mana_tx_desc_ring",
493 					   sizeof(struct mana_txq_desc) *
494 						nb_desc,
495 					   RTE_CACHE_LINE_SIZE, socket_id);
496 	if (!txq->desc_ring) {
497 		DRV_LOG(ERR, "failed to allocate txq desc_ring");
498 		ret = -ENOMEM;
499 		goto fail;
500 	}
501 
502 	txq->gdma_comp_buf = rte_malloc_socket("mana_txq_comp",
503 			sizeof(*txq->gdma_comp_buf) * nb_desc,
504 			RTE_CACHE_LINE_SIZE, socket_id);
505 	if (!txq->gdma_comp_buf) {
506 		DRV_LOG(ERR, "failed to allocate txq comp");
507 		ret = -ENOMEM;
508 		goto fail;
509 	}
510 
511 	ret = mana_mr_btree_init(&txq->mr_btree,
512 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
513 	if (ret) {
514 		DRV_LOG(ERR, "Failed to init TXQ MR btree");
515 		goto fail;
516 	}
517 
518 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u txq->desc_ring %p",
519 		queue_idx, nb_desc, socket_id, txq->desc_ring);
520 
521 	txq->desc_ring_head = 0;
522 	txq->desc_ring_tail = 0;
523 	txq->priv = priv;
524 	txq->num_desc = nb_desc;
525 	dev->data->tx_queues[queue_idx] = txq;
526 
527 	return 0;
528 
529 fail:
530 	rte_free(txq->gdma_comp_buf);
531 	rte_free(txq->desc_ring);
532 	rte_free(txq);
533 	return ret;
534 }
535 
536 static void
537 mana_dev_tx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
538 {
539 	struct mana_txq *txq = dev->data->tx_queues[qid];
540 
541 	mana_mr_btree_free(&txq->mr_btree);
542 
543 	rte_free(txq->gdma_comp_buf);
544 	rte_free(txq->desc_ring);
545 	rte_free(txq);
546 }
547 
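/*
 * Allocate the RX queue and its software rings (descriptor ring, GDMA
 * completion buffer, per-queue MR btree). Hardware queues are created
 * later in mana_dev_start().
 */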
548 static int
549 mana_dev_rx_queue_setup(struct rte_eth_dev *dev, uint16_t queue_idx,
550 			uint16_t nb_desc, unsigned int socket_id,
551 			const struct rte_eth_rxconf *rx_conf __rte_unused,
552 			struct rte_mempool *mp)
553 {
554 	struct mana_priv *priv = dev->data->dev_private;
555 	struct mana_rxq *rxq;
556 	int ret;
557 
558 	rxq = rte_zmalloc_socket("mana_rxq", sizeof(*rxq), 0, socket_id);
559 	if (!rxq) {
560 		DRV_LOG(ERR, "failed to allocate rxq");
561 		return -ENOMEM;
562 	}
563 
564 	DRV_LOG(DEBUG, "idx %u nb_desc %u socket %u",
565 		queue_idx, nb_desc, socket_id);
566 
567 	rxq->socket = socket_id;
568 
569 	rxq->desc_ring = rte_zmalloc_socket("mana_rx_mbuf_ring",
570 					    sizeof(struct mana_rxq_desc) *
571 						nb_desc,
572 					    RTE_CACHE_LINE_SIZE, socket_id);
573 
574 	if (!rxq->desc_ring) {
575 		DRV_LOG(ERR, "failed to allocate rxq desc_ring");
576 		ret = -ENOMEM;
577 		goto fail;
578 	}
579 
580 	rxq->desc_ring_head = 0;
581 	rxq->desc_ring_tail = 0;
582 
583 	rxq->gdma_comp_buf = rte_malloc_socket("mana_rxq_comp",
584 			sizeof(*rxq->gdma_comp_buf) * nb_desc,
585 			RTE_CACHE_LINE_SIZE, socket_id);
586 	if (!rxq->gdma_comp_buf) {
587 		DRV_LOG(ERR, "failed to allocate rxq comp");
588 		ret = -ENOMEM;
589 		goto fail;
590 	}
591 
592 	ret = mana_mr_btree_init(&rxq->mr_btree,
593 				 MANA_MR_BTREE_PER_QUEUE_N, socket_id);
594 	if (ret) {
595 		DRV_LOG(ERR, "Failed to init RXQ MR btree");
596 		goto fail;
597 	}
598 
599 	rxq->priv = priv;
600 	rxq->num_desc = nb_desc;
601 	rxq->mp = mp;
602 	dev->data->rx_queues[queue_idx] = rxq;
603 
604 	return 0;
605 
606 fail:
607 	rte_free(rxq->gdma_comp_buf);
608 	rte_free(rxq->desc_ring);
609 	rte_free(rxq);
610 	return ret;
611 }
612 
613 static void
614 mana_dev_rx_queue_release(struct rte_eth_dev *dev, uint16_t qid)
615 {
616 	struct mana_rxq *rxq = dev->data->rx_queues[qid];
617 
618 	mana_mr_btree_free(&rxq->mr_btree);
619 
620 	rte_free(rxq->gdma_comp_buf);
621 	rte_free(rxq->desc_ring);
622 	rte_free(rxq);
623 }
624 
625 static int
626 mana_dev_link_update(struct rte_eth_dev *dev,
627 		     int wait_to_complete __rte_unused)
628 {
629 	struct rte_eth_link link;
630 
631 	/* MANA has no concept of carrier state, always reporting UP */
632 	link = (struct rte_eth_link) {
633 		.link_duplex = RTE_ETH_LINK_FULL_DUPLEX,
634 		.link_autoneg = RTE_ETH_LINK_FIXED,
635 		.link_speed = RTE_ETH_SPEED_NUM_100G,
636 		.link_status = RTE_ETH_LINK_UP,
637 	};
638 
639 	return rte_eth_linkstatus_set(dev, &link);
640 }
641 
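/* Sum the per-queue software counters into the ethdev statistics. */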
642 static int
643 mana_dev_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
644 {
645 	unsigned int i;
646 
647 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
648 		struct mana_txq *txq = dev->data->tx_queues[i];
649 
650 		if (!txq)
651 			continue;
652 
653 		stats->opackets += txq->stats.packets;
654 		stats->obytes += txq->stats.bytes;
655 		stats->oerrors += txq->stats.errors;
656 
657 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
658 			stats->q_opackets[i] = txq->stats.packets;
659 			stats->q_obytes[i] = txq->stats.bytes;
660 		}
661 	}
662 
663 	stats->rx_nombuf = 0;
664 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
665 		struct mana_rxq *rxq = dev->data->rx_queues[i];
666 
667 		if (!rxq)
668 			continue;
669 
670 		stats->ipackets += rxq->stats.packets;
671 		stats->ibytes += rxq->stats.bytes;
672 		stats->ierrors += rxq->stats.errors;
673 
674 		/* There is no good way to get stats->imissed, not setting it */
675 
676 		if (i < RTE_ETHDEV_QUEUE_STAT_CNTRS) {
677 			stats->q_ipackets[i] = rxq->stats.packets;
678 			stats->q_ibytes[i] = rxq->stats.bytes;
679 		}
680 
681 		stats->rx_nombuf += rxq->stats.nombuf;
682 	}
683 
684 	return 0;
685 }
686 
687 static int
688 mana_dev_stats_reset(struct rte_eth_dev *dev)
689 {
690 	unsigned int i;
691 
692 	PMD_INIT_FUNC_TRACE();
693 
694 	for (i = 0; i < dev->data->nb_tx_queues; i++) {
695 		struct mana_txq *txq = dev->data->tx_queues[i];
696 
697 		if (!txq)
698 			continue;
699 
700 		memset(&txq->stats, 0, sizeof(txq->stats));
701 	}
702 
703 	for (i = 0; i < dev->data->nb_rx_queues; i++) {
704 		struct mana_rxq *rxq = dev->data->rx_queues[i];
705 
706 		if (!rxq)
707 			continue;
708 
709 		memset(&rxq->stats, 0, sizeof(rxq->stats));
710 	}
711 
712 	return 0;
713 }
714 
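/*
 * Find the kernel netdev name for this port by matching its MAC address
 * against the "address" entries under <ibdev_path>/device/net/ in sysfs.
 */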
715 static int
716 mana_get_ifname(const struct mana_priv *priv, char (*ifname)[IF_NAMESIZE])
717 {
718 	int ret = -ENODEV;
719 	DIR *dir;
720 	struct dirent *dent;
721 
722 	MANA_MKSTR(dirpath, "%s/device/net", priv->ib_ctx->device->ibdev_path);
723 
724 	dir = opendir(dirpath);
725 	if (dir == NULL)
726 		return -ENODEV;
727 
728 	while ((dent = readdir(dir)) != NULL) {
729 		char *name = dent->d_name;
730 		FILE *file;
731 		struct rte_ether_addr addr;
732 		char *mac = NULL;
733 
734 		if ((name[0] == '.') &&
735 		    ((name[1] == '\0') ||
736 		     ((name[1] == '.') && (name[2] == '\0'))))
737 			continue;
738 
739 		MANA_MKSTR(path, "%s/%s/address", dirpath, name);
740 
741 		file = fopen(path, "r");
742 		if (!file) {
743 			ret = -ENODEV;
744 			break;
745 		}
746 
747 		ret = fscanf(file, "%ms", &mac);
748 		fclose(file);
749 
750 		if (ret <= 0) {
751 			ret = -EINVAL;
752 			break;
753 		}
754 
755 		ret = rte_ether_unformat_addr(mac, &addr);
756 		free(mac);
757 		if (ret)
758 			break;
759 
760 		if (rte_is_same_ether_addr(&addr, priv->dev_data->mac_addrs)) {
761 			strlcpy(*ifname, name, sizeof(*ifname));
762 			ret = 0;
763 			break;
764 		}
765 	}
766 
767 	closedir(dir);
768 	return ret;
769 }
770 
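/* Run a network-device ioctl (e.g. SIOCSIFMTU) against the kernel netdev. */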
771 static int
772 mana_ifreq(const struct mana_priv *priv, int req, struct ifreq *ifr)
773 {
774 	int sock, ret;
775 
776 	sock = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
777 	if (sock == -1)
778 		return -errno;
779 
780 	ret = mana_get_ifname(priv, &ifr->ifr_name);
781 	if (ret) {
782 		close(sock);
783 		return ret;
784 	}
785 
786 	if (ioctl(sock, req, ifr) == -1)
787 		ret = -errno;
788 
789 	close(sock);
790 
791 	return ret;
792 }
793 
794 static int
795 mana_mtu_set(struct rte_eth_dev *dev, uint16_t mtu)
796 {
797 	struct mana_priv *priv = dev->data->dev_private;
798 	struct ifreq request = { .ifr_mtu = mtu, };
799 
800 	return mana_ifreq(priv, SIOCSIFMTU, &request);
801 }
802 
803 static const struct eth_dev_ops mana_dev_ops = {
804 	.dev_configure		= mana_dev_configure,
805 	.dev_start		= mana_dev_start,
806 	.dev_stop		= mana_dev_stop,
807 	.dev_close		= mana_dev_close,
808 	.dev_infos_get		= mana_dev_info_get,
809 	.txq_info_get		= mana_dev_tx_queue_info,
810 	.rxq_info_get		= mana_dev_rx_queue_info,
811 	.dev_supported_ptypes_get = mana_supported_ptypes,
812 	.rss_hash_update	= mana_rss_hash_update,
813 	.rss_hash_conf_get	= mana_rss_hash_conf_get,
814 	.tx_queue_setup		= mana_dev_tx_queue_setup,
815 	.tx_queue_release	= mana_dev_tx_queue_release,
816 	.rx_queue_setup		= mana_dev_rx_queue_setup,
817 	.rx_queue_release	= mana_dev_rx_queue_release,
818 	.rx_queue_intr_enable	= mana_rx_intr_enable,
819 	.rx_queue_intr_disable	= mana_rx_intr_disable,
820 	.link_update		= mana_dev_link_update,
821 	.stats_get		= mana_dev_stats_get,
822 	.stats_reset		= mana_dev_stats_reset,
823 	.mtu_set		= mana_mtu_set,
824 };
825 
826 static const struct eth_dev_ops mana_dev_secondary_ops = {
827 	.stats_get = mana_dev_stats_get,
828 	.stats_reset = mana_dev_stats_reset,
829 	.dev_infos_get = mana_dev_info_get,
830 };
831 
832 uint16_t
833 mana_rx_burst_removed(void *dpdk_rxq __rte_unused,
834 		      struct rte_mbuf **pkts __rte_unused,
835 		      uint16_t pkts_n __rte_unused)
836 {
837 	rte_mb();
838 	return 0;
839 }
840 
841 uint16_t
842 mana_tx_burst_removed(void *dpdk_rxq __rte_unused,
843 		      struct rte_mbuf **pkts __rte_unused,
844 		      uint16_t pkts_n __rte_unused)
845 {
846 	rte_mb();
847 	return 0;
848 }
849 
850 #define ETH_MANA_MAC_ARG "mac"
851 static const char * const mana_init_args[] = {
852 	ETH_MANA_MAC_ARG,
853 	NULL,
854 };
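/*
 * Example devargs (illustrative): -a <pci_bdf>,mac=12:34:56:78:9a:bc
 * limits probing to the port with the matching MAC address; the "mac="
 * key may be repeated up to MAX_NUM_ADDRESS times.
 */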
855 
856 /* Support parsing up to 8 MAC addresses from the EAL command line */
857 #define MAX_NUM_ADDRESS 8
858 struct mana_conf {
859 	struct rte_ether_addr mac_array[MAX_NUM_ADDRESS];
860 	unsigned int index;
861 };
862 
863 static int
864 mana_arg_parse_callback(const char *key, const char *val, void *private)
865 {
866 	struct mana_conf *conf = (struct mana_conf *)private;
867 	int ret;
868 
869 	DRV_LOG(INFO, "key=%s value=%s index=%d", key, val, conf->index);
870 
871 	if (conf->index >= MAX_NUM_ADDRESS) {
872 		DRV_LOG(ERR, "Exceeding max MAC address");
873 		return 1;
874 	}
875 
876 	ret = rte_ether_unformat_addr(val, &conf->mac_array[conf->index]);
877 	if (ret) {
878 		DRV_LOG(ERR, "Invalid MAC address %s", val);
879 		return ret;
880 	}
881 
882 	conf->index++;
883 
884 	return 0;
885 }
886 
887 static int
888 mana_parse_args(struct rte_devargs *devargs, struct mana_conf *conf)
889 {
890 	struct rte_kvargs *kvlist;
891 	unsigned int arg_count;
892 	int ret = 0;
893 
894 	kvlist = rte_kvargs_parse(devargs->drv_str, mana_init_args);
895 	if (!kvlist) {
896 		DRV_LOG(ERR, "failed to parse kvargs args=%s", devargs->drv_str);
897 		return -EINVAL;
898 	}
899 
900 	arg_count = rte_kvargs_count(kvlist, mana_init_args[0]);
901 	if (arg_count > MAX_NUM_ADDRESS) {
902 		ret = -EINVAL;
903 		goto free_kvlist;
904 	}
905 	ret = rte_kvargs_process(kvlist, mana_init_args[0],
906 				 mana_arg_parse_callback, conf);
907 	if (ret) {
908 		DRV_LOG(ERR, "error parsing args");
909 		goto free_kvlist;
910 	}
911 
912 free_kvlist:
913 	rte_kvargs_free(kvlist);
914 	return ret;
915 }
916 
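/*
 * Look up the MAC address of the netdev backing the given IB port by
 * matching "dev_port" under <ibdev_path>/device/net/ in sysfs.
 */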
917 static int
918 get_port_mac(struct ibv_device *device, unsigned int port,
919 	     struct rte_ether_addr *addr)
920 {
921 	FILE *file;
922 	int ret = 0;
923 	DIR *dir;
924 	struct dirent *dent;
925 	unsigned int dev_port;
926 
927 	MANA_MKSTR(path, "%s/device/net", device->ibdev_path);
928 
929 	dir = opendir(path);
930 	if (!dir)
931 		return -ENOENT;
932 
933 	while ((dent = readdir(dir))) {
934 		char *name = dent->d_name;
935 		char *mac = NULL;
936 
937 		MANA_MKSTR(port_path, "%s/%s/dev_port", path, name);
938 
939 		/* Ignore . and .. */
940 		if ((name[0] == '.') &&
941 		    ((name[1] == '\0') ||
942 		     ((name[1] == '.') && (name[2] == '\0'))))
943 			continue;
944 
945 		file = fopen(port_path, "r");
946 		if (!file)
947 			continue;
948 
949 		ret = fscanf(file, "%u", &dev_port);
950 		fclose(file);
951 
952 		if (ret != 1)
953 			continue;
954 
955 		/* Ethernet ports start at 0, IB ports start at 1 */
956 		if (dev_port == port - 1) {
957 			MANA_MKSTR(address_path, "%s/%s/address", path, name);
958 
959 			file = fopen(address_path, "r");
960 			if (!file)
961 				continue;
962 
963 			ret = fscanf(file, "%ms", &mac);
964 			fclose(file);
965 
966 			if (ret < 0)
967 				break;
968 
969 			ret = rte_ether_unformat_addr(mac, addr);
970 			if (ret)
971 				DRV_LOG(ERR, "unrecognized mac addr %s", mac);
972 
973 			free(mac);
974 			break;
975 		}
976 	}
977 
978 	closedir(dir);
979 	return ret;
980 }
981 
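/* Derive the PCI address of an IB device from its sysfs uevent file. */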
982 static int
983 mana_ibv_device_to_pci_addr(const struct ibv_device *device,
984 			    struct rte_pci_addr *pci_addr)
985 {
986 	FILE *file;
987 	char *line = NULL;
988 	size_t len = 0;
989 
990 	MANA_MKSTR(path, "%s/device/uevent", device->ibdev_path);
991 
992 	file = fopen(path, "r");
993 	if (!file)
994 		return -errno;
995 
996 	while (getline(&line, &len, file) != -1) {
997 		/* Extract information. */
998 		if (sscanf(line,
999 			   "PCI_SLOT_NAME="
1000 			   "%" SCNx32 ":%" SCNx8 ":%" SCNx8 ".%" SCNx8 "\n",
1001 			   &pci_addr->domain,
1002 			   &pci_addr->bus,
1003 			   &pci_addr->devid,
1004 			   &pci_addr->function) == 4) {
1005 			break;
1006 		}
1007 	}
1008 
1009 	free(line);
1010 	fclose(file);
1011 	return 0;
1012 }
1013 
1014 /*
1015  * Interrupt handler from the IB layer to notify that this device is being removed.
1016  */
1017 static void
1018 mana_intr_handler(void *arg)
1019 {
1020 	struct mana_priv *priv = arg;
1021 	struct ibv_context *ctx = priv->ib_ctx;
1022 	struct ibv_async_event event;
1023 
1024 	/* Read and ack all messages from IB device */
1025 	while (true) {
1026 		if (ibv_get_async_event(ctx, &event))
1027 			break;
1028 
1029 		if (event.event_type == IBV_EVENT_DEVICE_FATAL) {
1030 			struct rte_eth_dev *dev;
1031 
1032 			dev = &rte_eth_devices[priv->port_id];
1033 			if (dev->data->dev_conf.intr_conf.rmv)
1034 				rte_eth_dev_callback_process(dev,
1035 					RTE_ETH_EVENT_INTR_RMV, NULL);
1036 		}
1037 
1038 		ibv_ack_async_event(&event);
1039 	}
1040 }
1041 
1042 static int
1043 mana_intr_uninstall(struct mana_priv *priv)
1044 {
1045 	int ret;
1046 
1047 	ret = rte_intr_callback_unregister(priv->intr_handle,
1048 					   mana_intr_handler, priv);
1049 	if (ret <= 0) {
1050 		DRV_LOG(ERR, "Failed to unregister intr callback ret %d", ret);
1051 		return ret;
1052 	}
1053 
1054 	rte_intr_instance_free(priv->intr_handle);
1055 
1056 	return 0;
1057 }
1058 
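/* Switch a file descriptor to non-blocking mode, keeping its other flags. */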
1059 int
1060 mana_fd_set_non_blocking(int fd)
1061 {
1062 	int ret = fcntl(fd, F_GETFL);
1063 
1064 	if (ret != -1 && !fcntl(fd, F_SETFL, ret | O_NONBLOCK))
1065 		return 0;
1066 
1067 	rte_errno = errno;
1068 	return -rte_errno;
1069 }
1070 
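/*
 * Register the IB async event fd with the EAL interrupt framework so that
 * fatal device events can be turned into RTE_ETH_EVENT_INTR_RMV callbacks.
 */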
1071 static int
1072 mana_intr_install(struct rte_eth_dev *eth_dev, struct mana_priv *priv)
1073 {
1074 	int ret;
1075 	struct ibv_context *ctx = priv->ib_ctx;
1076 
1077 	priv->intr_handle = rte_intr_instance_alloc(RTE_INTR_INSTANCE_F_SHARED);
1078 	if (!priv->intr_handle) {
1079 		DRV_LOG(ERR, "Failed to allocate intr_handle");
1080 		rte_errno = ENOMEM;
1081 		return -ENOMEM;
1082 	}
1083 
1084 	ret = rte_intr_fd_set(priv->intr_handle, -1);
1085 	if (ret)
1086 		goto free_intr;
1087 
1088 	ret = mana_fd_set_non_blocking(ctx->async_fd);
1089 	if (ret) {
1090 		DRV_LOG(ERR, "Failed to change async_fd to NONBLOCK");
1091 		goto free_intr;
1092 	}
1093 
1094 	ret = rte_intr_fd_set(priv->intr_handle, ctx->async_fd);
1095 	if (ret)
1096 		goto free_intr;
1097 
1098 	ret = rte_intr_type_set(priv->intr_handle, RTE_INTR_HANDLE_EXT);
1099 	if (ret)
1100 		goto free_intr;
1101 
1102 	ret = rte_intr_callback_register(priv->intr_handle,
1103 					 mana_intr_handler, priv);
1104 	if (ret) {
1105 		DRV_LOG(ERR, "Failed to register intr callback");
1106 		rte_intr_fd_set(priv->intr_handle, -1);
1107 		goto free_intr;
1108 	}
1109 
1110 	eth_dev->intr_handle = priv->intr_handle;
1111 	return 0;
1112 
1113 free_intr:
1114 	rte_intr_instance_free(priv->intr_handle);
1115 	priv->intr_handle = NULL;
1116 
1117 	return ret;
1118 }
1119 
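/* Allocate the per-process private data that holds the doorbell mapping. */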
1120 static int
1121 mana_proc_priv_init(struct rte_eth_dev *dev)
1122 {
1123 	struct mana_process_priv *priv;
1124 
1125 	priv = rte_zmalloc_socket("mana_proc_priv",
1126 				  sizeof(struct mana_process_priv),
1127 				  RTE_CACHE_LINE_SIZE,
1128 				  dev->device->numa_node);
1129 	if (!priv)
1130 		return -ENOMEM;
1131 
1132 	dev->process_private = priv;
1133 	return 0;
1134 }
1135 
1136 /*
1137  * Map the doorbell page for the secondary process through the IB device handle.
1138  */
1139 static int
1140 mana_map_doorbell_secondary(struct rte_eth_dev *eth_dev, int fd)
1141 {
1142 	struct mana_process_priv *priv = eth_dev->process_private;
1143 
1144 	void *addr;
1145 
1146 	addr = mmap(NULL, rte_mem_page_size(), PROT_WRITE, MAP_SHARED, fd, 0);
1147 	if (addr == MAP_FAILED) {
1148 		DRV_LOG(ERR, "Failed to map secondary doorbell port %u",
1149 			eth_dev->data->port_id);
1150 		return -ENOMEM;
1151 	}
1152 
1153 	DRV_LOG(INFO, "Secondary doorbell mapped to %p", addr);
1154 
1155 	priv->db_page = addr;
1156 
1157 	return 0;
1158 }
1159 
1160 /* Initialize shared data for the driver (all devices) */
1161 static int
1162 mana_init_shared_data(void)
1163 {
1164 	int ret =  0;
1165 	const struct rte_memzone *secondary_mz;
1166 
1167 	rte_spinlock_lock(&mana_shared_data_lock);
1168 
1169 	/* Skip if shared data is already initialized */
1170 	if (mana_shared_data)
1171 		goto exit;
1172 
1173 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1174 		mana_shared_mz = rte_memzone_reserve(MZ_MANA_SHARED_DATA,
1175 						     sizeof(*mana_shared_data),
1176 						     SOCKET_ID_ANY, 0);
1177 		if (!mana_shared_mz) {
1178 			DRV_LOG(ERR, "Cannot allocate mana shared data");
1179 			ret = -rte_errno;
1180 			goto exit;
1181 		}
1182 
1183 		mana_shared_data = mana_shared_mz->addr;
1184 		memset(mana_shared_data, 0, sizeof(*mana_shared_data));
1185 		rte_spinlock_init(&mana_shared_data->lock);
1186 	} else {
1187 		secondary_mz = rte_memzone_lookup(MZ_MANA_SHARED_DATA);
1188 		if (!secondary_mz) {
1189 			DRV_LOG(ERR, "Cannot attach mana shared data");
1190 			ret = -rte_errno;
1191 			goto exit;
1192 		}
1193 
1194 		mana_shared_data = secondary_mz->addr;
1195 		memset(&mana_local_data, 0, sizeof(mana_local_data));
1196 	}
1197 
1198 exit:
1199 	rte_spinlock_unlock(&mana_shared_data_lock);
1200 
1201 	return ret;
1202 }
1203 
1204 /*
1205  * Init the data structures for use in primary and secondary processes.
1206  */
1207 static int
1208 mana_init_once(void)
1209 {
1210 	int ret;
1211 
1212 	ret = mana_init_shared_data();
1213 	if (ret)
1214 		return ret;
1215 
1216 	rte_spinlock_lock(&mana_shared_data->lock);
1217 
1218 	switch (rte_eal_process_type()) {
1219 	case RTE_PROC_PRIMARY:
1220 		if (mana_shared_data->init_done)
1221 			break;
1222 
1223 		ret = mana_mp_init_primary();
1224 		if (ret)
1225 			break;
1226 		DRV_LOG(DEBUG, "MP INIT PRIMARY");
1227 
1228 		mana_shared_data->init_done = 1;
1229 		break;
1230 
1231 	case RTE_PROC_SECONDARY:
1232 
1233 		if (mana_local_data.init_done)
1234 			break;
1235 
1236 		ret = mana_mp_init_secondary();
1237 		if (ret)
1238 			break;
1239 
1240 		DRV_LOG(DEBUG, "MP INIT SECONDARY");
1241 
1242 		mana_local_data.init_done = 1;
1243 		break;
1244 
1245 	default:
1246 		/* Impossible, internal error */
1247 		ret = -EPROTO;
1248 		break;
1249 	}
1250 
1251 	rte_spinlock_unlock(&mana_shared_data->lock);
1252 
1253 	return ret;
1254 }
1255 
1256 /*
1257  * Probe an IB port
1258  * Return value:
1259  * 0: port successfully probed
1261  * negative value: error code
1262  */
1263 static int
1264 mana_probe_port(struct ibv_device *ibdev, struct ibv_device_attr_ex *dev_attr,
1265 		uint8_t port, struct rte_pci_device *pci_dev, struct rte_ether_addr *addr)
1266 {
1267 	struct mana_priv *priv = NULL;
1268 	struct rte_eth_dev *eth_dev = NULL;
1269 	struct ibv_parent_domain_init_attr attr = {0};
1270 	char address[64];
1271 	char name[RTE_ETH_NAME_MAX_LEN];
1272 	int ret;
1273 	struct ibv_context *ctx = NULL;
1274 
1275 	rte_ether_format_addr(address, sizeof(address), addr);
1276 	DRV_LOG(INFO, "device located port %u address %s", port, address);
1277 
1278 	priv = rte_zmalloc_socket(NULL, sizeof(*priv), RTE_CACHE_LINE_SIZE,
1279 				  SOCKET_ID_ANY);
1280 	if (!priv)
1281 		return -ENOMEM;
1282 
1283 	snprintf(name, sizeof(name), "%s_port%d", pci_dev->device.name, port);
1284 
1285 	if (rte_eal_process_type() == RTE_PROC_SECONDARY) {
1286 		int fd;
1287 
1288 		eth_dev = rte_eth_dev_attach_secondary(name);
1289 		if (!eth_dev) {
1290 			DRV_LOG(ERR, "Can't attach to dev %s", name);
1291 			ret =  -ENOMEM;
1292 			goto failed;
1293 		}
1294 
1295 		eth_dev->device = &pci_dev->device;
1296 		eth_dev->dev_ops = &mana_dev_secondary_ops;
1297 		ret = mana_proc_priv_init(eth_dev);
1298 		if (ret)
1299 			goto failed;
1300 		priv->process_priv = eth_dev->process_private;
1301 
1302 		/* Get the IB FD from the primary process */
1303 		fd = mana_mp_req_verbs_cmd_fd(eth_dev);
1304 		if (fd < 0) {
1305 			DRV_LOG(ERR, "Failed to get FD %d", fd);
1306 			ret = -ENODEV;
1307 			goto failed;
1308 		}
1309 
1310 		ret = mana_map_doorbell_secondary(eth_dev, fd);
1311 		if (ret) {
1312 			DRV_LOG(ERR, "Failed secondary map %d", fd);
1313 			goto failed;
1314 		}
1315 
1316 		/* fd is not used after mapping the doorbell */
1317 		close(fd);
1318 
1319 		eth_dev->tx_pkt_burst = mana_tx_burst;
1320 		eth_dev->rx_pkt_burst = mana_rx_burst;
1321 
1322 		rte_spinlock_lock(&mana_shared_data->lock);
1323 		mana_shared_data->secondary_cnt++;
1324 		mana_local_data.secondary_cnt++;
1325 		rte_spinlock_unlock(&mana_shared_data->lock);
1326 
1327 		rte_eth_copy_pci_info(eth_dev, pci_dev);
1328 		rte_eth_dev_probing_finish(eth_dev);
1329 
1330 		return 0;
1331 	}
1332 
1333 	ctx = ibv_open_device(ibdev);
1334 	if (!ctx) {
1335 		DRV_LOG(ERR, "Failed to open IB device %s", ibdev->name);
1336 		ret = -ENODEV;
1337 		goto failed;
1338 	}
1339 
1340 	eth_dev = rte_eth_dev_allocate(name);
1341 	if (!eth_dev) {
1342 		ret = -ENOMEM;
1343 		goto failed;
1344 	}
1345 
1346 	eth_dev->data->mac_addrs =
1347 		rte_calloc("mana_mac", 1,
1348 			   sizeof(struct rte_ether_addr), 0);
1349 	if (!eth_dev->data->mac_addrs) {
1350 		ret = -ENOMEM;
1351 		goto failed;
1352 	}
1353 
1354 	rte_ether_addr_copy(addr, eth_dev->data->mac_addrs);
1355 
1356 	priv->ib_pd = ibv_alloc_pd(ctx);
1357 	if (!priv->ib_pd) {
1358 		DRV_LOG(ERR, "ibv_alloc_pd failed port %d", port);
1359 		ret = -ENOMEM;
1360 		goto failed;
1361 	}
1362 
1363 	/* Create a parent domain with the port number */
1364 	attr.pd = priv->ib_pd;
1365 	attr.comp_mask = IBV_PARENT_DOMAIN_INIT_ATTR_PD_CONTEXT;
1366 	attr.pd_context = (void *)(uintptr_t)port;
1367 	priv->ib_parent_pd = ibv_alloc_parent_domain(ctx, &attr);
1368 	if (!priv->ib_parent_pd) {
1369 		DRV_LOG(ERR, "ibv_alloc_parent_domain failed port %d", port);
1370 		ret = -ENOMEM;
1371 		goto failed;
1372 	}
1373 
1374 	priv->ib_ctx = ctx;
1375 	priv->port_id = eth_dev->data->port_id;
1376 	priv->dev_port = port;
1377 	eth_dev->data->dev_private = priv;
1378 	priv->dev_data = eth_dev->data;
1379 
1380 	priv->max_rx_queues = dev_attr->orig_attr.max_qp;
1381 	priv->max_tx_queues = dev_attr->orig_attr.max_qp;
1382 
1383 	priv->max_rx_desc =
1384 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1385 			dev_attr->orig_attr.max_cqe);
1386 	priv->max_tx_desc =
1387 		RTE_MIN(dev_attr->orig_attr.max_qp_wr,
1388 			dev_attr->orig_attr.max_cqe);
1389 
1390 	priv->max_send_sge = dev_attr->orig_attr.max_sge;
1391 	priv->max_recv_sge = dev_attr->orig_attr.max_sge;
1392 
1393 	priv->max_mr = dev_attr->orig_attr.max_mr;
1394 	priv->max_mr_size = dev_attr->orig_attr.max_mr_size;
1395 
1396 	DRV_LOG(INFO, "dev %s max queues %d desc %d sge %d mr %" PRIu64,
1397 		name, priv->max_rx_queues, priv->max_rx_desc,
1398 		priv->max_send_sge, priv->max_mr_size);
1399 
1400 	rte_eth_copy_pci_info(eth_dev, pci_dev);
1401 
1402 	/* Create async interrupt handler */
1403 	ret = mana_intr_install(eth_dev, priv);
1404 	if (ret) {
1405 		DRV_LOG(ERR, "Failed to install intr handler");
1406 		goto failed;
1407 	}
1408 
1409 	rte_spinlock_lock(&mana_shared_data->lock);
1410 	mana_shared_data->primary_cnt++;
1411 	rte_spinlock_unlock(&mana_shared_data->lock);
1412 
1413 	eth_dev->device = &pci_dev->device;
1414 
1415 	DRV_LOG(INFO, "device %s at port %u", name, eth_dev->data->port_id);
1416 
1417 	eth_dev->rx_pkt_burst = mana_rx_burst_removed;
1418 	eth_dev->tx_pkt_burst = mana_tx_burst_removed;
1419 	eth_dev->dev_ops = &mana_dev_ops;
1420 
1421 	rte_eth_dev_probing_finish(eth_dev);
1422 
1423 	return 0;
1424 
1425 failed:
1426 	/* Free the resources allocated for the failed port */
1427 	if (priv) {
1428 		if (priv->ib_parent_pd)
1429 			ibv_dealloc_pd(priv->ib_parent_pd);
1430 
1431 		if (priv->ib_pd)
1432 			ibv_dealloc_pd(priv->ib_pd);
1433 	}
1434 
1435 	if (eth_dev)
1436 		rte_eth_dev_release_port(eth_dev);
1437 
1438 	rte_free(priv);
1439 
1440 	if (ctx)
1441 		ibv_close_device(ctx);
1442 
1443 	return ret;
1444 }
1445 
1446 /*
1447  * Goes through the IB device list to look for the IB port matching the
1448  * mac_addr. If found, create a rte_eth_dev for it.
1449  * Return value: number of successfully probed devices
1450  */
1451 static int
1452 mana_pci_probe_mac(struct rte_pci_device *pci_dev,
1453 		   struct rte_ether_addr *mac_addr)
1454 {
1455 	struct ibv_device **ibv_list;
1456 	int ibv_idx;
1457 	struct ibv_context *ctx;
1458 	int num_devices;
1459 	int ret;
1460 	uint8_t port;
1461 	int count = 0;
1462 
1463 	ibv_list = ibv_get_device_list(&num_devices);
1464 	for (ibv_idx = 0; ibv_idx < num_devices; ibv_idx++) {
1465 		struct ibv_device *ibdev = ibv_list[ibv_idx];
1466 		struct rte_pci_addr pci_addr;
1467 		struct ibv_device_attr_ex dev_attr;
1468 
1469 		DRV_LOG(INFO, "Probe device name %s dev_name %s ibdev_path %s",
1470 			ibdev->name, ibdev->dev_name, ibdev->ibdev_path);
1471 
1472 		if (mana_ibv_device_to_pci_addr(ibdev, &pci_addr))
1473 			continue;
1474 
1475 		/* Ignore if this IB device is not this PCI device */
1476 		if (rte_pci_addr_cmp(&pci_dev->addr, &pci_addr) != 0)
1477 			continue;
1478 
1479 		ctx = ibv_open_device(ibdev);
1480 		if (!ctx) {
1481 			DRV_LOG(ERR, "Failed to open IB device %s",
1482 				ibdev->name);
1483 			continue;
1484 		}
1485 		ret = ibv_query_device_ex(ctx, NULL, &dev_attr);
1486 		ibv_close_device(ctx);
1487 
1488 		if (ret) {
1489 			DRV_LOG(ERR, "Failed to query IB device %s",
1490 				ibdev->name);
1491 			continue;
1492 		}
1493 
1494 		for (port = 1; port <= dev_attr.orig_attr.phys_port_cnt;
1495 		     port++) {
1496 			struct rte_ether_addr addr;
1497 			ret = get_port_mac(ibdev, port, &addr);
1498 			if (ret)
1499 				continue;
1500 
1501 			if (mac_addr && !rte_is_same_ether_addr(&addr, mac_addr))
1502 				continue;
1503 
1504 			ret = mana_probe_port(ibdev, &dev_attr, port, pci_dev, &addr);
1505 			if (ret) {
1506 				DRV_LOG(ERR, "Probe on IB port %u failed %d", port, ret);
1507 			} else {
1508 				count++;
1509 				DRV_LOG(INFO, "Successfully probed on IB port %u", port);
1510 			}
1511 		}
1512 	}
1513 
1514 	ibv_free_device_list(ibv_list);
1515 	return count;
1516 }
1517 
1518 /*
1519  * Main callback function from PCI bus to probe a device.
1520  */
1521 static int
1522 mana_pci_probe(struct rte_pci_driver *pci_drv __rte_unused,
1523 	       struct rte_pci_device *pci_dev)
1524 {
1525 	struct rte_devargs *args = pci_dev->device.devargs;
1526 	struct mana_conf conf = {0};
1527 	unsigned int i;
1528 	int ret;
1529 	int count = 0;
1530 
1531 	if (args && args->drv_str) {
1532 		ret = mana_parse_args(args, &conf);
1533 		if (ret) {
1534 			DRV_LOG(ERR, "Failed to parse parameters args = %s",
1535 				args->drv_str);
1536 			return ret;
1537 		}
1538 	}
1539 
1540 	ret = mana_init_once();
1541 	if (ret) {
1542 		DRV_LOG(ERR, "Failed to init PMD global data %d", ret);
1543 		return ret;
1544 	}
1545 
1546 	/* If there are no driver parameters, probe on all ports */
1547 	if (conf.index) {
1548 		for (i = 0; i < conf.index; i++)
1549 			count += mana_pci_probe_mac(pci_dev,
1550 						    &conf.mac_array[i]);
1551 	} else {
1552 		count = mana_pci_probe_mac(pci_dev, NULL);
1553 	}
1554 
1555 	if (!count) {
1556 		rte_memzone_free(mana_shared_mz);
1557 		mana_shared_mz = NULL;
1558 		ret = -ENODEV;
1559 	}
1560 
1561 	return ret;
1562 }
1563 
1564 static int
1565 mana_dev_uninit(struct rte_eth_dev *dev)
1566 {
1567 	return mana_dev_close(dev);
1568 }
1569 
1570 /*
1571  * Callback from PCI to remove this device.
1572  */
1573 static int
1574 mana_pci_remove(struct rte_pci_device *pci_dev)
1575 {
1576 	if (rte_eal_process_type() == RTE_PROC_PRIMARY) {
1577 		rte_spinlock_lock(&mana_shared_data_lock);
1578 
1579 		rte_spinlock_lock(&mana_shared_data->lock);
1580 
1581 		RTE_VERIFY(mana_shared_data->primary_cnt > 0);
1582 		mana_shared_data->primary_cnt--;
1583 		if (!mana_shared_data->primary_cnt) {
1584 			DRV_LOG(DEBUG, "mp uninit primary");
1585 			mana_mp_uninit_primary();
1586 		}
1587 
1588 		rte_spinlock_unlock(&mana_shared_data->lock);
1589 
1590 		/* Also free the shared memory if this is the last primary process */
1591 		if (!mana_shared_data->primary_cnt) {
1592 			DRV_LOG(DEBUG, "free shared memzone data");
1593 			rte_memzone_free(mana_shared_mz);
1594 			mana_shared_mz = NULL;
1595 		}
1596 
1597 		rte_spinlock_unlock(&mana_shared_data_lock);
1598 	} else {
1599 		rte_spinlock_lock(&mana_shared_data_lock);
1600 
1601 		rte_spinlock_lock(&mana_shared_data->lock);
1602 		RTE_VERIFY(mana_shared_data->secondary_cnt > 0);
1603 		mana_shared_data->secondary_cnt--;
1604 		rte_spinlock_unlock(&mana_shared_data->lock);
1605 
1606 		RTE_VERIFY(mana_local_data.secondary_cnt > 0);
1607 		mana_local_data.secondary_cnt--;
1608 		if (!mana_local_data.secondary_cnt) {
1609 			DRV_LOG(DEBUG, "mp uninit secondary");
1610 			mana_mp_uninit_secondary();
1611 		}
1612 
1613 		rte_spinlock_unlock(&mana_shared_data_lock);
1614 	}
1615 
1616 	return rte_eth_dev_pci_generic_remove(pci_dev, mana_dev_uninit);
1617 }
1618 
1619 static const struct rte_pci_id mana_pci_id_map[] = {
1620 	{
1621 		RTE_PCI_DEVICE(PCI_VENDOR_ID_MICROSOFT,
1622 			       PCI_DEVICE_ID_MICROSOFT_MANA)
1623 	},
1624 	{
1625 		.vendor_id = 0
1626 	},
1627 };
1628 
1629 static struct rte_pci_driver mana_pci_driver = {
1630 	.id_table = mana_pci_id_map,
1631 	.probe = mana_pci_probe,
1632 	.remove = mana_pci_remove,
1633 	.drv_flags = RTE_PCI_DRV_INTR_RMV,
1634 };
1635 
1636 RTE_PMD_REGISTER_PCI(net_mana, mana_pci_driver);
1637 RTE_PMD_REGISTER_PCI_TABLE(net_mana, mana_pci_id_map);
1638 RTE_PMD_REGISTER_KMOD_DEP(net_mana, "* ib_uverbs & mana_ib");
1639 RTE_LOG_REGISTER_SUFFIX(mana_logtype_init, init, NOTICE);
1640 RTE_LOG_REGISTER_SUFFIX(mana_logtype_driver, driver, NOTICE);
1641 RTE_PMD_REGISTER_PARAM_STRING(net_mana, ETH_MANA_MAC_ARG "=<mac_addr>");
1642