/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright (c) 2023 Corigine, Inc.
 * All rights reserved.
 */

#include <errno.h>
#include <pthread.h>
#include <sys/epoll.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <unistd.h>

#include <nfp_common_pci.h>
#include <nfp_dev.h>
#include <rte_vfio.h>
#include <rte_eal_paging.h>
#include <rte_malloc.h>
#include <vdpa_driver.h>

#include "nfp_vdpa_core.h"
#include "nfp_vdpa_log.h"

#define NFP_VDPA_DRIVER_NAME nfp_vdpa

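/*
 * Buffer for a VFIO_DEVICE_SET_IRQS request, sized for the worst case:
 * one device/config vector plus one eventfd per vring (two vrings, Rx
 * and Tx, per queue pair).
 */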
#define MSIX_IRQ_SET_BUF_LEN (sizeof(struct vfio_irq_set) + \
		sizeof(int) * (NFP_VDPA_MAX_QUEUES * 2 + 1))

#define NFP_VDPA_USED_RING_LEN(size) \
		((size) * sizeof(struct vring_used_elem) + sizeof(struct vring_used))

#define EPOLL_DATA_INTR        1

struct nfp_vdpa_dev {
	struct rte_pci_device *pci_dev;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_hw hw;

	int vfio_container_fd;
	int vfio_group_fd;
	int vfio_dev_fd;
	int iommu_group;

	rte_thread_t tid;    /**< Thread for notify relay */
	int epoll_fd;

	int vid;
	uint16_t max_queues;
	RTE_ATOMIC(uint32_t) started;
	RTE_ATOMIC(uint32_t) dev_attached;
	RTE_ATOMIC(uint32_t) running;
	rte_spinlock_t lock;

	/** Eventfd for used ring interrupt */
	int intr_fd[NFP_VDPA_MAX_QUEUES * 2];
};

struct nfp_vdpa_dev_node {
	TAILQ_ENTRY(nfp_vdpa_dev_node) next;
	struct nfp_vdpa_dev *device;
};

TAILQ_HEAD(vdpa_dev_list_head, nfp_vdpa_dev_node);

static struct vdpa_dev_list_head vdpa_dev_list =
	TAILQ_HEAD_INITIALIZER(vdpa_dev_list);

static pthread_mutex_t vdpa_list_lock = PTHREAD_MUTEX_INITIALIZER;

static struct nfp_vdpa_dev_node *
nfp_vdpa_find_node_by_vdev(struct rte_vdpa_device *vdev)
{
	bool found = false;
	struct nfp_vdpa_dev_node *node;

	pthread_mutex_lock(&vdpa_list_lock);

	TAILQ_FOREACH(node, &vdpa_dev_list, next) {
		if (vdev == node->device->vdev) {
			found = true;
			break;
		}
	}

	pthread_mutex_unlock(&vdpa_list_lock);

	if (found)
		return node;

	return NULL;
}

static struct nfp_vdpa_dev_node *
nfp_vdpa_find_node_by_pdev(struct rte_pci_device *pdev)
{
	bool found = false;
	struct nfp_vdpa_dev_node *node;

	pthread_mutex_lock(&vdpa_list_lock);

	TAILQ_FOREACH(node, &vdpa_dev_list, next) {
		if (pdev == node->device->pci_dev) {
			found = true;
			break;
		}
	}

	pthread_mutex_unlock(&vdpa_list_lock);

	if (found)
		return node;

	return NULL;
}

static int
nfp_vdpa_vfio_setup(struct nfp_vdpa_dev *device)
{
	int ret;
	char dev_name[RTE_DEV_NAME_MAX_LEN] = {0};
	struct rte_pci_device *pci_dev = device->pci_dev;

	rte_pci_unmap_device(pci_dev);

	rte_pci_device_name(&pci_dev->addr, dev_name, RTE_DEV_NAME_MAX_LEN);
	ret = rte_vfio_get_group_num(rte_pci_get_sysfs_path(), dev_name,
			&device->iommu_group);
	if (ret <= 0)
		return -1;

	device->vfio_container_fd = rte_vfio_container_create();
	if (device->vfio_container_fd < 0)
		return -1;

	device->vfio_group_fd = rte_vfio_container_group_bind(
			device->vfio_container_fd, device->iommu_group);
	if (device->vfio_group_fd < 0)
		goto container_destroy;

	DRV_VDPA_LOG(DEBUG, "The container_fd=%d, group_fd=%d.",
			device->vfio_container_fd, device->vfio_group_fd);

	ret = rte_pci_map_device(pci_dev);
	if (ret != 0)
		goto group_unbind;

	device->vfio_dev_fd = rte_intr_dev_fd_get(pci_dev->intr_handle);

	return 0;

group_unbind:
	rte_vfio_container_group_unbind(device->vfio_container_fd, device->iommu_group);
container_destroy:
	rte_vfio_container_destroy(device->vfio_container_fd);

	return -1;
}

static void
nfp_vdpa_vfio_teardown(struct nfp_vdpa_dev *device)
{
	rte_pci_unmap_device(device->pci_dev);
	rte_vfio_container_group_unbind(device->vfio_container_fd, device->iommu_group);
	rte_vfio_container_destroy(device->vfio_container_fd);
}

static int
nfp_vdpa_dma_do_unmap(struct rte_vhost_memory *mem,
		uint32_t times,
		int vfio_container_fd)
{
	uint32_t i;
	int ret = 0;
	struct rte_vhost_mem_region *region;

	for (i = 0; i < times; i++) {
		region = &mem->regions[i];

		ret = rte_vfio_container_dma_unmap(vfio_container_fd,
				region->host_user_addr, region->guest_phys_addr,
				region->size);
		if (ret < 0) {
			/* Do not return on error; keep unmapping the remaining regions. */
			DRV_VDPA_LOG(ERR, "DMA unmap failed for region %u.", i);
		}
	}

	return ret;
}

static int
nfp_vdpa_dma_do_map(struct rte_vhost_memory *mem,
		uint32_t times,
		int vfio_container_fd)
{
	int ret;
	uint32_t i;
	struct rte_vhost_mem_region *region;

	for (i = 0; i < times; i++) {
		region = &mem->regions[i];

		ret = rte_vfio_container_dma_map(vfio_container_fd,
				region->host_user_addr, region->guest_phys_addr,
				region->size);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "DMA map failed.");
			nfp_vdpa_dma_do_unmap(mem, i, vfio_container_fd);
			return ret;
		}
	}

	return 0;
}

static int
nfp_vdpa_dma_map(struct nfp_vdpa_dev *device,
		bool do_map)
{
	int ret;
	int vfio_container_fd;
	struct rte_vhost_memory *mem = NULL;

	ret = rte_vhost_get_mem_table(device->vid, &mem);
	if (ret < 0) {
		DRV_VDPA_LOG(ERR, "Failed to get memory layout.");
		return ret;
	}

	vfio_container_fd = device->vfio_container_fd;
	DRV_VDPA_LOG(DEBUG, "The vfio_container_fd %d.", vfio_container_fd);

	if (do_map)
		ret = nfp_vdpa_dma_do_map(mem, mem->nregions, vfio_container_fd);
	else
		ret = nfp_vdpa_dma_do_unmap(mem, mem->nregions, vfio_container_fd);

	free(mem);

	return ret;
}

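/*
 * Translate a queue virtual address (an address in the vhost process) to a
 * guest physical address by walking the guest memory regions. Returns 0
 * when the address falls outside every region.
 */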
static uint64_t
nfp_vdpa_qva_to_gpa(int vid,
		uint64_t qva)
{
	int ret;
	uint32_t i;
	uint64_t gpa = 0;
	struct rte_vhost_memory *mem = NULL;
	struct rte_vhost_mem_region *region;

	ret = rte_vhost_get_mem_table(vid, &mem);
	if (ret < 0) {
		DRV_VDPA_LOG(ERR, "Failed to get memory layout.");
		return gpa;
	}

	for (i = 0; i < mem->nregions; i++) {
		region = &mem->regions[i];

		if (qva >= region->host_user_addr &&
				qva < region->host_user_addr + region->size) {
			gpa = qva - region->host_user_addr + region->guest_phys_addr;
			break;
		}
	}

	free(mem);

	return gpa;
}

static void
nfp_vdpa_relay_vring_free(struct nfp_vdpa_dev *device,
		uint16_t vring_index)
{
	uint16_t i;
	uint64_t size;
	struct rte_vhost_vring vring;
	uint64_t m_vring_iova = NFP_VDPA_RELAY_VRING;

	for (i = 0; i < vring_index; i++) {
		rte_vhost_get_vhost_vring(device->vid, i, &vring);

		size = RTE_ALIGN_CEIL(vring_size(vring.size, rte_mem_page_size()),
				rte_mem_page_size());
		rte_vfio_container_dma_unmap(device->vfio_container_fd,
				(uint64_t)(uintptr_t)device->hw.m_vring[i].desc,
				m_vring_iova, size);

		rte_free(device->hw.m_vring[i].desc);
		m_vring_iova += size;
	}
}

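/*
 * Allocate one mediated (relay) vring per vhost vring in host memory and
 * DMA-map each of them into the device IOVA space at consecutive
 * page-aligned offsets starting from NFP_VDPA_RELAY_VRING. The relay
 * datapath points the device at these shadow rings instead of the guest's.
 */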
static int
nfp_vdpa_relay_vring_alloc(struct nfp_vdpa_dev *device)
{
	int ret;
	uint16_t i;
	uint64_t size;
	void *vring_buf;
	uint64_t page_size;
	struct rte_vhost_vring vring;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;
	uint64_t m_vring_iova = NFP_VDPA_RELAY_VRING;

	page_size = rte_mem_page_size();

	for (i = 0; i < vdpa_hw->nr_vring; i++) {
		rte_vhost_get_vhost_vring(device->vid, i, &vring);

		size = RTE_ALIGN_CEIL(vring_size(vring.size, page_size), page_size);
		vring_buf = rte_zmalloc("nfp_vdpa_relay", size, page_size);
		if (vring_buf == NULL)
			goto vring_free_all;

		vring_init(&vdpa_hw->m_vring[i], vring.size, vring_buf, page_size);

		ret = rte_vfio_container_dma_map(device->vfio_container_fd,
				(uint64_t)(uintptr_t)vring_buf, m_vring_iova, size);
		if (ret != 0) {
			DRV_VDPA_LOG(ERR, "vDPA vring relay DMA map failed.");
			goto vring_free_one;
		}

		m_vring_iova += size;
	}

	return 0;

vring_free_one:
	rte_free(device->hw.m_vring[i].desc);
vring_free_all:
	nfp_vdpa_relay_vring_free(device, i);

	return -ENOSPC;
}

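/*
 * Collect the negotiated features and per-vring addresses, translate the
 * ring QVAs to guest physical addresses, and program them into the
 * hardware. In relay mode the used ring of every even (Rx) vring is
 * redirected to the mediated vring IOVA so the driver can intercept
 * used-ring updates.
 */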
static int
nfp_vdpa_start(struct nfp_vdpa_dev *device,
		bool relay)
{
	int ret;
	int vid;
	uint16_t i;
	uint64_t gpa;
	uint16_t size;
	struct rte_vhost_vring vring;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;
	uint64_t m_vring_iova = NFP_VDPA_RELAY_VRING;

	vid = device->vid;
	vdpa_hw->nr_vring = rte_vhost_get_vring_num(vid);

	ret = rte_vhost_get_negotiated_features(vid, &vdpa_hw->req_features);
	if (ret != 0)
		return ret;

	if (relay) {
		ret = nfp_vdpa_relay_vring_alloc(device);
		if (ret != 0)
			return ret;
	}

	for (i = 0; i < vdpa_hw->nr_vring; i++) {
		ret = rte_vhost_get_vhost_vring(vid, i, &vring);
		if (ret != 0)
			goto relay_vring_free;

		gpa = nfp_vdpa_qva_to_gpa(vid, (uint64_t)(uintptr_t)vring.desc);
		if (gpa == 0) {
			DRV_VDPA_LOG(ERR, "Failed to get GPA for descriptor ring.");
			goto relay_vring_free;
		}

		vdpa_hw->vring[i].desc = gpa;

		gpa = nfp_vdpa_qva_to_gpa(vid, (uint64_t)(uintptr_t)vring.avail);
		if (gpa == 0) {
			DRV_VDPA_LOG(ERR, "Failed to get GPA for available ring.");
			goto relay_vring_free;
		}

		vdpa_hw->vring[i].avail = gpa;

		/* Direct I/O for Tx queue, relay for Rx queue */
		if (relay && ((i & 1) == 0)) {
			vdpa_hw->vring[i].used = m_vring_iova +
					(char *)vdpa_hw->m_vring[i].used -
					(char *)vdpa_hw->m_vring[i].desc;

			ret = rte_vhost_get_vring_base(vid, i,
					&vdpa_hw->m_vring[i].avail->idx,
					&vdpa_hw->m_vring[i].used->idx);
			if (ret != 0)
				goto relay_vring_free;
		} else {
			gpa = nfp_vdpa_qva_to_gpa(vid, (uint64_t)(uintptr_t)vring.used);
			if (gpa == 0) {
				DRV_VDPA_LOG(ERR, "Failed to get GPA for used ring.");
				goto relay_vring_free;
			}

			vdpa_hw->vring[i].used = gpa;
		}

		vdpa_hw->vring[i].size = vring.size;

		if (relay) {
			size = RTE_ALIGN_CEIL(vring_size(vring.size,
					rte_mem_page_size()), rte_mem_page_size());
			m_vring_iova += size;
		}

		ret = rte_vhost_get_vring_base(vid, i,
				&vdpa_hw->vring[i].last_avail_idx,
				&vdpa_hw->vring[i].last_used_idx);
		if (ret != 0)
			goto relay_vring_free;
	}

	if (relay)
		return nfp_vdpa_relay_hw_start(&device->hw, vid);
	else
		return nfp_vdpa_hw_start(&device->hw, vid);

relay_vring_free:
	if (relay)
		nfp_vdpa_relay_vring_free(device, vdpa_hw->nr_vring);

	return -EFAULT;
}

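/*
 * Copy new used-ring entries from the mediated vring back to the guest's
 * used ring (rte_vdpa_relay_vring_used also logs the dirty pages for live
 * migration), then signal the guest through the vring callfd.
 */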
static void
nfp_vdpa_update_used_ring(struct nfp_vdpa_dev *dev,
		uint16_t qid)
{
	rte_vdpa_relay_vring_used(dev->vid, qid, &dev->hw.m_vring[qid]);
	rte_vhost_vring_call(dev->vid, qid);
}

static void
nfp_vdpa_relay_stop(struct nfp_vdpa_dev *device)
{
	int vid;
	uint32_t i;
	uint64_t len;
	struct rte_vhost_vring vring;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;

	nfp_vdpa_hw_stop(vdpa_hw);

	vid = device->vid;
	for (i = 0; i < vdpa_hw->nr_vring; i++) {
		/* Synchronize remaining new used entries if any */
		if ((i & 1) == 0)
			nfp_vdpa_update_used_ring(device, i);

		rte_vhost_get_vhost_vring(vid, i, &vring);
		len = NFP_VDPA_USED_RING_LEN(vring.size);
		vdpa_hw->vring[i].last_avail_idx = vring.avail->idx;
		vdpa_hw->vring[i].last_used_idx = vring.used->idx;

		rte_vhost_set_vring_base(vid, i,
				vdpa_hw->vring[i].last_avail_idx,
				vdpa_hw->vring[i].last_used_idx);

		rte_vhost_log_used_vring(vid, i, 0, len);

		if (vring.used->idx != vring.avail->idx)
			rte_atomic_store_explicit(
					(unsigned short __rte_atomic *)&vring.used->idx,
					vring.avail->idx, rte_memory_order_release);
	}

	nfp_vdpa_relay_vring_free(device, vdpa_hw->nr_vring);
}

static void
nfp_vdpa_stop(struct nfp_vdpa_dev *device,
		bool relay)
{
	int vid;
	uint32_t i;
	struct nfp_vdpa_hw *vdpa_hw = &device->hw;

	nfp_vdpa_hw_stop(vdpa_hw);

	vid = device->vid;
	if (relay)
		nfp_vdpa_relay_stop(device);
	else
		for (i = 0; i < vdpa_hw->nr_vring; i++)
			rte_vhost_set_vring_base(vid, i,
					vdpa_hw->vring[i].last_avail_idx,
					vdpa_hw->vring[i].last_used_idx);
}

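/*
 * Route MSI-X vectors through VFIO: vector 0 carries the device/config
 * interrupt and each following vector one vring interrupt. With relay
 * enabled, the Rx (even) vectors are redirected to driver-owned eventfds
 * so the relay thread can update the guest used ring before raising the
 * guest interrupt itself.
 */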
static int
nfp_vdpa_enable_vfio_intr(struct nfp_vdpa_dev *device,
		bool relay)
{
	int fd;
	int ret;
	uint16_t i;
	int *fd_ptr;
	uint16_t nr_vring;
	struct vfio_irq_set *irq_set;
	struct rte_vhost_vring vring;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];

	nr_vring = rte_vhost_get_vring_num(device->vid);

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = nr_vring + 1;
	irq_set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	fd_ptr = (int *)&irq_set->data;
	fd_ptr[RTE_INTR_VEC_ZERO_OFFSET] = rte_intr_fd_get(device->pci_dev->intr_handle);

	for (i = 0; i < nr_vring; i++)
		device->intr_fd[i] = -1;

	for (i = 0; i < nr_vring; i++) {
		rte_vhost_get_vhost_vring(device->vid, i, &vring);
		fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = vring.callfd;
	}

	if (relay) {
		for (i = 0; i < nr_vring; i += 2) {
			fd = eventfd(0, EFD_NONBLOCK | EFD_CLOEXEC);
			if (fd < 0) {
				DRV_VDPA_LOG(ERR, "Can't setup eventfd.");
				return -EINVAL;
			}

			device->intr_fd[i] = fd;
			fd_ptr[RTE_INTR_VEC_RXTX_OFFSET + i] = fd;
		}
	}

	ret = ioctl(device->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Error enabling MSI-X interrupts.");
		return -EIO;
	}

	return 0;
}

static int
nfp_vdpa_disable_vfio_intr(struct nfp_vdpa_dev *device)
{
	int ret;
	struct vfio_irq_set *irq_set;
	char irq_set_buf[MSIX_IRQ_SET_BUF_LEN];

	irq_set = (struct vfio_irq_set *)irq_set_buf;
	irq_set->argsz = sizeof(irq_set_buf);
	irq_set->count = 0;
	irq_set->flags = VFIO_IRQ_SET_DATA_NONE | VFIO_IRQ_SET_ACTION_TRIGGER;
	irq_set->index = VFIO_PCI_MSIX_IRQ_INDEX;
	irq_set->start = 0;

	ret = ioctl(device->vfio_dev_fd, VFIO_DEVICE_SET_IRQS, irq_set);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Error disabling MSI-X interrupts.");
		return -EIO;
	}

	return 0;
}

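/*
 * Drain a kick eventfd. The read is retried on EINTR and returns once the
 * non-blocking counter has been consumed (EAGAIN/EWOULDBLOCK included).
 */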
static void
nfp_vdpa_read_kickfd(int kickfd)
{
	int bytes;
	uint64_t buf;

	for (;;) {
		bytes = read(kickfd, &buf, 8);
		if (bytes >= 0)
			break;

		if (errno != EINTR && errno != EWOULDBLOCK &&
				errno != EAGAIN) {
			DRV_VDPA_LOG(ERR, "Error reading kickfd.");
			break;
		}
	}
}

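/*
 * Register every vring kickfd with the relay epoll instance. The epoll
 * payload packs the queue id in the low 32 bits and the kickfd in the
 * high 32 bits of data.u64.
 */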
static int
nfp_vdpa_notify_epoll_ctl(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int ret;
	uint32_t qid;

	for (qid = 0; qid < queue_num; qid++) {
		struct epoll_event ev;
		struct rte_vhost_vring vring;

		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(device->vid, qid, &vring);
		ev.data.u64 = qid | (uint64_t)vring.kickfd << 32;
		ret = epoll_ctl(device->epoll_fd, EPOLL_CTL_ADD, vring.kickfd, &ev);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "Epoll add error for queue %u.", qid);
			return ret;
		}
	}

	return 0;
}

static int
nfp_vdpa_notify_epoll_wait(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int i;
	int fds;
	int kickfd;
	uint32_t qid;
	struct epoll_event events[NFP_VDPA_MAX_QUEUES * 2];

	for (;;) {
		fds = epoll_wait(device->epoll_fd, events, queue_num, -1);
		if (fds < 0) {
			if (errno == EINTR)
				continue;

			DRV_VDPA_LOG(ERR, "Epoll wait failed.");
			return -EACCES;
		}

		for (i = 0; i < fds; i++) {
			qid = events[i].data.u32;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);

			nfp_vdpa_read_kickfd(kickfd);
			nfp_vdpa_notify_queue(&device->hw, qid);
		}
	}

	return 0;
}

static uint32_t
nfp_vdpa_notify_relay(void *arg)
{
	int ret;
	int epoll_fd;
	uint32_t queue_num;
	struct nfp_vdpa_dev *device = arg;

	epoll_fd = epoll_create(NFP_VDPA_MAX_QUEUES * 2);
	if (epoll_fd < 0) {
		DRV_VDPA_LOG(ERR, "Failed to create epoll instance.");
		return 1;
	}

	device->epoll_fd = epoll_fd;

	queue_num = rte_vhost_get_vring_num(device->vid);

	ret = nfp_vdpa_notify_epoll_ctl(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	ret = nfp_vdpa_notify_epoll_wait(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	return 0;

notify_exit:
	close(device->epoll_fd);
	device->epoll_fd = -1;

	return 1;
}

static int
nfp_vdpa_setup_notify_relay(struct nfp_vdpa_dev *device)
{
	int ret;
	char name[RTE_THREAD_INTERNAL_NAME_SIZE];

	snprintf(name, sizeof(name), "nfp-noti%d", device->vid);
	ret = rte_thread_create_internal_control(&device->tid, name,
			nfp_vdpa_notify_relay, (void *)device);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Failed to create notify relay pthread.");
		return -1;
	}

	return 0;
}

static void
nfp_vdpa_unset_notify_relay(struct nfp_vdpa_dev *device)
{
	if (device->tid.opaque_id != 0) {
		pthread_cancel((pthread_t)device->tid.opaque_id);
		rte_thread_join(device->tid, NULL);
		device->tid.opaque_id = 0;
	}

	if (device->epoll_fd >= 0) {
		close(device->epoll_fd);
		device->epoll_fd = -1;
	}
}

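/*
 * Single place that reconciles the datapath with the started/dev_attached
 * flags: bring the datapath up (DMA map, MSI-X, HW start, notify relay)
 * when both flags are set and it is not yet running, and tear it down in
 * reverse order once either flag is cleared while running.
 */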
static int
update_datapath(struct nfp_vdpa_dev *device)
{
	int ret;

	rte_spinlock_lock(&device->lock);

	if ((rte_atomic_load_explicit(&device->running, rte_memory_order_relaxed) == 0) &&
			(rte_atomic_load_explicit(&device->started,
					rte_memory_order_relaxed) != 0) &&
			(rte_atomic_load_explicit(&device->dev_attached,
					rte_memory_order_relaxed) != 0)) {
		ret = nfp_vdpa_dma_map(device, true);
		if (ret != 0)
			goto unlock_exit;

		ret = nfp_vdpa_enable_vfio_intr(device, false);
		if (ret != 0)
			goto dma_map_rollback;

		ret = nfp_vdpa_start(device, false);
		if (ret != 0)
			goto disable_vfio_intr;

		ret = nfp_vdpa_setup_notify_relay(device);
		if (ret != 0)
			goto vdpa_stop;

		rte_atomic_store_explicit(&device->running, 1, rte_memory_order_relaxed);
	} else if ((rte_atomic_load_explicit(&device->running, rte_memory_order_relaxed) != 0) &&
			((rte_atomic_load_explicit(&device->started,
					rte_memory_order_relaxed) == 0) ||
			(rte_atomic_load_explicit(&device->dev_attached,
					rte_memory_order_relaxed) == 0))) {
		nfp_vdpa_unset_notify_relay(device);

		nfp_vdpa_stop(device, false);

		ret = nfp_vdpa_disable_vfio_intr(device);
		if (ret != 0)
			goto unlock_exit;

		ret = nfp_vdpa_dma_map(device, false);
		if (ret != 0)
			goto unlock_exit;

		rte_atomic_store_explicit(&device->running, 0, rte_memory_order_relaxed);
	}

	rte_spinlock_unlock(&device->lock);
	return 0;

vdpa_stop:
	nfp_vdpa_stop(device, false);
disable_vfio_intr:
	nfp_vdpa_disable_vfio_intr(device);
dma_map_rollback:
	nfp_vdpa_dma_map(device, false);
unlock_exit:
	rte_spinlock_unlock(&device->lock);
	return ret;
}

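/*
 * Relay-mode epoll registration. Here data.u64 packs the fd in the high
 * 32 bits and the queue id shifted left by one in the low bits, with bit 0
 * (EPOLL_DATA_INTR) distinguishing device interrupt eventfds from guest
 * kickfds.
 */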
static int
nfp_vdpa_vring_epoll_ctl(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int ret;
	uint32_t qid;
	struct epoll_event ev;
	struct rte_vhost_vring vring;

	for (qid = 0; qid < queue_num; qid++) {
		ev.events = EPOLLIN | EPOLLPRI;
		rte_vhost_get_vhost_vring(device->vid, qid, &vring);
		ev.data.u64 = qid << 1 | (uint64_t)vring.kickfd << 32;
		ret = epoll_ctl(device->epoll_fd, EPOLL_CTL_ADD, vring.kickfd, &ev);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "Epoll add error for queue %u.", qid);
			return ret;
		}
	}

	/* vDPA driver interrupt */
	for (qid = 0; qid < queue_num; qid += 2) {
		ev.events = EPOLLIN | EPOLLPRI;
		/* Leave a flag to mark it's for interrupt */
		ev.data.u64 = EPOLL_DATA_INTR | qid << 1 |
				(uint64_t)device->intr_fd[qid] << 32;
		ret = epoll_ctl(device->epoll_fd, EPOLL_CTL_ADD,
				device->intr_fd[qid], &ev);
		if (ret < 0) {
			DRV_VDPA_LOG(ERR, "Epoll add error for queue %u.", qid);
			return ret;
		}

		nfp_vdpa_update_used_ring(device, qid);
	}

	return 0;
}

static int
nfp_vdpa_vring_epoll_wait(uint32_t queue_num,
		struct nfp_vdpa_dev *device)
{
	int i;
	int fds;
	int kickfd;
	uint32_t qid;
	struct epoll_event events[NFP_VDPA_MAX_QUEUES * 2];

	for (;;) {
		fds = epoll_wait(device->epoll_fd, events, queue_num * 2, -1);
		if (fds < 0) {
			if (errno == EINTR)
				continue;

			DRV_VDPA_LOG(ERR, "Epoll wait failed.");
			return -EACCES;
		}

		for (i = 0; i < fds; i++) {
			qid = events[i].data.u32 >> 1;
			kickfd = (uint32_t)(events[i].data.u64 >> 32);

			nfp_vdpa_read_kickfd(kickfd);
			if ((events[i].data.u32 & EPOLL_DATA_INTR) != 0) {
				nfp_vdpa_update_used_ring(device, qid);
				nfp_vdpa_irq_unmask(&device->hw);
			} else {
				nfp_vdpa_notify_queue(&device->hw, qid);
			}
		}
	}

	return 0;
}

static uint32_t
nfp_vdpa_vring_relay(void *arg)
{
	int ret;
	int epoll_fd;
	uint16_t queue_id;
	uint32_t queue_num;
	struct nfp_vdpa_dev *device = arg;

	epoll_fd = epoll_create(NFP_VDPA_MAX_QUEUES * 2);
	if (epoll_fd < 0) {
		DRV_VDPA_LOG(ERR, "Failed to create epoll instance.");
		return 1;
	}

	device->epoll_fd = epoll_fd;

	queue_num = rte_vhost_get_vring_num(device->vid);

	ret = nfp_vdpa_vring_epoll_ctl(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	/* Start relay with a first kick */
	for (queue_id = 0; queue_id < queue_num; queue_id++)
		nfp_vdpa_notify_queue(&device->hw, queue_id);

	ret = nfp_vdpa_vring_epoll_wait(queue_num, device);
	if (ret != 0)
		goto notify_exit;

	return 0;

notify_exit:
	close(device->epoll_fd);
	device->epoll_fd = -1;

	return 1;
}

static int
nfp_vdpa_setup_vring_relay(struct nfp_vdpa_dev *device)
{
	int ret;
	char name[RTE_THREAD_INTERNAL_NAME_SIZE];

	snprintf(name, sizeof(name), "nfp_vring%d", device->vid);
	ret = rte_thread_create_internal_control(&device->tid, name,
			nfp_vdpa_vring_relay, (void *)device);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Failed to create vring relay pthread.");
		return -EPERM;
	}

	return 0;
}

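/*
 * Switch from the direct hardware datapath to the software relay datapath,
 * which mediates used-ring updates so dirty pages can be logged during
 * live migration.
 */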
static int
nfp_vdpa_sw_fallback(struct nfp_vdpa_dev *device)
{
	int ret;
	int vid = device->vid;

	/* Stop the direct IO data path */
	nfp_vdpa_unset_notify_relay(device);
	nfp_vdpa_disable_vfio_intr(device);

	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, false);
	if ((ret != 0) && (ret != -ENOTSUP)) {
		DRV_VDPA_LOG(ERR, "Failed to unset the host notifier.");
		goto error;
	}

	/* Setup interrupt for vring relay */
	ret = nfp_vdpa_enable_vfio_intr(device, true);
	if (ret != 0)
		goto error;

	/* Config the VF */
	ret = nfp_vdpa_start(device, true);
	if (ret != 0)
		goto unset_intr;

	/* Setup vring relay thread */
	ret = nfp_vdpa_setup_vring_relay(device);
	if (ret != 0)
		goto stop_vf;

	device->hw.sw_fallback_running = true;

	return 0;

stop_vf:
	nfp_vdpa_stop(device, true);
unset_intr:
	nfp_vdpa_disable_vfio_intr(device);
error:
	return ret;
}

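/*
 * The vhost dev_conf callback: attach the device to the vhost session and
 * kick the datapath. If the host notifier cannot be installed, vhost falls
 * back to relaying queue notifications in software.
 */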
static int
nfp_vdpa_dev_config(int vid)
{
	int ret;
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	device = node->device;
	device->vid = vid;
	rte_atomic_store_explicit(&device->dev_attached, 1, rte_memory_order_relaxed);
	update_datapath(device);

	ret = rte_vhost_host_notifier_ctrl(vid, RTE_VHOST_QUEUE_ALL, true);
	if (ret != 0)
		DRV_VDPA_LOG(INFO, "vDPA (%s): software relay is used.",
				vdev->device->name);

	return 0;
}

static int
nfp_vdpa_dev_close(int vid)
{
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	device = node->device;
	if (device->hw.sw_fallback_running) {
		/* Reset VF */
		nfp_vdpa_stop(device, true);

		/* Remove interrupt setting */
		nfp_vdpa_disable_vfio_intr(device);

		/* Unset DMA map for guest memory */
		nfp_vdpa_dma_map(device, false);

		device->hw.sw_fallback_running = false;

		rte_atomic_store_explicit(&device->dev_attached, 0,
				rte_memory_order_relaxed);
		rte_atomic_store_explicit(&device->running, 0,
				rte_memory_order_relaxed);
	} else {
		rte_atomic_store_explicit(&device->dev_attached, 0,
				rte_memory_order_relaxed);
		update_datapath(device);
	}

	return 0;
}

static int
nfp_vdpa_get_vfio_group_fd(int vid)
{
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	return node->device->vfio_group_fd;
}

static int
nfp_vdpa_get_vfio_device_fd(int vid)
{
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	return node->device->vfio_dev_fd;
}

static int
nfp_vdpa_get_notify_area(int vid,
		int qid,
		uint64_t *offset,
		uint64_t *size)
{
	int ret;
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;
	struct vfio_region_info region = {
		.argsz = sizeof(region)
	};

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	device = node->device;
	region.index = device->hw.notify_region;

	ret = ioctl(device->vfio_dev_fd, VFIO_DEVICE_GET_REGION_INFO, &region);
	if (ret != 0) {
		DRV_VDPA_LOG(ERR, "Failed to get device region info.");
		return -EIO;
	}

	*offset = nfp_vdpa_get_queue_notify_offset(&device->hw, qid) + region.offset;
	*size = NFP_VDPA_NOTIFY_ADDR_INTERVAL;

	return 0;
}

static int
nfp_vdpa_get_queue_num(struct rte_vdpa_device *vdev,
		uint32_t *queue_num)
{
	struct nfp_vdpa_dev_node *node;

	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	*queue_num = node->device->max_queues;

	return 0;
}

static int
nfp_vdpa_get_vdpa_features(struct rte_vdpa_device *vdev,
		uint64_t *features)
{
	struct nfp_vdpa_dev_node *node;

	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	*features = node->device->hw.features;

	return 0;
}

static int
nfp_vdpa_get_protocol_features(struct rte_vdpa_device *vdev __rte_unused,
		uint64_t *features)
{
	*features = 1ULL << VHOST_USER_PROTOCOL_F_LOG_SHMFD |
			1ULL << VHOST_USER_PROTOCOL_F_REPLY_ACK |
			1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ |
			1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD |
			1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER;

	return 0;
}

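/*
 * When the negotiated features require dirty logging (live migration in
 * progress) and the hardware relies on software live migration (sw_lm),
 * switch the device over to the relay datapath.
 */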
static int
nfp_vdpa_set_features(int32_t vid)
{
	int ret;
	uint64_t features = 0;
	struct nfp_vdpa_dev *device;
	struct rte_vdpa_device *vdev;
	struct nfp_vdpa_dev_node *node;

	DRV_VDPA_LOG(DEBUG, "Start vid=%d.", vid);

	vdev = rte_vhost_get_vdpa_device(vid);
	node = nfp_vdpa_find_node_by_vdev(vdev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid vDPA device: %p.", vdev);
		return -ENODEV;
	}

	rte_vhost_get_negotiated_features(vid, &features);

	if (RTE_VHOST_NEED_LOG(features) == 0)
		return 0;

	device = node->device;
	if (device->hw.sw_lm) {
		ret = nfp_vdpa_sw_fallback(device);
		if (ret != 0) {
			DRV_VDPA_LOG(ERR, "Software fallback start failed.");
			return -1;
		}
	}

	return 0;
}

static int
nfp_vdpa_set_vring_state(int vid,
		int vring,
		int state)
{
	DRV_VDPA_LOG(DEBUG, "Start vid=%d, vring=%d, state=%d.", vid, vring, state);
	return 0;
}

struct rte_vdpa_dev_ops nfp_vdpa_ops = {
	.get_queue_num = nfp_vdpa_get_queue_num,
	.get_features = nfp_vdpa_get_vdpa_features,
	.get_protocol_features = nfp_vdpa_get_protocol_features,
	.dev_conf = nfp_vdpa_dev_config,
	.dev_close = nfp_vdpa_dev_close,
	.set_vring_state = nfp_vdpa_set_vring_state,
	.set_features = nfp_vdpa_set_features,
	.get_vfio_group_fd = nfp_vdpa_get_vfio_group_fd,
	.get_vfio_device_fd = nfp_vdpa_get_vfio_device_fd,
	.get_notify_area = nfp_vdpa_get_notify_area,
};

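/*
 * Probe (primary process only): set up VFIO, initialize the vDPA hardware
 * state, register the vDPA device with vhost and add it to the global
 * device list before marking it started.
 */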
static int
nfp_vdpa_pci_probe(struct rte_pci_device *pci_dev)
{
	int ret;
	struct nfp_vdpa_dev *device;
	struct nfp_vdpa_dev_node *node;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	node = calloc(1, sizeof(*node));
	if (node == NULL)
		return -ENOMEM;

	device = calloc(1, sizeof(*device));
	if (device == NULL)
		goto free_node;

	device->pci_dev = pci_dev;

	ret = nfp_vdpa_vfio_setup(device);
	if (ret != 0)
		goto free_device;

	ret = nfp_vdpa_hw_init(&device->hw, pci_dev);
	if (ret != 0)
		goto vfio_teardown;

	device->max_queues = NFP_VDPA_MAX_QUEUES;

	device->vdev = rte_vdpa_register_device(&pci_dev->device, &nfp_vdpa_ops);
	if (device->vdev == NULL) {
		DRV_VDPA_LOG(ERR, "Failed to register device %s.", pci_dev->name);
		goto vfio_teardown;
	}

	node->device = device;
	pthread_mutex_lock(&vdpa_list_lock);
	TAILQ_INSERT_TAIL(&vdpa_dev_list, node, next);
	pthread_mutex_unlock(&vdpa_list_lock);

	rte_spinlock_init(&device->lock);
	rte_atomic_store_explicit(&device->started, 1, rte_memory_order_relaxed);
	update_datapath(device);

	return 0;

vfio_teardown:
	nfp_vdpa_vfio_teardown(device);
free_device:
	free(device);
free_node:
	free(node);

	return -1;
}

static int
nfp_vdpa_pci_remove(struct rte_pci_device *pci_dev)
{
	struct nfp_vdpa_dev *device;
	struct nfp_vdpa_dev_node *node;

	if (rte_eal_process_type() != RTE_PROC_PRIMARY)
		return 0;

	node = nfp_vdpa_find_node_by_pdev(pci_dev);
	if (node == NULL) {
		DRV_VDPA_LOG(ERR, "Invalid device: %s.", pci_dev->name);
		return -ENODEV;
	}

	device = node->device;

	rte_atomic_store_explicit(&device->started, 0, rte_memory_order_relaxed);
	update_datapath(device);

	pthread_mutex_lock(&vdpa_list_lock);
	TAILQ_REMOVE(&vdpa_dev_list, node, next);
	pthread_mutex_unlock(&vdpa_list_lock);

	rte_vdpa_unregister_device(device->vdev);
	nfp_vdpa_vfio_teardown(device);

	free(device);
	free(node);

	return 0;
}

static const struct rte_pci_id pci_id_nfp_vdpa_map[] = {
	{
		RTE_PCI_DEVICE(PCI_VENDOR_ID_NETRONOME,
				PCI_DEVICE_ID_NFP6000_VF_NIC)
	},
	{
		.vendor_id = 0,
	},
};

static struct nfp_class_driver nfp_vdpa = {
	.drv_class = NFP_CLASS_VDPA,
	.name = RTE_STR(NFP_VDPA_DRIVER_NAME),
	.id_table = pci_id_nfp_vdpa_map,
	.drv_flags = RTE_PCI_DRV_NEED_MAPPING | RTE_PCI_DRV_INTR_LSC,
	.probe = nfp_vdpa_pci_probe,
	.remove = nfp_vdpa_pci_remove,
};

RTE_INIT(nfp_vdpa_init)
{
	nfp_class_driver_register(&nfp_vdpa);
}

RTE_PMD_REGISTER_PCI_TABLE(NFP_VDPA_DRIVER_NAME, pci_id_nfp_vdpa_map);
RTE_PMD_REGISTER_KMOD_DEP(NFP_VDPA_DRIVER_NAME, "* vfio-pci");