xref: /dpdk/lib/vhost/vhost_user.c (revision 3da59f30a23f2e795d2315f3d949e1b3e0ce0c3d)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2018 Intel Corporation
3  */
4 
5 /* Security model
6  * --------------
7  * The vhost-user protocol connection is an external interface, so it must be
8  * robust against invalid inputs.
9  *
10  * This is important because the vhost-user frontend is only one step removed
11  * from the guest.  Malicious guests that have escaped will then launch further
12  * attacks from the vhost-user frontend.
13  *
14  * Even in deployments where guests are trusted, a bug in the vhost-user frontend
15  * can still cause invalid messages to be sent.  Such messages must not
16  * compromise the stability of the DPDK application by causing crashes, memory
17  * corruption, or other problematic behavior.
18  *
19  * Do not assume received VhostUserMsg fields contain sensible values!
20  */
21 
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <fcntl.h>
28 #include <sys/ioctl.h>
29 #include <sys/mman.h>
30 #include <sys/stat.h>
31 #include <sys/syscall.h>
32 #ifdef RTE_LIBRTE_VHOST_NUMA
33 #include <numaif.h>
34 #endif
35 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
36 #include <linux/userfaultfd.h>
37 #endif
38 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
39 #include <linux/memfd.h>
40 #define MEMFD_SUPPORTED
41 #endif
42 
43 #include <rte_common.h>
44 #include <rte_malloc.h>
45 #include <rte_log.h>
46 #include <rte_vfio.h>
47 #include <rte_errno.h>
48 
49 #include "iotlb.h"
50 #include "vhost.h"
51 #include "vhost_user.h"
52 
53 #define VIRTIO_MIN_MTU 68
54 #define VIRTIO_MAX_MTU 65535
55 
56 #define INFLIGHT_ALIGNMENT	64
57 #define INFLIGHT_VERSION	0x1
58 
59 typedef struct vhost_message_handler {
60 	const char *description;
61 	int (*callback)(struct virtio_net **pdev, struct vhu_msg_context *ctx,
62 		int main_fd);
63 	bool accepts_fd;
64 } vhost_message_handler_t;
65 static vhost_message_handler_t vhost_message_handlers[];
66 
67 static int send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx);
68 static int read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx);
69 
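/*
 * Close every file descriptor attached to the message context and mark
 * its slot as consumed.
 */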
70 static void
71 close_msg_fds(struct vhu_msg_context *ctx)
72 {
73 	int i;
74 
75 	for (i = 0; i < ctx->fd_num; i++) {
76 		int fd = ctx->fds[i];
77 
78 		if (fd == -1)
79 			continue;
80 
81 		ctx->fds[i] = -1;
82 		close(fd);
83 	}
84 }
85 
86 /*
87  * Ensure the expected number of FDs was received;
88  * otherwise close all received FDs and return an error.
89  */
90 static int
91 validate_msg_fds(struct virtio_net *dev, struct vhu_msg_context *ctx, int expected_fds)
92 {
93 	if (ctx->fd_num == expected_fds)
94 		return 0;
95 
96 	VHOST_CONFIG_LOG(dev->ifname, ERR,
97 		"expect %d FDs for request %s, received %d",
98 		expected_fds, vhost_message_handlers[ctx->msg.request.frontend].description,
99 		ctx->fd_num);
100 
101 	close_msg_fds(ctx);
102 
103 	return -1;
104 }
105 
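/* Return the block size of the file behind fd, or (uint64_t)-1 if fstat() fails. */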
106 static uint64_t
107 get_blk_size(int fd)
108 {
109 	struct stat stat;
110 	int ret;
111 
112 	ret = fstat(fd, &stat);
113 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
114 }
115 
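/*
 * Map or unmap all guest pages in the default VFIO container so that
 * DMA engines used for async copy can access guest memory.
 */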
116 static void
117 async_dma_map(struct virtio_net *dev, bool do_map)
118 {
119 	int ret = 0;
120 	uint32_t i;
121 	struct guest_page *page;
122 
123 	if (do_map) {
124 		for (i = 0; i < dev->nr_guest_pages; i++) {
125 			page = &dev->guest_pages[i];
126 			ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
127 							 page->host_user_addr,
128 							 page->host_iova,
129 							 page->size);
130 			if (ret) {
131 				/*
132 				 * The DMA device may be bound to a kernel driver, in which
133 				 * case we don't need to program the IOMMU manually. However,
134 				 * if no device is bound to vfio/uio in DPDK and the vfio
135 				 * kernel module is loaded, the API will still be called and
136 				 * return ENODEV.
137 				 *
138 				 * DPDK vfio only returns ENODEV in very similar situations
139 				 * (vfio either unsupported, or supported but no devices found).
140 				 * Either way, no mappings could be performed, so we treat it
141 				 * as a normal case in the async path. This is a workaround.
142 				 */
143 				if (rte_errno == ENODEV)
144 					return;
145 
146 				/* DMA mapping errors won't stop VHOST_USER_SET_MEM_TABLE. */
147 				VHOST_CONFIG_LOG(dev->ifname, ERR, "DMA engine map failed");
148 			}
149 		}
150 
151 	} else {
152 		for (i = 0; i < dev->nr_guest_pages; i++) {
153 			page = &dev->guest_pages[i];
154 			ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
155 							   page->host_user_addr,
156 							   page->host_iova,
157 							   page->size);
158 			if (ret) {
159 				/* As with DMA map, ignore the kernel driver case when unmapping. */
160 				if (rte_errno == EINVAL)
161 					return;
162 
163 				VHOST_CONFIG_LOG(dev->ifname, ERR, "DMA engine unmap failed");
164 			}
165 		}
166 	}
167 }
168 
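/*
 * Unmap all guest memory regions and close their backing file descriptors,
 * removing async DMA mappings first when needed.
 */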
169 static void
170 free_mem_region(struct virtio_net *dev)
171 {
172 	uint32_t i;
173 	struct rte_vhost_mem_region *reg;
174 
175 	if (!dev || !dev->mem)
176 		return;
177 
178 	if (dev->async_copy && rte_vfio_is_enabled("vfio"))
179 		async_dma_map(dev, false);
180 
181 	for (i = 0; i < dev->mem->nregions; i++) {
182 		reg = &dev->mem->regions[i];
183 		if (reg->host_user_addr) {
184 			munmap(reg->mmap_addr, reg->mmap_size);
185 			close(reg->fd);
186 		}
187 	}
188 }
189 
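/*
 * Release all backend resources attached to the device: memory table,
 * guest page array, dirty log, inflight area, backend request fd,
 * postcopy userfaultfd and IOTLB.
 */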
190 void
191 vhost_backend_cleanup(struct virtio_net *dev)
192 {
193 	struct rte_vdpa_device *vdpa_dev;
194 
195 	vdpa_dev = dev->vdpa_dev;
196 	if (vdpa_dev && vdpa_dev->ops->dev_cleanup != NULL)
197 		vdpa_dev->ops->dev_cleanup(dev->vid);
198 
199 	if (dev->mem) {
200 		free_mem_region(dev);
201 		rte_free(dev->mem);
202 		dev->mem = NULL;
203 	}
204 
205 	rte_free(dev->guest_pages);
206 	dev->guest_pages = NULL;
207 
208 	if (dev->log_addr) {
209 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
210 		dev->log_addr = 0;
211 	}
212 
213 	if (dev->inflight_info) {
214 		if (dev->inflight_info->addr) {
215 			munmap(dev->inflight_info->addr,
216 			       dev->inflight_info->size);
217 			dev->inflight_info->addr = NULL;
218 		}
219 
220 		if (dev->inflight_info->fd >= 0) {
221 			close(dev->inflight_info->fd);
222 			dev->inflight_info->fd = -1;
223 		}
224 
225 		rte_free(dev->inflight_info);
226 		dev->inflight_info = NULL;
227 	}
228 
229 	if (dev->backend_req_fd >= 0) {
230 		close(dev->backend_req_fd);
231 		dev->backend_req_fd = -1;
232 	}
233 
234 	if (dev->postcopy_ufd >= 0) {
235 		close(dev->postcopy_ufd);
236 		dev->postcopy_ufd = -1;
237 	}
238 
239 	dev->postcopy_listening = 0;
240 
241 	vhost_user_iotlb_destroy(dev);
242 }
243 
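/*
 * Propagate a virtqueue enable/disable state change to the vDPA driver
 * and to the application callbacks.
 */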
244 static void
245 vhost_user_notify_queue_state(struct virtio_net *dev, struct vhost_virtqueue *vq,
246 	int enable)
247 {
248 	struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
249 
250 	/* Configure guest notifications on enable */
251 	if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF)
252 		vhost_enable_guest_notification(dev, vq, vq->notif_enable);
253 
254 	if (vdpa_dev && vdpa_dev->ops->set_vring_state)
255 		vdpa_dev->ops->set_vring_state(dev->vid, vq->index, enable);
256 
257 	if (dev->notify_ops->vring_state_changed)
258 		dev->notify_ops->vring_state_changed(dev->vid, vq->index, enable);
259 }
260 
261 /*
262  * Taking ownership requires no action on our side at the moment,
263  * so this handler simply returns success.
264  */
265 static int
266 vhost_user_set_owner(struct virtio_net **pdev __rte_unused,
267 			struct vhu_msg_context *ctx __rte_unused,
268 			int main_fd __rte_unused)
269 {
270 	return RTE_VHOST_MSG_RESULT_OK;
271 }
272 
273 static int
274 vhost_user_reset_owner(struct virtio_net **pdev,
275 			struct vhu_msg_context *ctx __rte_unused,
276 			int main_fd __rte_unused)
277 {
278 	struct virtio_net *dev = *pdev;
279 
280 	vhost_destroy_device_notify(dev);
281 
282 	cleanup_device(dev, 0);
283 	reset_device(dev);
284 	return RTE_VHOST_MSG_RESULT_OK;
285 }
286 
287 /*
288  * The frontend requests the features we support.
289  */
290 static int
291 vhost_user_get_features(struct virtio_net **pdev,
292 			struct vhu_msg_context *ctx,
293 			int main_fd __rte_unused)
294 {
295 	struct virtio_net *dev = *pdev;
296 	uint64_t features = 0;
297 
298 	rte_vhost_driver_get_features(dev->ifname, &features);
299 
300 	ctx->msg.payload.u64 = features;
301 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
302 	ctx->fd_num = 0;
303 
304 	return RTE_VHOST_MSG_RESULT_REPLY;
305 }
306 
307 /*
308  * The frontend requests the number of queues we support.
309  */
310 static int
311 vhost_user_get_queue_num(struct virtio_net **pdev,
312 			struct vhu_msg_context *ctx,
313 			int main_fd __rte_unused)
314 {
315 	struct virtio_net *dev = *pdev;
316 	uint32_t queue_num = 0;
317 
318 	rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);
319 
320 	ctx->msg.payload.u64 = (uint64_t)queue_num;
321 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
322 	ctx->fd_num = 0;
323 
324 	return RTE_VHOST_MSG_RESULT_REPLY;
325 }
326 
327 /*
328  * We receive the set of features negotiated between us and the virtio frontend.
329  */
330 static int
331 vhost_user_set_features(struct virtio_net **pdev,
332 			struct vhu_msg_context *ctx,
333 			int main_fd __rte_unused)
334 {
335 	struct virtio_net *dev = *pdev;
336 	uint64_t features = ctx->msg.payload.u64;
337 	uint64_t vhost_features = 0;
338 	struct rte_vdpa_device *vdpa_dev;
339 
340 	rte_vhost_driver_get_features(dev->ifname, &vhost_features);
341 	if (features & ~vhost_features) {
342 		VHOST_CONFIG_LOG(dev->ifname, ERR, "received invalid negotiated features.");
343 		dev->flags |= VIRTIO_DEV_FEATURES_FAILED;
344 		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;
345 
346 		return RTE_VHOST_MSG_RESULT_ERR;
347 	}
348 
349 	if (dev->flags & VIRTIO_DEV_RUNNING) {
350 		if (dev->features == features)
351 			return RTE_VHOST_MSG_RESULT_OK;
352 
353 		/*
354 		 * Error out if the frontend tries to change features while the
355 		 * device is running. The only exception is VHOST_F_LOG_ALL,
356 		 * which is enabled when live migration starts.
357 		 */
358 		if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
359 			VHOST_CONFIG_LOG(dev->ifname, ERR,
360 				"features changed while device is running.");
361 			return RTE_VHOST_MSG_RESULT_ERR;
362 		}
363 
364 		if (dev->notify_ops->features_changed)
365 			dev->notify_ops->features_changed(dev->vid, features);
366 	}
367 
368 	dev->features = features;
369 	if (dev->features &
370 		((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
371 		 (1ULL << VIRTIO_F_VERSION_1) |
372 		 (1ULL << VIRTIO_F_RING_PACKED))) {
373 		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
374 	} else {
375 		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
376 	}
377 	VHOST_CONFIG_LOG(dev->ifname, INFO,
378 		"negotiated Virtio features: 0x%" PRIx64,
379 		dev->features);
380 	VHOST_CONFIG_LOG(dev->ifname, DEBUG,
381 		"mergeable RX buffers %s, virtio 1 %s",
382 		(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
383 		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
384 
385 	if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
386 	    !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
387 		/*
388 		 * Remove all but the first queue pair if MQ hasn't been
389 		 * negotiated. This is safe because the device is not
390 		 * running at this stage.
391 		 */
392 		while (dev->nr_vring > 2) {
393 			struct vhost_virtqueue *vq;
394 
395 			vq = dev->virtqueue[--dev->nr_vring];
396 			if (!vq)
397 				continue;
398 
399 			dev->virtqueue[dev->nr_vring] = NULL;
400 			cleanup_vq(vq, 1);
401 			cleanup_vq_inflight(dev, vq);
402 			/* vhost_user_lock_all_queue_pairs locked all qps */
403 			vq_assert_lock(dev, vq);
404 			rte_rwlock_write_unlock(&vq->access_lock);
405 			free_vq(dev, vq);
406 		}
407 	}
408 
409 	vdpa_dev = dev->vdpa_dev;
410 	if (vdpa_dev)
411 		vdpa_dev->ops->set_features(dev->vid);
412 
413 	dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED;
414 	return RTE_VHOST_MSG_RESULT_OK;
415 }
416 
417 /*
418  * The virtio device sends us the size of the descriptor ring.
419  */
420 static int
421 vhost_user_set_vring_num(struct virtio_net **pdev,
422 			struct vhu_msg_context *ctx,
423 			int main_fd __rte_unused)
424 {
425 	struct virtio_net *dev = *pdev;
426 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
427 
428 	if (ctx->msg.payload.state.num > 32768) {
429 		VHOST_CONFIG_LOG(dev->ifname, ERR,
430 			"invalid virtqueue size %u",
431 			ctx->msg.payload.state.num);
432 		return RTE_VHOST_MSG_RESULT_ERR;
433 	}
434 
435 	vq->size = ctx->msg.payload.state.num;
436 
437 	/* VIRTIO 1.0, 2.4 Virtqueues says:
438 	 *
439 	 *   Queue Size value is always a power of 2. The maximum Queue Size
440 	 *   value is 32768.
441 	 *
442 	 * VIRTIO 1.1 2.7 Virtqueues says:
443 	 *
444 	 *   Packed virtqueues support up to 2^15 entries each.
445 	 */
446 	if (!vq_is_packed(dev)) {
447 		if (vq->size & (vq->size - 1)) {
448 			VHOST_CONFIG_LOG(dev->ifname, ERR,
449 				"invalid virtqueue size %u",
450 				vq->size);
451 			return RTE_VHOST_MSG_RESULT_ERR;
452 		}
453 	}
454 
455 	if (vq_is_packed(dev)) {
456 		rte_free(vq->shadow_used_packed);
457 		vq->shadow_used_packed = rte_malloc_socket(NULL,
458 				vq->size *
459 				sizeof(struct vring_used_elem_packed),
460 				RTE_CACHE_LINE_SIZE, vq->numa_node);
461 		if (!vq->shadow_used_packed) {
462 			VHOST_CONFIG_LOG(dev->ifname, ERR,
463 				"failed to allocate memory for shadow used ring.");
464 			return RTE_VHOST_MSG_RESULT_ERR;
465 		}
466 
467 	} else {
468 		rte_free(vq->shadow_used_split);
469 
470 		vq->shadow_used_split = rte_malloc_socket(NULL,
471 				vq->size * sizeof(struct vring_used_elem),
472 				RTE_CACHE_LINE_SIZE, vq->numa_node);
473 
474 		if (!vq->shadow_used_split) {
475 			VHOST_CONFIG_LOG(dev->ifname, ERR,
476 				"failed to allocate memory for vq internal data.");
477 			return RTE_VHOST_MSG_RESULT_ERR;
478 		}
479 	}
480 
481 	rte_free(vq->batch_copy_elems);
482 	vq->batch_copy_elems = rte_malloc_socket(NULL,
483 				vq->size * sizeof(struct batch_copy_elem),
484 				RTE_CACHE_LINE_SIZE, vq->numa_node);
485 	if (!vq->batch_copy_elems) {
486 		VHOST_CONFIG_LOG(dev->ifname, ERR,
487 			"failed to allocate memory for batching copy.");
488 		return RTE_VHOST_MSG_RESULT_ERR;
489 	}
490 
491 	return RTE_VHOST_MSG_RESULT_OK;
492 }
493 
494 /*
495  * Reallocate virtio_dev, vhost_virtqueue and related data structures to
496  * make them on the same numa node as the memory of vring descriptor.
497  */
498 #ifdef RTE_LIBRTE_VHOST_NUMA
499 static void
500 numa_realloc(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
501 {
502 	int node, dev_node;
503 	struct virtio_net *dev;
504 	struct vhost_virtqueue *vq;
505 	struct batch_copy_elem *bce;
506 	struct guest_page *gp;
507 	struct rte_vhost_memory *mem;
508 	size_t mem_size;
509 	int ret;
510 
511 	dev = *pdev;
512 	vq = *pvq;
513 
514 	/*
515 	 * If the VQ is ready, it is too late to reallocate; it certainly already
516 	 * happened anyway on VHOST_USER_SET_VRING_ADDR.
517 	 */
518 	if (vq->ready)
519 		return;
520 
521 	ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR);
522 	if (ret) {
523 		VHOST_CONFIG_LOG(dev->ifname, ERR,
524 			"unable to get virtqueue %d numa information.",
525 			vq->index);
526 		return;
527 	}
528 
529 	if (node == vq->numa_node)
530 		goto out_dev_realloc;
531 
532 	vq = rte_realloc_socket(*pvq, sizeof(**pvq), 0, node);
533 	if (!vq) {
534 		VHOST_CONFIG_LOG(dev->ifname, ERR,
535 			"failed to realloc virtqueue %d on node %d",
536 			(*pvq)->index, node);
537 		return;
538 	}
539 	*pvq = vq;
540 
541 	if (vq != dev->virtqueue[vq->index]) {
542 		VHOST_CONFIG_LOG(dev->ifname, INFO, "reallocated virtqueue on node %d", node);
543 		dev->virtqueue[vq->index] = vq;
544 	}
545 
546 	if (vq_is_packed(dev)) {
547 		struct vring_used_elem_packed *sup;
548 
549 		sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup),
550 				RTE_CACHE_LINE_SIZE, node);
551 		if (!sup) {
552 			VHOST_CONFIG_LOG(dev->ifname, ERR,
553 				"failed to realloc shadow packed on node %d",
554 				node);
555 			return;
556 		}
557 		vq->shadow_used_packed = sup;
558 	} else {
559 		struct vring_used_elem *sus;
560 
561 		sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus),
562 				RTE_CACHE_LINE_SIZE, node);
563 		if (!sus) {
564 			VHOST_CONFIG_LOG(dev->ifname, ERR,
565 				"failed to realloc shadow split on node %d",
566 				node);
567 			return;
568 		}
569 		vq->shadow_used_split = sus;
570 	}
571 
572 	bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce),
573 			RTE_CACHE_LINE_SIZE, node);
574 	if (!bce) {
575 		VHOST_CONFIG_LOG(dev->ifname, ERR,
576 			"failed to realloc batch copy elem on node %d",
577 			node);
578 		return;
579 	}
580 	vq->batch_copy_elems = bce;
581 
582 	if (vq->log_cache) {
583 		struct log_cache_entry *lc;
584 
585 		lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node);
586 		if (!lc) {
587 			VHOST_CONFIG_LOG(dev->ifname, ERR,
588 				"failed to realloc log cache on node %d",
589 				node);
590 			return;
591 		}
592 		vq->log_cache = lc;
593 	}
594 
595 	if (vq->resubmit_inflight) {
596 		struct rte_vhost_resubmit_info *ri;
597 
598 		ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node);
599 		if (!ri) {
600 			VHOST_CONFIG_LOG(dev->ifname, ERR,
601 				"failed to realloc resubmit inflight on node %d",
602 				node);
603 			return;
604 		}
605 		vq->resubmit_inflight = ri;
606 
607 		if (ri->resubmit_list) {
608 			struct rte_vhost_resubmit_desc *rd;
609 
610 			rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num,
611 					0, node);
612 			if (!rd) {
613 				VHOST_CONFIG_LOG(dev->ifname, ERR,
614 					"failed to realloc resubmit list on node %d",
615 					node);
616 				return;
617 			}
618 			ri->resubmit_list = rd;
619 		}
620 	}
621 
622 	vq->numa_node = node;
623 
624 out_dev_realloc:
625 
626 	if (dev->flags & VIRTIO_DEV_RUNNING)
627 		return;
628 
629 	ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
630 	if (ret) {
631 		VHOST_CONFIG_LOG(dev->ifname, ERR, "unable to get numa information.");
632 		return;
633 	}
634 
635 	if (dev_node == node)
636 		return;
637 
638 	dev = rte_realloc_socket(*pdev, sizeof(**pdev), 0, node);
639 	if (!dev) {
640 		VHOST_CONFIG_LOG((*pdev)->ifname, ERR, "failed to realloc dev on node %d", node);
641 		return;
642 	}
643 	*pdev = dev;
644 
645 	VHOST_CONFIG_LOG(dev->ifname, INFO, "reallocated device on node %d", node);
646 	vhost_devices[dev->vid] = dev;
647 
648 	mem_size = sizeof(struct rte_vhost_memory) +
649 		sizeof(struct rte_vhost_mem_region) * dev->mem->nregions;
650 	mem = rte_realloc_socket(dev->mem, mem_size, 0, node);
651 	if (!mem) {
652 		VHOST_CONFIG_LOG(dev->ifname, ERR,
653 			"failed to realloc mem table on node %d",
654 			node);
655 		return;
656 	}
657 	dev->mem = mem;
658 
659 	gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
660 			RTE_CACHE_LINE_SIZE, node);
661 	if (!gp) {
662 		VHOST_CONFIG_LOG(dev->ifname, ERR,
663 			"failed to realloc guest pages on node %d",
664 			node);
665 		return;
666 	}
667 	dev->guest_pages = gp;
668 
669 	vhost_user_iotlb_init(dev);
670 }
671 #else
672 static void
673 numa_realloc(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
674 {
675 	RTE_SET_USED(pdev);
676 	RTE_SET_USED(pvq);
677 }
678 #endif
679 
680 /* Converts QEMU virtual address to Vhost virtual address. */
681 static uint64_t
682 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
683 {
684 	struct rte_vhost_mem_region *r;
685 	uint32_t i;
686 
687 	if (unlikely(!dev || !dev->mem))
688 		goto out_error;
689 
690 	/* Find the region where the address lives. */
691 	for (i = 0; i < dev->mem->nregions; i++) {
692 		r = &dev->mem->regions[i];
693 
694 		if (qva >= r->guest_user_addr &&
695 		    qva <  r->guest_user_addr + r->size) {
696 
697 			if (unlikely(*len > r->guest_user_addr + r->size - qva))
698 				*len = r->guest_user_addr + r->size - qva;
699 
700 			return qva - r->guest_user_addr +
701 			       r->host_user_addr;
702 		}
703 	}
704 out_error:
705 	*len = 0;
706 
707 	return 0;
708 }
709 
710 
711 /*
712  * Converts ring address to Vhost virtual address.
713  * If IOMMU is enabled, the ring address is a guest IO virtual address,
714  * else it is a QEMU virtual address.
715  */
716 static uint64_t
717 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
718 		uint64_t ra, uint64_t *size)
719 {
720 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
721 		uint64_t vva;
722 
723 		vhost_user_iotlb_rd_lock(vq);
724 		vva = vhost_iova_to_vva(dev, vq, ra,
725 					size, VHOST_ACCESS_RW);
726 		vhost_user_iotlb_rd_unlock(vq);
727 
728 		return vva;
729 	}
730 
731 	return qva_to_vva(dev, ra, size);
732 }
733 
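/*
 * Translate the vring log address into a guest physical address,
 * holding the IOTLB read lock during the translation.
 */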
734 static uint64_t
735 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq)
736 {
737 	uint64_t log_gpa;
738 
739 	vhost_user_iotlb_rd_lock(vq);
740 	log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr);
741 	vhost_user_iotlb_rd_unlock(vq);
742 
743 	return log_gpa;
744 }
745 
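/*
 * Return the page alignment of the region backing a host virtual address,
 * or 1 if no region matches (i.e. do not align at all).
 */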
746 static uint64_t
747 hua_to_alignment(struct rte_vhost_memory *mem, void *ptr)
748 {
749 	struct rte_vhost_mem_region *r;
750 	uint32_t i;
751 	uintptr_t hua = (uintptr_t)ptr;
752 
753 	for (i = 0; i < mem->nregions; i++) {
754 		r = &mem->regions[i];
755 		if (hua >= r->host_user_addr &&
756 			hua < r->host_user_addr + r->size) {
757 			return get_blk_size(r->fd);
758 		}
759 	}
760 
761 	/* If region isn't found, don't align at all */
762 	return 1;
763 }
764 
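/*
 * Include or exclude a memory range from core dumps using madvise(),
 * when MADV_DONTDUMP is available on this platform.
 */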
765 void
766 mem_set_dump(struct virtio_net *dev, void *ptr, size_t size, bool enable, uint64_t pagesz)
767 {
768 #ifdef MADV_DONTDUMP
769 	void *start = RTE_PTR_ALIGN_FLOOR(ptr, pagesz);
770 	uintptr_t end = RTE_ALIGN_CEIL((uintptr_t)ptr + size, pagesz);
771 	size_t len = end - (uintptr_t)start;
772 
773 	if (madvise(start, len, enable ? MADV_DODUMP : MADV_DONTDUMP) == -1) {
774 		VHOST_CONFIG_LOG(dev->ifname, INFO,
775 			"could not set coredump preference (%s).", strerror(errno));
776 	}
777 #endif
778 }
779 
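/*
 * Translate the ring addresses stored in vq->ring_addrs into host virtual
 * addresses and mark the virtqueue accessible on success.
 */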
780 static void
781 translate_ring_addresses(struct virtio_net **pdev, struct vhost_virtqueue **pvq)
782 {
783 	struct vhost_virtqueue *vq;
784 	struct virtio_net *dev;
785 	uint64_t len, expected_len;
786 
787 	dev = *pdev;
788 	vq = *pvq;
789 
790 	vq_assert_lock(dev, vq);
791 
792 	if (vq->ring_addrs.flags & (1 << VHOST_VRING_F_LOG)) {
793 		vq->log_guest_addr =
794 			log_addr_to_gpa(dev, vq);
795 		if (vq->log_guest_addr == 0) {
796 			VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map log_guest_addr.");
797 			return;
798 		}
799 	}
800 
801 	if (vq_is_packed(dev)) {
802 		len = sizeof(struct vring_packed_desc) * vq->size;
803 		vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
804 			ring_addr_to_vva(dev, vq, vq->ring_addrs.desc_user_addr, &len);
805 		if (vq->desc_packed == NULL ||
806 				len != sizeof(struct vring_packed_desc) *
807 				vq->size) {
808 			VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map desc_packed ring.");
809 			return;
810 		}
811 
812 		mem_set_dump(dev, vq->desc_packed, len, true,
813 			hua_to_alignment(dev->mem, vq->desc_packed));
814 		numa_realloc(&dev, &vq);
815 		*pdev = dev;
816 		*pvq = vq;
817 
818 		len = sizeof(struct vring_packed_desc_event);
819 		vq->driver_event = (struct vring_packed_desc_event *)
820 					(uintptr_t)ring_addr_to_vva(dev,
821 					vq, vq->ring_addrs.avail_user_addr, &len);
822 		if (vq->driver_event == NULL ||
823 				len != sizeof(struct vring_packed_desc_event)) {
824 			VHOST_CONFIG_LOG(dev->ifname, DEBUG,
825 				"failed to find driver area address.");
826 			return;
827 		}
828 
829 		mem_set_dump(dev, vq->driver_event, len, true,
830 			hua_to_alignment(dev->mem, vq->driver_event));
831 		len = sizeof(struct vring_packed_desc_event);
832 		vq->device_event = (struct vring_packed_desc_event *)
833 					(uintptr_t)ring_addr_to_vva(dev,
834 					vq, vq->ring_addrs.used_user_addr, &len);
835 		if (vq->device_event == NULL ||
836 				len != sizeof(struct vring_packed_desc_event)) {
837 			VHOST_CONFIG_LOG(dev->ifname, DEBUG,
838 				"failed to find device area address.");
839 			return;
840 		}
841 
842 		mem_set_dump(dev, vq->device_event, len, true,
843 			hua_to_alignment(dev->mem, vq->device_event));
844 		vq->access_ok = true;
845 		return;
846 	}
847 
848 	/* The addresses are converted from QEMU virtual to Vhost virtual. */
849 	if (vq->desc && vq->avail && vq->used)
850 		return;
851 
852 	len = sizeof(struct vring_desc) * vq->size;
853 	vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
854 			vq, vq->ring_addrs.desc_user_addr, &len);
855 	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
856 		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map desc ring.");
857 		return;
858 	}
859 
860 	mem_set_dump(dev, vq->desc, len, true, hua_to_alignment(dev->mem, vq->desc));
861 	numa_realloc(&dev, &vq);
862 	*pdev = dev;
863 	*pvq = vq;
864 
865 	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
866 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
867 		len += sizeof(uint16_t);
868 	expected_len = len;
869 	vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
870 			vq, vq->ring_addrs.avail_user_addr, &len);
871 	if (vq->avail == 0 || len != expected_len) {
872 		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map avail ring.");
873 		return;
874 	}
875 
876 	mem_set_dump(dev, vq->avail, len, true, hua_to_alignment(dev->mem, vq->avail));
877 	len = sizeof(struct vring_used) +
878 		sizeof(struct vring_used_elem) * vq->size;
879 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
880 		len += sizeof(uint16_t);
881 	expected_len = len;
882 	vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
883 			vq, vq->ring_addrs.used_user_addr, &len);
884 	if (vq->used == 0 || len != expected_len) {
885 		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "failed to map used ring.");
886 		return;
887 	}
888 
889 	mem_set_dump(dev, vq->used, len, true, hua_to_alignment(dev->mem, vq->used));
890 
891 	if (vq->last_used_idx != vq->used->idx) {
892 		VHOST_CONFIG_LOG(dev->ifname, WARNING,
893 			"last_used_idx (%u) and vq->used->idx (%u) mismatch;",
894 			vq->last_used_idx, vq->used->idx);
895 		vq->last_used_idx  = vq->used->idx;
896 		vq->last_avail_idx = vq->used->idx;
897 		VHOST_CONFIG_LOG(dev->ifname, WARNING,
898 			"some packets may be resent for Tx and dropped for Rx");
899 	}
900 
901 	vq->access_ok = true;
902 
903 	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "mapped address desc: %p", vq->desc);
904 	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "mapped address avail: %p", vq->avail);
905 	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "mapped address used: %p", vq->used);
906 	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "log_guest_addr: %" PRIx64, vq->log_guest_addr);
907 }
908 
909 /*
910  * The virtio device sends us the desc, used and avail ring addresses.
911  * This function then converts these to our address space.
912  */
913 static int
914 vhost_user_set_vring_addr(struct virtio_net **pdev,
915 			struct vhu_msg_context *ctx,
916 			int main_fd __rte_unused)
917 {
918 	struct virtio_net *dev = *pdev;
919 	struct vhost_virtqueue *vq;
920 	struct vhost_vring_addr *addr = &ctx->msg.payload.addr;
921 	bool access_ok;
922 
923 	if (dev->mem == NULL)
924 		return RTE_VHOST_MSG_RESULT_ERR;
925 
926 	/* addr->index refers to the queue index: txq is 1, rxq is 0. */
927 	vq = dev->virtqueue[ctx->msg.payload.addr.index];
928 
929 	/* vhost_user_lock_all_queue_pairs locked all qps */
930 	vq_assert_lock(dev, vq);
931 
932 	access_ok = vq->access_ok;
933 
934 	/*
935 	 * Ring addresses should not be interpreted as long as the ring is not
936 	 * started and enabled.
937 	 */
938 	memcpy(&vq->ring_addrs, addr, sizeof(*addr));
939 
940 	vring_invalidate(dev, vq);
941 
942 	if ((vq->enabled && (dev->features &
943 				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) ||
944 			access_ok) {
945 		translate_ring_addresses(&dev, &vq);
946 		*pdev = dev;
947 	}
948 
949 	return RTE_VHOST_MSG_RESULT_OK;
950 }
951 
952 /*
953  * The virtio device sends us the vring base (the last available index).
954  */
955 static int
956 vhost_user_set_vring_base(struct virtio_net **pdev,
957 			struct vhu_msg_context *ctx,
958 			int main_fd __rte_unused)
959 {
960 	struct virtio_net *dev = *pdev;
961 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
962 	uint64_t val = ctx->msg.payload.state.num;
963 
964 	if (vq_is_packed(dev)) {
965 		/*
966 		 * Bit[0:14]: avail index
967 		 * Bit[15]: avail wrap counter
968 		 */
969 		vq->last_avail_idx = val & 0x7fff;
970 		vq->avail_wrap_counter = !!(val & (0x1 << 15));
971 		/*
972 		 * Set the used index to the same value as the available one;
973 		 * they should match since ring processing was stopped when
974 		 * the base was retrieved.
975 		 */
976 		vq->last_used_idx = vq->last_avail_idx;
977 		vq->used_wrap_counter = vq->avail_wrap_counter;
978 	} else {
979 		vq->last_used_idx = ctx->msg.payload.state.num;
980 		vq->last_avail_idx = ctx->msg.payload.state.num;
981 	}
982 
983 	VHOST_CONFIG_LOG(dev->ifname, INFO,
984 		"vring base idx:%u last_used_idx:%u last_avail_idx:%u.",
985 		ctx->msg.payload.state.index, vq->last_used_idx, vq->last_avail_idx);
986 
987 	return RTE_VHOST_MSG_RESULT_OK;
988 }
989 
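/*
 * Append one guest page mapping, growing the array when full and merging
 * entries that are contiguous in guest, host and IOVA space.
 */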
990 static int
991 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
992 		   uint64_t host_iova, uint64_t host_user_addr, uint64_t size)
993 {
994 	struct guest_page *page, *last_page;
995 	struct guest_page *old_pages;
996 
997 	if (dev->nr_guest_pages == dev->max_guest_pages) {
998 		dev->max_guest_pages *= 2;
999 		old_pages = dev->guest_pages;
1000 		dev->guest_pages = rte_realloc(dev->guest_pages,
1001 					dev->max_guest_pages * sizeof(*page),
1002 					RTE_CACHE_LINE_SIZE);
1003 		if (dev->guest_pages == NULL) {
1004 			VHOST_CONFIG_LOG(dev->ifname, ERR, "cannot realloc guest_pages");
1005 			rte_free(old_pages);
1006 			return -1;
1007 		}
1008 	}
1009 
1010 	if (dev->nr_guest_pages > 0) {
1011 		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
1012 		/* merge if the two pages are contiguous */
1013 		if (host_iova == last_page->host_iova + last_page->size &&
1014 		    guest_phys_addr == last_page->guest_phys_addr + last_page->size &&
1015 		    host_user_addr == last_page->host_user_addr + last_page->size) {
1016 			last_page->size += size;
1017 			return 0;
1018 		}
1019 	}
1020 
1021 	page = &dev->guest_pages[dev->nr_guest_pages++];
1022 	page->guest_phys_addr = guest_phys_addr;
1023 	page->host_iova  = host_iova;
1024 	page->host_user_addr = host_user_addr;
1025 	page->size = size;
1026 
1027 	return 0;
1028 }
1029 
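/*
 * Walk a memory region page by page and record the guest-physical to
 * host-IOVA mapping of each host page.
 */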
1030 static int
1031 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
1032 		uint64_t page_size)
1033 {
1034 	uint64_t reg_size = reg->size;
1035 	uint64_t host_user_addr  = reg->host_user_addr;
1036 	uint64_t guest_phys_addr = reg->guest_phys_addr;
1037 	uint64_t host_iova;
1038 	uint64_t size;
1039 
1040 	host_iova = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
1041 	size = page_size - (guest_phys_addr & (page_size - 1));
1042 	size = RTE_MIN(size, reg_size);
1043 
1044 	if (add_one_guest_page(dev, guest_phys_addr, host_iova,
1045 			       host_user_addr, size) < 0)
1046 		return -1;
1047 
1048 	host_user_addr  += size;
1049 	guest_phys_addr += size;
1050 	reg_size -= size;
1051 
1052 	while (reg_size > 0) {
1053 		size = RTE_MIN(reg_size, page_size);
1054 		host_iova = rte_mem_virt2iova((void *)(uintptr_t)
1055 						  host_user_addr);
1056 		if (add_one_guest_page(dev, guest_phys_addr, host_iova,
1057 				       host_user_addr, size) < 0)
1058 			return -1;
1059 
1060 		host_user_addr  += size;
1061 		guest_phys_addr += size;
1062 		reg_size -= size;
1063 	}
1064 
1065 	/* sort guest page array if over binary search threshold */
1066 	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
1067 		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
1068 			sizeof(struct guest_page), guest_page_addrcmp);
1069 	}
1070 
1071 	return 0;
1072 }
1073 
1074 #ifdef RTE_LIBRTE_VHOST_DEBUG
1075 /* TODO: enable it only in debug mode? */
1076 static void
1077 dump_guest_pages(struct virtio_net *dev)
1078 {
1079 	uint32_t i;
1080 	struct guest_page *page;
1081 
1082 	for (i = 0; i < dev->nr_guest_pages; i++) {
1083 		page = &dev->guest_pages[i];
1084 
1085 		VHOST_CONFIG_LOG(dev->ifname, INFO, "guest physical page region %u", i);
1086 		VHOST_CONFIG_LOG(dev->ifname, INFO, "\tguest_phys_addr: %" PRIx64,
1087 			page->guest_phys_addr);
1088 		VHOST_CONFIG_LOG(dev->ifname, INFO, "\thost_iova : %" PRIx64,
1089 			page->host_iova);
1090 		VHOST_CONFIG_LOG(dev->ifname, INFO, "\tsize           : %" PRIx64,
1091 			page->size);
1092 	}
1093 }
1094 #else
1095 #define dump_guest_pages(dev)
1096 #endif
1097 
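/* Return true if the new memory table differs from the currently registered one. */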
1098 static bool
1099 vhost_memory_changed(struct VhostUserMemory *new,
1100 		     struct rte_vhost_memory *old)
1101 {
1102 	uint32_t i;
1103 
1104 	if (new->nregions != old->nregions)
1105 		return true;
1106 
1107 	for (i = 0; i < new->nregions; ++i) {
1108 		VhostUserMemoryRegion *new_r = &new->regions[i];
1109 		struct rte_vhost_mem_region *old_r = &old->regions[i];
1110 
1111 		if (new_r->guest_phys_addr != old_r->guest_phys_addr)
1112 			return true;
1113 		if (new_r->memory_size != old_r->size)
1114 			return true;
1115 		if (new_r->userspace_addr != old_r->guest_user_addr)
1116 			return true;
1117 	}
1118 
1119 	return false;
1120 }
1121 
1122 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
1123 static int
1124 vhost_user_postcopy_region_register(struct virtio_net *dev,
1125 		struct rte_vhost_mem_region *reg)
1126 {
1127 	struct uffdio_register reg_struct;
1128 
1129 	/*
1130 	 * Register the whole mmapped area to ensure
1131 	 * alignment on page boundaries.
1132 	 */
1133 	reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
1134 	reg_struct.range.len = reg->mmap_size;
1135 	reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
1136 
1137 	if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
1138 				&reg_struct)) {
1139 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1140 			"failed to register ufd for region "
1141 			"%" PRIx64 " - %" PRIx64 " (ufd = %d) %s",
1142 			(uint64_t)reg_struct.range.start,
1143 			(uint64_t)reg_struct.range.start +
1144 			(uint64_t)reg_struct.range.len - 1,
1145 			dev->postcopy_ufd,
1146 			strerror(errno));
1147 		return -1;
1148 	}
1149 
1150 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1151 		"\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64,
1152 		(uint64_t)reg_struct.range.start,
1153 		(uint64_t)reg_struct.range.start +
1154 		(uint64_t)reg_struct.range.len - 1);
1155 
1156 	return 0;
1157 }
1158 #else
1159 static int
1160 vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
1161 		struct rte_vhost_mem_region *reg __rte_unused)
1162 {
1163 	return -1;
1164 }
1165 #endif
1166 
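/*
 * Postcopy support: share the regions' host virtual addresses with QEMU,
 * wait for its acknowledgement, then register each region with userfaultfd.
 */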
1167 static int
1168 vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
1169 		struct vhu_msg_context *ctx)
1170 {
1171 	struct VhostUserMemory *memory;
1172 	struct rte_vhost_mem_region *reg;
1173 	struct vhu_msg_context ack_ctx;
1174 	uint32_t i;
1175 
1176 	if (!dev->postcopy_listening)
1177 		return 0;
1178 
1179 	/*
1180 	 * We don't have a better way right now than sharing
1181 	 * DPDK's virtual addresses with QEMU, so that QEMU can
1182 	 * retrieve the region offset when handling userfaults.
1183 	 */
1184 	memory = &ctx->msg.payload.memory;
1185 	for (i = 0; i < memory->nregions; i++) {
1186 		reg = &dev->mem->regions[i];
1187 		memory->regions[i].userspace_addr = reg->host_user_addr;
1188 	}
1189 
1190 	/* Send the addresses back to qemu */
1191 	ctx->fd_num = 0;
1192 	send_vhost_reply(dev, main_fd, ctx);
1193 
1194 	/* Wait for QEMU to acknowledge that it got the addresses;
1195 	 * we must wait before we are allowed to generate faults.
1196 	 */
1197 	if (read_vhost_message(dev, main_fd, &ack_ctx) <= 0) {
1198 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1199 			"failed to read qemu ack on postcopy set-mem-table");
1200 		return -1;
1201 	}
1202 
1203 	if (validate_msg_fds(dev, &ack_ctx, 0) != 0)
1204 		return -1;
1205 
1206 	if (ack_ctx.msg.request.frontend != VHOST_USER_SET_MEM_TABLE) {
1207 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1208 			"bad qemu ack on postcopy set-mem-table (%d)",
1209 			ack_ctx.msg.request.frontend);
1210 		return -1;
1211 	}
1212 
1213 	/* Now register with userfaultfd so the memory can be used */
1214 	for (i = 0; i < memory->nregions; i++) {
1215 		reg = &dev->mem->regions[i];
1216 		if (vhost_user_postcopy_region_register(dev, reg) < 0)
1217 			return -1;
1218 	}
1219 
1220 	return 0;
1221 }
1222 
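/*
 * mmap() a guest memory region from its file descriptor, honouring the
 * mmap offset and hugepage alignment, and record the resulting addresses.
 */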
1223 static int
1224 vhost_user_mmap_region(struct virtio_net *dev,
1225 		struct rte_vhost_mem_region *region,
1226 		uint64_t mmap_offset)
1227 {
1228 	void *mmap_addr;
1229 	uint64_t mmap_size;
1230 	uint64_t alignment;
1231 	int populate;
1232 
1233 	/* Check for memory_size + mmap_offset overflow */
1234 	if (mmap_offset >= -region->size) {
1235 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1236 			"mmap_offset (%#"PRIx64") and memory_size (%#"PRIx64") overflow",
1237 			mmap_offset, region->size);
1238 		return -1;
1239 	}
1240 
1241 	mmap_size = region->size + mmap_offset;
1242 
1243 	/* On older longterm Linux versions, such as 2.6.32 and 3.2.72, mmap()
1244 	 * without the MAP_ANONYMOUS flag must be called with a length argument
1245 	 * aligned to the hugepage size, or it fails with EINVAL.
1246 	 *
1247 	 * To avoid that failure, make sure the caller keeps the length aligned.
1248 	 */
1249 	alignment = get_blk_size(region->fd);
1250 	if (alignment == (uint64_t)-1) {
1251 		VHOST_CONFIG_LOG(dev->ifname, ERR, "couldn't get hugepage size through fstat");
1252 		return -1;
1253 	}
1254 	mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
1255 	if (mmap_size == 0) {
1256 		/*
1257 		 * This can happen if the initial mmap_size + alignment overflows
1258 		 * a uint64_t, which means either the mmap_size or the alignment
1259 		 * value is wrong.
1260 		 *
1261 		 * The kernel's mmap() implementation would return an error anyway,
1262 		 * but it is better to catch it here and log useful information.
1263 		 */
1264 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1265 			"mmap size (0x%" PRIx64 ") or alignment (0x%" PRIx64 ") is invalid",
1266 			region->size + mmap_offset, alignment);
1267 		return -1;
1268 	}
1269 
1270 	populate = dev->async_copy ? MAP_POPULATE : 0;
1271 	mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1272 			MAP_SHARED | populate, region->fd, 0);
1273 
1274 	if (mmap_addr == MAP_FAILED) {
1275 		VHOST_CONFIG_LOG(dev->ifname, ERR, "mmap failed (%s).", strerror(errno));
1276 		return -1;
1277 	}
1278 
1279 	region->mmap_addr = mmap_addr;
1280 	region->mmap_size = mmap_size;
1281 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
1282 	mem_set_dump(dev, mmap_addr, mmap_size, false, alignment);
1283 
1284 	if (dev->async_copy) {
1285 		if (add_guest_pages(dev, region, alignment) < 0) {
1286 			VHOST_CONFIG_LOG(dev->ifname, ERR,
1287 				"adding guest pages to region failed.");
1288 			return -1;
1289 		}
1290 	}
1291 
1292 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1293 		"guest memory region size: 0x%" PRIx64,
1294 		region->size);
1295 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1296 		"\t guest physical addr: 0x%" PRIx64,
1297 		region->guest_phys_addr);
1298 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1299 		"\t guest virtual  addr: 0x%" PRIx64,
1300 		region->guest_user_addr);
1301 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1302 		"\t host  virtual  addr: 0x%" PRIx64,
1303 		region->host_user_addr);
1304 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1305 		"\t mmap addr : 0x%" PRIx64,
1306 		(uint64_t)(uintptr_t)mmap_addr);
1307 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1308 		"\t mmap size : 0x%" PRIx64,
1309 		mmap_size);
1310 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1311 		"\t mmap align: 0x%" PRIx64,
1312 		alignment);
1313 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1314 		"\t mmap off  : 0x%" PRIx64,
1315 		mmap_offset);
1316 
1317 	return 0;
1318 }
1319 
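/*
 * Handle VHOST_USER_SET_MEM_TABLE: map the new guest memory regions,
 * rebuild the guest page array, register for postcopy when requested
 * and re-translate the rings that were already set up.
 */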
1320 static int
1321 vhost_user_set_mem_table(struct virtio_net **pdev,
1322 			struct vhu_msg_context *ctx,
1323 			int main_fd)
1324 {
1325 	struct virtio_net *dev = *pdev;
1326 	struct VhostUserMemory *memory = &ctx->msg.payload.memory;
1327 	struct rte_vhost_mem_region *reg;
1328 	int numa_node = SOCKET_ID_ANY;
1329 	uint64_t mmap_offset;
1330 	uint32_t i;
1331 	bool async_notify = false;
1332 
1333 	if (validate_msg_fds(dev, ctx, memory->nregions) != 0)
1334 		return RTE_VHOST_MSG_RESULT_ERR;
1335 
1336 	if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
1337 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1338 			"too many memory regions (%u)",
1339 			memory->nregions);
1340 		goto close_msg_fds;
1341 	}
1342 
1343 	if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
1344 		VHOST_CONFIG_LOG(dev->ifname, INFO, "memory regions not changed");
1345 
1346 		close_msg_fds(ctx);
1347 
1348 		return RTE_VHOST_MSG_RESULT_OK;
1349 	}
1350 
1351 	if (dev->mem) {
1352 		if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) {
1353 			struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
1354 
1355 			if (vdpa_dev && vdpa_dev->ops->dev_close)
1356 				vdpa_dev->ops->dev_close(dev->vid);
1357 			dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
1358 		}
1359 
1360 		/* notify the vhost application to stop DMA transfers */
1361 		if (dev->async_copy && dev->notify_ops->vring_state_changed) {
1362 			for (i = 0; i < dev->nr_vring; i++) {
1363 				dev->notify_ops->vring_state_changed(dev->vid,
1364 						i, 0);
1365 			}
1366 			async_notify = true;
1367 		}
1368 
1369 		/* Flush IOTLB cache as previous HVAs are now invalid */
1370 		if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1371 			vhost_user_iotlb_flush_all(dev);
1372 
1373 		free_mem_region(dev);
1374 		rte_free(dev->mem);
1375 		dev->mem = NULL;
1376 	}
1377 
1378 	/*
1379 	 * If VQ 0 has already been allocated, try to allocate on the same
1380 	 * NUMA node. It can be reallocated later in numa_realloc().
1381 	 */
1382 	if (dev->nr_vring > 0)
1383 		numa_node = dev->virtqueue[0]->numa_node;
1384 
1385 	dev->nr_guest_pages = 0;
1386 	if (dev->guest_pages == NULL) {
1387 		dev->max_guest_pages = 8;
1388 		dev->guest_pages = rte_zmalloc_socket(NULL,
1389 					dev->max_guest_pages *
1390 					sizeof(struct guest_page),
1391 					RTE_CACHE_LINE_SIZE,
1392 					numa_node);
1393 		if (dev->guest_pages == NULL) {
1394 			VHOST_CONFIG_LOG(dev->ifname, ERR,
1395 				"failed to allocate memory for dev->guest_pages");
1396 			goto close_msg_fds;
1397 		}
1398 	}
1399 
1400 	dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) +
1401 		sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node);
1402 	if (dev->mem == NULL) {
1403 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to allocate memory for dev->mem");
1404 		goto free_guest_pages;
1405 	}
1406 
1407 	for (i = 0; i < memory->nregions; i++) {
1408 		reg = &dev->mem->regions[i];
1409 
1410 		reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
1411 		reg->guest_user_addr = memory->regions[i].userspace_addr;
1412 		reg->size            = memory->regions[i].memory_size;
1413 		reg->fd              = ctx->fds[i];
1414 
1415 		/*
1416 		 * Assign an invalid file descriptor value to avoid double
1417 		 * closing on the error path.
1418 		 */
1419 		ctx->fds[i] = -1;
1420 
1421 		mmap_offset = memory->regions[i].mmap_offset;
1422 
1423 		if (vhost_user_mmap_region(dev, reg, mmap_offset) < 0) {
1424 			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap region %u", i);
1425 			goto free_mem_table;
1426 		}
1427 
1428 		dev->mem->nregions++;
1429 	}
1430 
1431 	if (dev->async_copy && rte_vfio_is_enabled("vfio"))
1432 		async_dma_map(dev, true);
1433 
1434 	if (vhost_user_postcopy_register(dev, main_fd, ctx) < 0)
1435 		goto free_mem_table;
1436 
1437 	for (i = 0; i < dev->nr_vring; i++) {
1438 		struct vhost_virtqueue *vq = dev->virtqueue[i];
1439 
1440 		if (!vq)
1441 			continue;
1442 
1443 		if (vq->desc || vq->avail || vq->used) {
1444 			/* vhost_user_lock_all_queue_pairs locked all qps */
1445 			vq_assert_lock(dev, vq);
1446 
1447 			/*
1448 			 * If the memory table got updated, the ring addresses
1449 			 * need to be translated again as virtual addresses have
1450 			 * changed.
1451 			 */
1452 			vring_invalidate(dev, vq);
1453 
1454 			translate_ring_addresses(&dev, &vq);
1455 			*pdev = dev;
1456 		}
1457 	}
1458 
1459 	dump_guest_pages(dev);
1460 
1461 	if (async_notify) {
1462 		for (i = 0; i < dev->nr_vring; i++)
1463 			dev->notify_ops->vring_state_changed(dev->vid, i, 1);
1464 	}
1465 
1466 	return RTE_VHOST_MSG_RESULT_OK;
1467 
1468 free_mem_table:
1469 	free_mem_region(dev);
1470 	rte_free(dev->mem);
1471 	dev->mem = NULL;
1472 
1473 free_guest_pages:
1474 	rte_free(dev->guest_pages);
1475 	dev->guest_pages = NULL;
1476 close_msg_fds:
1477 	close_msg_fds(ctx);
1478 	return RTE_VHOST_MSG_RESULT_ERR;
1479 }
1480 
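/*
 * A virtqueue is ready once its rings are mapped, its kick and call fds
 * are initialized and it has been enabled.
 */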
1481 static bool
1482 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
1483 {
1484 	bool rings_ok;
1485 
1486 	if (!vq)
1487 		return false;
1488 
1489 	if (vq_is_packed(dev))
1490 		rings_ok = vq->desc_packed && vq->driver_event &&
1491 			vq->device_event;
1492 	else
1493 		rings_ok = vq->desc && vq->avail && vq->used;
1494 
1495 	return rings_ok &&
1496 	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1497 	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1498 	       vq->enabled;
1499 }
1500 
1501 #define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u
1502 #define VIRTIO_BLK_NUM_VQS_TO_BE_READY 1u
1503 
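/*
 * Check that enough virtqueues are ready for the device type and, when the
 * STATUS protocol feature was negotiated, that the frontend set DRIVER_OK.
 */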
1504 static int
1505 virtio_is_ready(struct virtio_net *dev)
1506 {
1507 	struct rte_vdpa_device *vdpa_dev;
1508 	struct vhost_virtqueue *vq;
1509 	uint32_t vdpa_type;
1510 	uint32_t i, nr_vring = dev->nr_vring;
1511 
1512 	if (dev->flags & VIRTIO_DEV_READY)
1513 		return 1;
1514 
1515 	if (!dev->nr_vring)
1516 		return 0;
1517 
1518 	vdpa_dev = dev->vdpa_dev;
1519 	if (vdpa_dev)
1520 		vdpa_type = vdpa_dev->type;
1521 	else
1522 		vdpa_type = -1;
1523 
1524 	if (vdpa_type == RTE_VHOST_VDPA_DEVICE_TYPE_BLK) {
1525 		nr_vring = VIRTIO_BLK_NUM_VQS_TO_BE_READY;
1526 	} else {
1527 		if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET)
1528 			nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY;
1529 	}
1530 
1531 	if (dev->nr_vring < nr_vring)
1532 		return 0;
1533 
1534 	for (i = 0; i < nr_vring; i++) {
1535 		vq = dev->virtqueue[i];
1536 
1537 		if (!vq_is_ready(dev, vq))
1538 			return 0;
1539 	}
1540 
1541 	/* If supported, ensure the frontend is really done with config */
1542 	if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS))
1543 		if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK))
1544 			return 0;
1545 
1546 	dev->flags |= VIRTIO_DEV_READY;
1547 
1548 	if (!(dev->flags & VIRTIO_DEV_RUNNING))
1549 		VHOST_CONFIG_LOG(dev->ifname, INFO, "virtio is now ready for processing.");
1550 	return 1;
1551 }
1552 
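/*
 * Allocate shared memory for inflight tracking, using memfd when available
 * and falling back to an unlinked temporary file otherwise.
 */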
1553 static void *
1554 inflight_mem_alloc(struct virtio_net *dev, const char *name, size_t size, int *fd)
1555 {
1556 	void *ptr;
1557 	int mfd = -1;
1558 	uint64_t alignment;
1559 	char fname[20] = "/tmp/memfd-XXXXXX";
1560 
1561 	*fd = -1;
1562 #ifdef MEMFD_SUPPORTED
1563 	mfd = memfd_create(name, MFD_CLOEXEC);
1564 #else
1565 	RTE_SET_USED(name);
1566 #endif
1567 	if (mfd == -1) {
1568 		mfd = mkstemp(fname);
1569 		if (mfd == -1) {
1570 			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to get inflight buffer fd");
1571 			return NULL;
1572 		}
1573 
1574 		unlink(fname);
1575 	}
1576 
1577 	if (ftruncate(mfd, size) == -1) {
1578 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc inflight buffer");
1579 		close(mfd);
1580 		return NULL;
1581 	}
1582 
1583 	ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
1584 	if (ptr == MAP_FAILED) {
1585 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap inflight buffer");
1586 		close(mfd);
1587 		return NULL;
1588 	}
1589 
1590 	alignment = get_blk_size(mfd);
1591 	mem_set_dump(dev, ptr, size, false, alignment);
1592 	*fd = mfd;
1593 	return ptr;
1594 }
1595 
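/*
 * Per-virtqueue inflight shared memory size for split rings,
 * rounded up to INFLIGHT_ALIGNMENT.
 */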
1596 static uint32_t
1597 get_pervq_shm_size_split(uint16_t queue_size)
1598 {
1599 	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) *
1600 				  queue_size + sizeof(uint64_t) +
1601 				  sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT);
1602 }
1603 
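/*
 * Per-virtqueue inflight shared memory size for packed rings,
 * rounded up to INFLIGHT_ALIGNMENT.
 */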
1604 static uint32_t
1605 get_pervq_shm_size_packed(uint16_t queue_size)
1606 {
1607 	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed)
1608 				  * queue_size + sizeof(uint64_t) +
1609 				  sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9,
1610 				  INFLIGHT_ALIGNMENT);
1611 }
1612 
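/*
 * Allocate the inflight shared memory area and reply to the frontend with
 * its file descriptor, mmap size and offset.
 */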
1613 static int
1614 vhost_user_get_inflight_fd(struct virtio_net **pdev,
1615 			   struct vhu_msg_context *ctx,
1616 			   int main_fd __rte_unused)
1617 {
1618 	struct rte_vhost_inflight_info_packed *inflight_packed;
1619 	uint64_t pervq_inflight_size, mmap_size;
1620 	uint16_t num_queues, queue_size;
1621 	struct virtio_net *dev = *pdev;
1622 	int fd, i, j;
1623 	int numa_node = SOCKET_ID_ANY;
1624 	void *addr;
1625 
1626 	if (ctx->msg.size != sizeof(ctx->msg.payload.inflight)) {
1627 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1628 			"invalid get_inflight_fd message size is %d",
1629 			ctx->msg.size);
1630 		return RTE_VHOST_MSG_RESULT_ERR;
1631 	}
1632 
1633 	/*
1634 	 * If VQ 0 has already been allocated, try to allocate on the same
1635 	 * NUMA node. It can be reallocated later in numa_realloc().
1636 	 */
1637 	if (dev->nr_vring > 0)
1638 		numa_node = dev->virtqueue[0]->numa_node;
1639 
1640 	if (dev->inflight_info == NULL) {
1641 		dev->inflight_info = rte_zmalloc_socket("inflight_info",
1642 				sizeof(struct inflight_mem_info), 0, numa_node);
1643 		if (!dev->inflight_info) {
1644 			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc dev inflight area");
1645 			return RTE_VHOST_MSG_RESULT_ERR;
1646 		}
1647 		dev->inflight_info->fd = -1;
1648 	}
1649 
1650 	num_queues = ctx->msg.payload.inflight.num_queues;
1651 	queue_size = ctx->msg.payload.inflight.queue_size;
1652 
1653 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1654 		"get_inflight_fd num_queues: %u",
1655 		ctx->msg.payload.inflight.num_queues);
1656 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1657 		"get_inflight_fd queue_size: %u",
1658 		ctx->msg.payload.inflight.queue_size);
1659 
1660 	if (vq_is_packed(dev))
1661 		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
1662 	else
1663 		pervq_inflight_size = get_pervq_shm_size_split(queue_size);
1664 
1665 	mmap_size = num_queues * pervq_inflight_size;
1666 	addr = inflight_mem_alloc(dev, "vhost-inflight", mmap_size, &fd);
1667 	if (!addr) {
1668 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc vhost inflight area");
1669 		ctx->msg.payload.inflight.mmap_size = 0;
1670 		return RTE_VHOST_MSG_RESULT_ERR;
1671 	}
1672 	memset(addr, 0, mmap_size);
1673 
1674 	if (dev->inflight_info->addr) {
1675 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
1676 		dev->inflight_info->addr = NULL;
1677 	}
1678 
1679 	if (dev->inflight_info->fd >= 0) {
1680 		close(dev->inflight_info->fd);
1681 		dev->inflight_info->fd = -1;
1682 	}
1683 
1684 	dev->inflight_info->addr = addr;
1685 	dev->inflight_info->size = ctx->msg.payload.inflight.mmap_size = mmap_size;
1686 	dev->inflight_info->fd = ctx->fds[0] = fd;
1687 	ctx->msg.payload.inflight.mmap_offset = 0;
1688 	ctx->fd_num = 1;
1689 
1690 	if (vq_is_packed(dev)) {
1691 		for (i = 0; i < num_queues; i++) {
1692 			inflight_packed =
1693 				(struct rte_vhost_inflight_info_packed *)addr;
1694 			inflight_packed->used_wrap_counter = 1;
1695 			inflight_packed->old_used_wrap_counter = 1;
1696 			for (j = 0; j < queue_size; j++)
1697 				inflight_packed->desc[j].next = j + 1;
1698 			addr = (void *)((char *)addr + pervq_inflight_size);
1699 		}
1700 	}
1701 
1702 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1703 		"send inflight mmap_size: %"PRIu64,
1704 		ctx->msg.payload.inflight.mmap_size);
1705 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1706 		"send inflight mmap_offset: %"PRIu64,
1707 		ctx->msg.payload.inflight.mmap_offset);
1708 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1709 		"send inflight fd: %d", ctx->fds[0]);
1710 
1711 	return RTE_VHOST_MSG_RESULT_REPLY;
1712 }
1713 
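/*
 * Map the inflight shared memory area provided by the frontend and attach
 * the per-virtqueue inflight regions to each virtqueue.
 */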
1714 static int
1715 vhost_user_set_inflight_fd(struct virtio_net **pdev,
1716 			   struct vhu_msg_context *ctx,
1717 			   int main_fd __rte_unused)
1718 {
1719 	uint64_t mmap_size, mmap_offset;
1720 	uint16_t num_queues, queue_size;
1721 	struct virtio_net *dev = *pdev;
1722 	uint32_t pervq_inflight_size;
1723 	struct vhost_virtqueue *vq;
1724 	void *addr;
1725 	int fd, i;
1726 	int numa_node = SOCKET_ID_ANY;
1727 
1728 	if (validate_msg_fds(dev, ctx, 1) != 0)
1729 		return RTE_VHOST_MSG_RESULT_ERR;
1730 
1731 	fd = ctx->fds[0];
1732 	if (ctx->msg.size != sizeof(ctx->msg.payload.inflight) || fd < 0) {
1733 		VHOST_CONFIG_LOG(dev->ifname, ERR,
1734 			"invalid set_inflight_fd message size is %d, fd is %d",
1735 			ctx->msg.size, fd);
1736 		return RTE_VHOST_MSG_RESULT_ERR;
1737 	}
1738 
1739 	mmap_size = ctx->msg.payload.inflight.mmap_size;
1740 	mmap_offset = ctx->msg.payload.inflight.mmap_offset;
1741 	num_queues = ctx->msg.payload.inflight.num_queues;
1742 	queue_size = ctx->msg.payload.inflight.queue_size;
1743 
1744 	if (vq_is_packed(dev))
1745 		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
1746 	else
1747 		pervq_inflight_size = get_pervq_shm_size_split(queue_size);
1748 
1749 	VHOST_CONFIG_LOG(dev->ifname, INFO, "set_inflight_fd mmap_size: %"PRIu64, mmap_size);
1750 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1751 		"set_inflight_fd mmap_offset: %"PRIu64,
1752 		mmap_offset);
1753 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1754 		"set_inflight_fd num_queues: %u",
1755 		num_queues);
1756 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1757 		"set_inflight_fd queue_size: %u",
1758 		queue_size);
1759 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1760 		"set_inflight_fd fd: %d",
1761 		fd);
1762 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1763 		"set_inflight_fd pervq_inflight_size: %d",
1764 		pervq_inflight_size);
1765 
1766 	/*
1767 	 * If VQ 0 has already been allocated, try to allocate on the same
1768 	 * NUMA node. It can be reallocated later in numa_realloc().
1769 	 */
1770 	if (dev->nr_vring > 0)
1771 		numa_node = dev->virtqueue[0]->numa_node;
1772 
1773 	if (!dev->inflight_info) {
1774 		dev->inflight_info = rte_zmalloc_socket("inflight_info",
1775 				sizeof(struct inflight_mem_info), 0, numa_node);
1776 		if (dev->inflight_info == NULL) {
1777 			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc dev inflight area");
1778 			return RTE_VHOST_MSG_RESULT_ERR;
1779 		}
1780 		dev->inflight_info->fd = -1;
1781 	}
1782 
1783 	if (dev->inflight_info->addr) {
1784 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
1785 		dev->inflight_info->addr = NULL;
1786 	}
1787 
1788 	addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
1789 		    fd, mmap_offset);
1790 	if (addr == MAP_FAILED) {
1791 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to mmap share memory.");
1792 		return RTE_VHOST_MSG_RESULT_ERR;
1793 	}
1794 
1795 	if (dev->inflight_info->fd >= 0) {
1796 		close(dev->inflight_info->fd);
1797 		dev->inflight_info->fd = -1;
1798 	}
1799 
1800 	mem_set_dump(dev, addr, mmap_size, false, get_blk_size(fd));
1801 	dev->inflight_info->fd = fd;
1802 	dev->inflight_info->addr = addr;
1803 	dev->inflight_info->size = mmap_size;
1804 
1805 	for (i = 0; i < num_queues; i++) {
1806 		vq = dev->virtqueue[i];
1807 		if (!vq)
1808 			continue;
1809 
1810 		if (vq_is_packed(dev)) {
1811 			vq->inflight_packed = addr;
1812 			vq->inflight_packed->desc_num = queue_size;
1813 		} else {
1814 			vq->inflight_split = addr;
1815 			vq->inflight_split->desc_num = queue_size;
1816 		}
1817 		addr = (void *)((char *)addr + pervq_inflight_size);
1818 	}
1819 
1820 	return RTE_VHOST_MSG_RESULT_OK;
1821 }
1822 
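/* Set the eventfd used to signal the guest (call fd) for the given virtqueue. */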
1823 static int
1824 vhost_user_set_vring_call(struct virtio_net **pdev,
1825 			struct vhu_msg_context *ctx,
1826 			int main_fd __rte_unused)
1827 {
1828 	struct virtio_net *dev = *pdev;
1829 	struct vhost_vring_file file;
1830 	struct vhost_virtqueue *vq;
1831 	int expected_fds;
1832 
1833 	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
1834 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
1835 		return RTE_VHOST_MSG_RESULT_ERR;
1836 
1837 	file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
1838 	if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)
1839 		file.fd = VIRTIO_INVALID_EVENTFD;
1840 	else
1841 		file.fd = ctx->fds[0];
1842 	VHOST_CONFIG_LOG(dev->ifname, INFO,
1843 		"vring call idx:%d file:%d",
1844 		file.index, file.fd);
1845 
1846 	vq = dev->virtqueue[file.index];
1847 
1848 	if (vq->ready) {
1849 		vq->ready = false;
1850 		vhost_user_notify_queue_state(dev, vq, 0);
1851 	}
1852 
1853 	if (vq->callfd >= 0)
1854 		close(vq->callfd);
1855 
1856 	vq->callfd = file.fd;
1857 
1858 	return RTE_VHOST_MSG_RESULT_OK;
1859 }
1860 
1861 static int vhost_user_set_vring_err(struct virtio_net **pdev,
1862 			struct vhu_msg_context *ctx,
1863 			int main_fd __rte_unused)
1864 {
1865 	struct virtio_net *dev = *pdev;
1866 	int expected_fds;
1867 
1868 	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
1869 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
1870 		return RTE_VHOST_MSG_RESULT_ERR;
1871 
1872 	if (!(ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK))
1873 		close(ctx->fds[0]);
1874 	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "not implemented");
1875 
1876 	return RTE_VHOST_MSG_RESULT_OK;
1877 }
1878 
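/*
 * qsort() comparator ordering resubmit descriptors by decreasing counter
 * value, so that the entry with the highest counter ends up first in the
 * resubmit list.
 */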
1879 static int
1880 resubmit_desc_compare(const void *a, const void *b)
1881 {
1882 	const struct rte_vhost_resubmit_desc *desc0 = a;
1883 	const struct rte_vhost_resubmit_desc *desc1 = b;
1884 
1885 	if (desc1->counter > desc0->counter)
1886 		return 1;
1887 
1888 	return -1;
1889 }
1890 
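/*
 * Called when a split virtqueue is kicked: scan the inflight region shared
 * with the frontend and, if some descriptors are still marked inflight from
 * before the device was stopped, build a resubmit list sorted by submission
 * counter so they can be processed again.
 */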
1891 static int
1892 vhost_check_queue_inflights_split(struct virtio_net *dev,
1893 				  struct vhost_virtqueue *vq)
1894 {
1895 	uint16_t i;
1896 	uint16_t resubmit_num = 0, last_io, num;
1897 	struct vring_used *used = vq->used;
1898 	struct rte_vhost_resubmit_info *resubmit;
1899 	struct rte_vhost_inflight_info_split *inflight_split;
1900 
1901 	if (!(dev->protocol_features &
1902 	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
1903 		return RTE_VHOST_MSG_RESULT_OK;
1904 
1905 	/* The frontend may still not support the inflight feature
1906 	 * even though the protocol feature was negotiated.
1907 	 */
1908 	if (!vq->inflight_split)
1909 		return RTE_VHOST_MSG_RESULT_OK;
1910 
1911 	if (!vq->inflight_split->version) {
1912 		vq->inflight_split->version = INFLIGHT_VERSION;
1913 		return RTE_VHOST_MSG_RESULT_OK;
1914 	}
1915 
1916 	if (vq->resubmit_inflight)
1917 		return RTE_VHOST_MSG_RESULT_OK;
1918 
1919 	inflight_split = vq->inflight_split;
1920 	vq->global_counter = 0;
1921 	last_io = inflight_split->last_inflight_io;
1922 
1923 	if (inflight_split->used_idx != used->idx) {
1924 		inflight_split->desc[last_io].inflight = 0;
1925 		rte_atomic_thread_fence(rte_memory_order_seq_cst);
1926 		inflight_split->used_idx = used->idx;
1927 	}
1928 
1929 	for (i = 0; i < inflight_split->desc_num; i++) {
1930 		if (inflight_split->desc[i].inflight == 1)
1931 			resubmit_num++;
1932 	}
1933 
1934 	vq->last_avail_idx += resubmit_num;
1935 
1936 	if (resubmit_num) {
1937 		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
1938 				0, vq->numa_node);
1939 		if (!resubmit) {
1940 			VHOST_CONFIG_LOG(dev->ifname, ERR,
1941 				"failed to allocate memory for resubmit info.");
1942 			return RTE_VHOST_MSG_RESULT_ERR;
1943 		}
1944 
1945 		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
1946 				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
1947 				0, vq->numa_node);
1948 		if (!resubmit->resubmit_list) {
1949 			VHOST_CONFIG_LOG(dev->ifname, ERR,
1950 					"failed to allocate memory for inflight desc.");
1951 			rte_free(resubmit);
1952 			return RTE_VHOST_MSG_RESULT_ERR;
1953 		}
1954 
1955 		num = 0;
1956 		for (i = 0; i < vq->inflight_split->desc_num; i++) {
1957 			if (vq->inflight_split->desc[i].inflight == 1) {
1958 				resubmit->resubmit_list[num].index = i;
1959 				resubmit->resubmit_list[num].counter =
1960 					inflight_split->desc[i].counter;
1961 				num++;
1962 			}
1963 		}
1964 		resubmit->resubmit_num = num;
1965 
1966 		if (resubmit->resubmit_num > 1)
1967 			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
1968 			      sizeof(struct rte_vhost_resubmit_desc),
1969 			      resubmit_desc_compare);
1970 
1971 		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
1972 		vq->resubmit_inflight = resubmit;
1973 	}
1974 
1975 	return RTE_VHOST_MSG_RESULT_OK;
1976 }
1977 
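/*
 * Packed ring counterpart of vhost_check_queue_inflights_split(): restore a
 * consistent used index/wrap counter/free head state and build the resubmit
 * list for descriptors that are still marked inflight.
 */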
1978 static int
1979 vhost_check_queue_inflights_packed(struct virtio_net *dev,
1980 				   struct vhost_virtqueue *vq)
1981 {
1982 	uint16_t i;
1983 	uint16_t resubmit_num = 0, old_used_idx, num;
1984 	struct rte_vhost_resubmit_info *resubmit;
1985 	struct rte_vhost_inflight_info_packed *inflight_packed;
1986 
1987 	if (!(dev->protocol_features &
1988 	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
1989 		return RTE_VHOST_MSG_RESULT_OK;
1990 
1991 	/* The frontend may still not support the inflight feature
1992 	 * even though the protocol feature was negotiated.
1993 	 */
1994 	if (!vq->inflight_packed)
1995 		return RTE_VHOST_MSG_RESULT_OK;
1996 
1997 	if (!vq->inflight_packed->version) {
1998 		vq->inflight_packed->version = INFLIGHT_VERSION;
1999 		return RTE_VHOST_MSG_RESULT_OK;
2000 	}
2001 
2002 	if (vq->resubmit_inflight)
2003 		return RTE_VHOST_MSG_RESULT_OK;
2004 
2005 	inflight_packed = vq->inflight_packed;
2006 	vq->global_counter = 0;
2007 	old_used_idx = inflight_packed->old_used_idx;
2008 
2009 	if (inflight_packed->used_idx != old_used_idx) {
2010 		if (inflight_packed->desc[old_used_idx].inflight == 0) {
2011 			inflight_packed->old_used_idx =
2012 				inflight_packed->used_idx;
2013 			inflight_packed->old_used_wrap_counter =
2014 				inflight_packed->used_wrap_counter;
2015 			inflight_packed->old_free_head =
2016 				inflight_packed->free_head;
2017 		} else {
2018 			inflight_packed->used_idx =
2019 				inflight_packed->old_used_idx;
2020 			inflight_packed->used_wrap_counter =
2021 				inflight_packed->old_used_wrap_counter;
2022 			inflight_packed->free_head =
2023 				inflight_packed->old_free_head;
2024 		}
2025 	}
2026 
2027 	for (i = 0; i < inflight_packed->desc_num; i++) {
2028 		if (inflight_packed->desc[i].inflight == 1)
2029 			resubmit_num++;
2030 	}
2031 
2032 	if (resubmit_num) {
2033 		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
2034 				0, vq->numa_node);
2035 		if (resubmit == NULL) {
2036 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2037 				"failed to allocate memory for resubmit info.");
2038 			return RTE_VHOST_MSG_RESULT_ERR;
2039 		}
2040 
2041 		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
2042 				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
2043 				0, vq->numa_node);
2044 		if (resubmit->resubmit_list == NULL) {
2045 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2046 				"failed to allocate memory for resubmit desc.");
2047 			rte_free(resubmit);
2048 			return RTE_VHOST_MSG_RESULT_ERR;
2049 		}
2050 
2051 		num = 0;
2052 		for (i = 0; i < inflight_packed->desc_num; i++) {
2053 			if (vq->inflight_packed->desc[i].inflight == 1) {
2054 				resubmit->resubmit_list[num].index = i;
2055 				resubmit->resubmit_list[num].counter =
2056 					inflight_packed->desc[i].counter;
2057 				num++;
2058 			}
2059 		}
2060 		resubmit->resubmit_num = num;
2061 
2062 		if (resubmit->resubmit_num > 1)
2063 			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
2064 			      sizeof(struct rte_vhost_resubmit_desc),
2065 			      resubmit_desc_compare);
2066 
2067 		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
2068 		vq->resubmit_inflight = resubmit;
2069 	}
2070 
2071 	return RTE_VHOST_MSG_RESULT_OK;
2072 }
2073 
2074 static int
2075 vhost_user_set_vring_kick(struct virtio_net **pdev,
2076 			struct vhu_msg_context *ctx,
2077 			int main_fd __rte_unused)
2078 {
2079 	struct virtio_net *dev = *pdev;
2080 	struct vhost_vring_file file;
2081 	struct vhost_virtqueue *vq;
2082 	int expected_fds;
2083 
2084 	expected_fds = (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
2085 	if (validate_msg_fds(dev, ctx, expected_fds) != 0)
2086 		return RTE_VHOST_MSG_RESULT_ERR;
2087 
2088 	file.index = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
2089 	if (ctx->msg.payload.u64 & VHOST_USER_VRING_NOFD_MASK)
2090 		file.fd = VIRTIO_INVALID_EVENTFD;
2091 	else
2092 		file.fd = ctx->fds[0];
2093 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2094 		"vring kick idx:%d file:%d",
2095 		file.index, file.fd);
2096 
2097 	/* Interpret ring addresses only when ring is started. */
2098 	vq = dev->virtqueue[file.index];
2099 	translate_ring_addresses(&dev, &vq);
2100 	*pdev = dev;
2101 
2102 	/*
2103 	 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
2104 	 * the ring starts already enabled. Otherwise, it is enabled via
2105 	 * the SET_VRING_ENABLE message.
2106 	 */
2107 	if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
2108 		vq->enabled = true;
2109 	}
2110 
2111 	if (vq->ready) {
2112 		vq->ready = false;
2113 		vhost_user_notify_queue_state(dev, vq, 0);
2114 	}
2115 
2116 	if (vq->kickfd >= 0)
2117 		close(vq->kickfd);
2118 	vq->kickfd = file.fd;
2119 
2120 	if (vq_is_packed(dev)) {
2121 		if (vhost_check_queue_inflights_packed(dev, vq)) {
2122 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2123 				"failed to get inflights for vq: %d",
2124 				file.index);
2125 			return RTE_VHOST_MSG_RESULT_ERR;
2126 		}
2127 	} else {
2128 		if (vhost_check_queue_inflights_split(dev, vq)) {
2129 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2130 				"failed to get inflights for vq: %d",
2131 				file.index);
2132 			return RTE_VHOST_MSG_RESULT_ERR;
2133 		}
2134 	}
2135 
2136 	return RTE_VHOST_MSG_RESULT_OK;
2137 }
2138 
2139 /*
2140  * When virtio is stopped, qemu sends us the GET_VRING_BASE message.
2141  */
2142 static int
2143 vhost_user_get_vring_base(struct virtio_net **pdev,
2144 			struct vhu_msg_context *ctx,
2145 			int main_fd __rte_unused)
2146 {
2147 	struct virtio_net *dev = *pdev;
2148 	struct vhost_virtqueue *vq = dev->virtqueue[ctx->msg.payload.state.index];
2149 	uint64_t val;
2150 
2151 	/* We have to stop the queue (virtio) if it is running. */
2152 	vhost_destroy_device_notify(dev);
2153 
2154 	dev->flags &= ~VIRTIO_DEV_READY;
2155 	dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
2156 
2157 	/* Here we are safe to get the indexes */
2158 	if (vq_is_packed(dev)) {
2159 		/*
2160 		 * Bit[0:14]: avail index
2161 		 * Bit[15]: avail wrap counter
2162 		 */
2163 		val = vq->last_avail_idx & 0x7fff;
2164 		val |= vq->avail_wrap_counter << 15;
2165 		ctx->msg.payload.state.num = val;
2166 	} else {
2167 		ctx->msg.payload.state.num = vq->last_avail_idx;
2168 	}
2169 
2170 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2171 		"vring base idx:%d file:%d",
2172 		ctx->msg.payload.state.index, ctx->msg.payload.state.num);
2173 	/*
2174 	 * Based on the current qemu vhost-user implementation, this message is
2175 	 * sent if and only if the vring is stopped (vhost_vring_stop).
2176 	 * TODO: clean up the vring, it isn't usable from this point on.
2177 	 */
2178 	if (vq->kickfd >= 0)
2179 		close(vq->kickfd);
2180 
2181 	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
2182 
2183 	if (vq->callfd >= 0)
2184 		close(vq->callfd);
2185 
2186 	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
2187 
2188 	vq->signalled_used_valid = false;
2189 
2190 	if (vq_is_packed(dev)) {
2191 		rte_free(vq->shadow_used_packed);
2192 		vq->shadow_used_packed = NULL;
2193 	} else {
2194 		rte_free(vq->shadow_used_split);
2195 		vq->shadow_used_split = NULL;
2196 	}
2197 
2198 	rte_free(vq->batch_copy_elems);
2199 	vq->batch_copy_elems = NULL;
2200 
2201 	rte_free(vq->log_cache);
2202 	vq->log_cache = NULL;
2203 
2204 	ctx->msg.size = sizeof(ctx->msg.payload.state);
2205 	ctx->fd_num = 0;
2206 
2207 	vhost_user_iotlb_flush_all(dev);
2208 
2209 	rte_rwlock_write_lock(&vq->access_lock);
2210 	vring_invalidate(dev, vq);
2211 	rte_rwlock_write_unlock(&vq->access_lock);
2212 
2213 	return RTE_VHOST_MSG_RESULT_REPLY;
2214 }
2215 
2216 /*
2217  * When the virtio queues are ready to work, qemu sends this message to
2218  * enable the virtio queue pair.
2219  */
2220 static int
2221 vhost_user_set_vring_enable(struct virtio_net **pdev,
2222 			struct vhu_msg_context *ctx,
2223 			int main_fd __rte_unused)
2224 {
2225 	struct virtio_net *dev = *pdev;
2226 	struct vhost_virtqueue *vq;
2227 	bool enable = !!ctx->msg.payload.state.num;
2228 	int index = (int)ctx->msg.payload.state.index;
2229 
2230 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2231 		"set queue enable: %d to qp idx: %d",
2232 		enable, index);
2233 
2234 	vq = dev->virtqueue[index];
2235 	if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
2236 		/* vhost_user_lock_all_queue_pairs locked all qps */
2237 		vq_assert_lock(dev, vq);
2238 		if (enable && vq->async && vq->async->pkts_inflight_n) {
2239 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2240 				"failed to enable vring. Inflight packets must be completed first");
2241 			return RTE_VHOST_MSG_RESULT_ERR;
2242 		}
2243 	}
2244 
2245 	vq->enabled = enable;
2246 
2247 	return RTE_VHOST_MSG_RESULT_OK;
2248 }
2249 
2250 static int
2251 vhost_user_get_protocol_features(struct virtio_net **pdev,
2252 			struct vhu_msg_context *ctx,
2253 			int main_fd __rte_unused)
2254 {
2255 	struct virtio_net *dev = *pdev;
2256 	uint64_t features, protocol_features;
2257 
2258 	rte_vhost_driver_get_features(dev->ifname, &features);
2259 	rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features);
2260 
2261 	ctx->msg.payload.u64 = protocol_features;
2262 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2263 	ctx->fd_num = 0;
2264 
2265 	return RTE_VHOST_MSG_RESULT_REPLY;
2266 }
2267 
2268 static int
2269 vhost_user_set_protocol_features(struct virtio_net **pdev,
2270 			struct vhu_msg_context *ctx,
2271 			int main_fd __rte_unused)
2272 {
2273 	struct virtio_net *dev = *pdev;
2274 	uint64_t protocol_features = ctx->msg.payload.u64;
2275 	uint64_t backend_protocol_features = 0;
2276 
2277 	rte_vhost_driver_get_protocol_features(dev->ifname,
2278 			&backend_protocol_features);
2279 	if (protocol_features & ~backend_protocol_features) {
2280 		VHOST_CONFIG_LOG(dev->ifname, ERR, "received invalid protocol features.");
2281 		return RTE_VHOST_MSG_RESULT_ERR;
2282 	}
2283 
2284 	dev->protocol_features = protocol_features;
2285 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2286 		"negotiated Vhost-user protocol features: 0x%" PRIx64,
2287 		dev->protocol_features);
2288 
2289 	return RTE_VHOST_MSG_RESULT_OK;
2290 }
2291 
2292 static int
2293 vhost_user_set_log_base(struct virtio_net **pdev,
2294 			struct vhu_msg_context *ctx,
2295 			int main_fd __rte_unused)
2296 {
2297 	struct virtio_net *dev = *pdev;
2298 	int fd = ctx->fds[0];
2299 	uint64_t size, off;
2300 	uint64_t alignment;
2301 	void *addr;
2302 	uint32_t i;
2303 
2304 	if (validate_msg_fds(dev, ctx, 1) != 0)
2305 		return RTE_VHOST_MSG_RESULT_ERR;
2306 
2307 	if (fd < 0) {
2308 		VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid log fd: %d", fd);
2309 		return RTE_VHOST_MSG_RESULT_ERR;
2310 	}
2311 
2312 	if (ctx->msg.size != sizeof(VhostUserLog)) {
2313 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2314 			"invalid log base msg size: %"PRId32" != %d",
2315 			ctx->msg.size, (int)sizeof(VhostUserLog));
2316 		goto close_msg_fds;
2317 	}
2318 
2319 	size = ctx->msg.payload.log.mmap_size;
2320 	off  = ctx->msg.payload.log.mmap_offset;
2321 
2322 	/* Check for mmap size and offset overflow (i.e. off + size wrapping past UINT64_MAX). */
2323 	if (off >= -size) {
2324 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2325 			"log offset %#"PRIx64" and log size %#"PRIx64" overflow",
2326 			off, size);
2327 		goto close_msg_fds;
2328 	}
2329 
2330 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2331 		"log mmap size: %"PRIu64", offset: %"PRIu64,
2332 		size, off);
2333 
2334 	/*
2335 	 * mmap from offset 0 to work around a hugepage mmap bug: mmap will
2336 	 * fail when the offset is not page-size aligned.
2337 	 */
2338 	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2339 	alignment = get_blk_size(fd);
2340 	close(fd);
2341 	if (addr == MAP_FAILED) {
2342 		VHOST_CONFIG_LOG(dev->ifname, ERR, "mmap log base failed!");
2343 		return RTE_VHOST_MSG_RESULT_ERR;
2344 	}
2345 
2346 	/*
2347 	 * Free any previously mapped log memory, in case
2348 	 * VHOST_USER_SET_LOG_BASE is received multiple times.
2349 	 */
2350 	if (dev->log_addr) {
2351 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
2352 	}
2353 	dev->log_addr = (uint64_t)(uintptr_t)addr;
2354 	dev->log_base = dev->log_addr + off;
2355 	dev->log_size = size;
2356 	mem_set_dump(dev, addr, size + off, false, alignment);
2357 
2358 	for (i = 0; i < dev->nr_vring; i++) {
2359 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2360 
2361 		rte_free(vq->log_cache);
2362 		vq->log_cache = NULL;
2363 		vq->log_cache_nb_elem = 0;
2364 		vq->log_cache = rte_malloc_socket("vq log cache",
2365 				sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR,
2366 				0, vq->numa_node);
2367 		/*
2368 		 * If the log cache allocation fails, don't fail migration;
2369 		 * logging simply won't be cached, which will impact performance.
2370 		 */
2371 		if (!vq->log_cache)
2372 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2373 				"failed to allocate VQ logging cache");
2374 	}
2375 
2376 	/*
2377 	 * The spec is not clear about it (yet), but QEMU doesn't expect
2378 	 * any payload in the reply.
2379 	 */
2380 	ctx->msg.size = 0;
2381 	ctx->fd_num = 0;
2382 
2383 	return RTE_VHOST_MSG_RESULT_REPLY;
2384 
2385 close_msg_fds:
2386 	close_msg_fds(ctx);
2387 	return RTE_VHOST_MSG_RESULT_ERR;
2388 }
2389 
2390 static int vhost_user_set_log_fd(struct virtio_net **pdev,
2391 			struct vhu_msg_context *ctx,
2392 			int main_fd __rte_unused)
2393 {
2394 	struct virtio_net *dev = *pdev;
2395 
2396 	if (validate_msg_fds(dev, ctx, 1) != 0)
2397 		return RTE_VHOST_MSG_RESULT_ERR;
2398 
2399 	close(ctx->fds[0]);
2400 	VHOST_CONFIG_LOG(dev->ifname, DEBUG, "not implemented.");
2401 
2402 	return RTE_VHOST_MSG_RESULT_OK;
2403 }
2404 
2405 /*
2406  * A RARP packet is constructed and broadcast to notify switches about
2407  * the new location of the migrated VM, so that packets from outside are
2408  * not lost after migration.
2409  *
2410  * However, we don't actually "send" a RARP packet here; instead, we set
2411  * the 'broadcast_rarp' flag to let rte_vhost_dequeue_burst() inject it.
2412  */
2413 static int
2414 vhost_user_send_rarp(struct virtio_net **pdev,
2415 			struct vhu_msg_context *ctx,
2416 			int main_fd __rte_unused)
2417 {
2418 	struct virtio_net *dev = *pdev;
2419 	uint8_t *mac = (uint8_t *)&ctx->msg.payload.u64;
2420 	struct rte_vdpa_device *vdpa_dev;
2421 
2422 	VHOST_CONFIG_LOG(dev->ifname, DEBUG,
2423 		"MAC: " RTE_ETHER_ADDR_PRT_FMT,
2424 		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
2425 	memcpy(dev->mac.addr_bytes, mac, 6);
2426 
2427 	/*
2428 	 * Set the flag to inject a RARP broadcast packet at
2429 	 * rte_vhost_dequeue_burst().
2430 	 *
2431 	 * rte_memory_order_release ordering is for making sure the mac is
2432 	 * copied before the flag is set.
2433 	 */
2434 	rte_atomic_store_explicit(&dev->broadcast_rarp, 1, rte_memory_order_release);
2435 	vdpa_dev = dev->vdpa_dev;
2436 	if (vdpa_dev && vdpa_dev->ops->migration_done)
2437 		vdpa_dev->ops->migration_done(dev->vid);
2438 
2439 	return RTE_VHOST_MSG_RESULT_OK;
2440 }
2441 
2442 static int
2443 vhost_user_net_set_mtu(struct virtio_net **pdev,
2444 			struct vhu_msg_context *ctx,
2445 			int main_fd __rte_unused)
2446 {
2447 	struct virtio_net *dev = *pdev;
2448 
2449 	if (ctx->msg.payload.u64 < VIRTIO_MIN_MTU ||
2450 			ctx->msg.payload.u64 > VIRTIO_MAX_MTU) {
2451 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2452 			"invalid MTU size (%"PRIu64")",
2453 			ctx->msg.payload.u64);
2454 
2455 		return RTE_VHOST_MSG_RESULT_ERR;
2456 	}
2457 
2458 	dev->mtu = ctx->msg.payload.u64;
2459 
2460 	return RTE_VHOST_MSG_RESULT_OK;
2461 }
2462 
2463 static int
2464 vhost_user_set_req_fd(struct virtio_net **pdev,
2465 			struct vhu_msg_context *ctx,
2466 			int main_fd __rte_unused)
2467 {
2468 	struct virtio_net *dev = *pdev;
2469 	int fd = ctx->fds[0];
2470 
2471 	if (validate_msg_fds(dev, ctx, 1) != 0)
2472 		return RTE_VHOST_MSG_RESULT_ERR;
2473 
2474 	if (fd < 0) {
2475 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2476 			"invalid file descriptor for backend channel (%d)", fd);
2477 		return RTE_VHOST_MSG_RESULT_ERR;
2478 	}
2479 
2480 	if (dev->backend_req_fd >= 0)
2481 		close(dev->backend_req_fd);
2482 
2483 	dev->backend_req_fd = fd;
2484 
2485 	return RTE_VHOST_MSG_RESULT_OK;
2486 }
2487 
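/*
 * Return 1 if the IOTLB message range [iova, iova + size) overlaps the split
 * ring's descriptor table, avail ring, used ring or log area, 0 otherwise.
 */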
2488 static int
2489 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
2490 {
2491 	struct vhost_vring_addr *ra;
2492 	uint64_t start, end, len;
2493 
2494 	start = imsg->iova;
2495 	end = start + imsg->size;
2496 
2497 	ra = &vq->ring_addrs;
2498 	len = sizeof(struct vring_desc) * vq->size;
2499 	if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start)
2500 		return 1;
2501 
2502 	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
2503 	if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start)
2504 		return 1;
2505 
2506 	len = sizeof(struct vring_used) +
2507 	       sizeof(struct vring_used_elem) * vq->size;
2508 	if (ra->used_user_addr < end && (ra->used_user_addr + len) > start)
2509 		return 1;
2510 
2511 	if (ra->flags & (1 << VHOST_VRING_F_LOG)) {
2512 		len = sizeof(uint64_t);
2513 		if (ra->log_guest_addr < end &&
2514 		    (ra->log_guest_addr + len) > start)
2515 			return 1;
2516 	}
2517 
2518 	return 0;
2519 }
2520 
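/*
 * Same overlap check for packed rings, where the avail and used areas are
 * single vring_packed_desc_event structures.
 */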
2521 static int
2522 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
2523 {
2524 	struct vhost_vring_addr *ra;
2525 	uint64_t start, end, len;
2526 
2527 	start = imsg->iova;
2528 	end = start + imsg->size;
2529 
2530 	ra = &vq->ring_addrs;
2531 	len = sizeof(struct vring_packed_desc) * vq->size;
2532 	if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start)
2533 		return 1;
2534 
2535 	len = sizeof(struct vring_packed_desc_event);
2536 	if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start)
2537 		return 1;
2538 
2539 	len = sizeof(struct vring_packed_desc_event);
2540 	if (ra->used_user_addr < end && (ra->used_user_addr + len) > start)
2541 		return 1;
2542 
2543 	if (ra->flags & (1 << VHOST_VRING_F_LOG)) {
2544 		len = sizeof(uint64_t);
2545 		if (ra->log_guest_addr < end &&
2546 		    (ra->log_guest_addr + len) > start)
2547 			return 1;
2548 	}
2549 
2550 	return 0;
2551 }
2552 
2553 static int is_vring_iotlb(struct virtio_net *dev,
2554 			  struct vhost_virtqueue *vq,
2555 			  struct vhost_iotlb_msg *imsg)
2556 {
2557 	if (vq_is_packed(dev))
2558 		return is_vring_iotlb_packed(vq, imsg);
2559 	else
2560 		return is_vring_iotlb_split(vq, imsg);
2561 }
2562 
2563 static int
2564 vhost_user_get_config(struct virtio_net **pdev,
2565 			struct vhu_msg_context *ctx,
2566 			int main_fd __rte_unused)
2567 {
2568 	struct virtio_net *dev = *pdev;
2569 	struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
2570 	int ret = 0;
2571 
2572 	if (validate_msg_fds(dev, ctx, 0) != 0)
2573 		return RTE_VHOST_MSG_RESULT_ERR;
2574 
2575 	if (!vdpa_dev) {
2576 		VHOST_CONFIG_LOG(dev->ifname, ERR, "not a vDPA device!");
2577 		return RTE_VHOST_MSG_RESULT_ERR;
2578 	}
2579 
2580 	if (vdpa_dev->ops->get_config) {
2581 		ret = vdpa_dev->ops->get_config(dev->vid,
2582 					   ctx->msg.payload.cfg.region,
2583 					   ctx->msg.payload.cfg.size);
2584 		if (ret != 0) {
2585 			ctx->msg.size = 0;
2586 			VHOST_CONFIG_LOG(dev->ifname, ERR, "get_config() returned an error!");
2587 		}
2588 	} else {
2589 		VHOST_CONFIG_LOG(dev->ifname, ERR, "get_config() not supported!");
2590 	}
2591 
2592 	return RTE_VHOST_MSG_RESULT_REPLY;
2593 }
2594 
2595 static int
2596 vhost_user_set_config(struct virtio_net **pdev,
2597 			struct vhu_msg_context *ctx,
2598 			int main_fd __rte_unused)
2599 {
2600 	struct virtio_net *dev = *pdev;
2601 	struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
2602 	int ret = 0;
2603 
2604 	if (validate_msg_fds(dev, ctx, 0) != 0)
2605 		return RTE_VHOST_MSG_RESULT_ERR;
2606 
2607 	if (ctx->msg.payload.cfg.size > VHOST_USER_MAX_CONFIG_SIZE) {
2608 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2609 			"vhost_user_config size: %"PRIu32", should not be larger than %d",
2610 			ctx->msg.payload.cfg.size, VHOST_USER_MAX_CONFIG_SIZE);
2611 		goto out;
2612 	}
2613 
2614 	if (!vdpa_dev) {
2615 		VHOST_CONFIG_LOG(dev->ifname, ERR, "not a vDPA device!");
2616 		goto out;
2617 	}
2618 
2619 	if (vdpa_dev->ops->set_config) {
2620 		ret = vdpa_dev->ops->set_config(dev->vid,
2621 			ctx->msg.payload.cfg.region,
2622 			ctx->msg.payload.cfg.offset,
2623 			ctx->msg.payload.cfg.size,
2624 			ctx->msg.payload.cfg.flags);
2625 		if (ret)
2626 			VHOST_CONFIG_LOG(dev->ifname, ERR, "set_config() returned an error!");
2627 	} else {
2628 		VHOST_CONFIG_LOG(dev->ifname, ERR, "set_config() not supported!");
2629 	}
2630 
2631 	return RTE_VHOST_MSG_RESULT_OK;
2632 
2633 out:
2634 	return RTE_VHOST_MSG_RESULT_ERR;
2635 }
2636 
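/*
 * Handle an IOTLB update or invalidation from the frontend: update the IOTLB
 * cache accordingly and, for every virtqueue whose ring addresses fall inside
 * the affected IOVA range, re-translate or invalidate its ring mappings.
 */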
2637 static int
2638 vhost_user_iotlb_msg(struct virtio_net **pdev,
2639 			struct vhu_msg_context *ctx,
2640 			int main_fd __rte_unused)
2641 {
2642 	struct virtio_net *dev = *pdev;
2643 	struct vhost_iotlb_msg *imsg = &ctx->msg.payload.iotlb;
2644 	uint16_t i;
2645 	uint64_t vva, len, pg_sz;
2646 
2647 	switch (imsg->type) {
2648 	case VHOST_IOTLB_UPDATE:
2649 		len = imsg->size;
2650 		vva = qva_to_vva(dev, imsg->uaddr, &len);
2651 		if (!vva)
2652 			return RTE_VHOST_MSG_RESULT_ERR;
2653 
2654 		pg_sz = hua_to_alignment(dev->mem, (void *)(uintptr_t)vva);
2655 
2656 		vhost_user_iotlb_cache_insert(dev, imsg->iova, vva, 0, len, pg_sz, imsg->perm);
2657 
2658 		for (i = 0; i < dev->nr_vring; i++) {
2659 			struct vhost_virtqueue *vq = dev->virtqueue[i];
2660 
2661 			if (!vq)
2662 				continue;
2663 
2664 			if (is_vring_iotlb(dev, vq, imsg)) {
2665 				rte_rwlock_write_lock(&vq->access_lock);
2666 				translate_ring_addresses(&dev, &vq);
2667 				*pdev = dev;
2668 				rte_rwlock_write_unlock(&vq->access_lock);
2669 			}
2670 		}
2671 		break;
2672 	case VHOST_IOTLB_INVALIDATE:
2673 		vhost_user_iotlb_cache_remove(dev, imsg->iova, imsg->size);
2674 
2675 		for (i = 0; i < dev->nr_vring; i++) {
2676 			struct vhost_virtqueue *vq = dev->virtqueue[i];
2677 
2678 			if (!vq)
2679 				continue;
2680 
2681 			if (is_vring_iotlb(dev, vq, imsg)) {
2682 				rte_rwlock_write_lock(&vq->access_lock);
2683 				vring_invalidate(dev, vq);
2684 				rte_rwlock_write_unlock(&vq->access_lock);
2685 			}
2686 		}
2687 		break;
2688 	default:
2689 		VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid IOTLB message type (%d)",
2690 			imsg->type);
2691 		return RTE_VHOST_MSG_RESULT_ERR;
2692 	}
2693 
2694 	return RTE_VHOST_MSG_RESULT_OK;
2695 }
2696 
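/*
 * POSTCOPY_ADVISE: open a userfaultfd and return it to the frontend in the
 * reply so that guest memory page faults can be serviced during postcopy
 * live migration. Fails when userfaultfd is unavailable or postcopy support
 * is not compiled in.
 */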
2697 static int
2698 vhost_user_set_postcopy_advise(struct virtio_net **pdev,
2699 			struct vhu_msg_context *ctx,
2700 			int main_fd __rte_unused)
2701 {
2702 	struct virtio_net *dev = *pdev;
2703 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
2704 	struct uffdio_api api_struct;
2705 
2706 	dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
2707 
2708 	if (dev->postcopy_ufd == -1) {
2709 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2710 			"userfaultfd not available: %s",
2711 			strerror(errno));
2712 		return RTE_VHOST_MSG_RESULT_ERR;
2713 	}
2714 	api_struct.api = UFFD_API;
2715 	api_struct.features = 0;
2716 	if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
2717 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2718 			"UFFDIO_API ioctl failure: %s",
2719 			strerror(errno));
2720 		close(dev->postcopy_ufd);
2721 		dev->postcopy_ufd = -1;
2722 		return RTE_VHOST_MSG_RESULT_ERR;
2723 	}
2724 	ctx->fds[0] = dev->postcopy_ufd;
2725 	ctx->fd_num = 1;
2726 
2727 	return RTE_VHOST_MSG_RESULT_REPLY;
2728 #else
2729 	dev->postcopy_ufd = -1;
2730 	ctx->fd_num = 0;
2731 
2732 	return RTE_VHOST_MSG_RESULT_ERR;
2733 #endif
2734 }
2735 
2736 static int
2737 vhost_user_set_postcopy_listen(struct virtio_net **pdev,
2738 			struct vhu_msg_context *ctx __rte_unused,
2739 			int main_fd __rte_unused)
2740 {
2741 	struct virtio_net *dev = *pdev;
2742 
2743 	if (dev->mem && dev->mem->nregions) {
2744 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2745 			"regions already registered at postcopy-listen");
2746 		return RTE_VHOST_MSG_RESULT_ERR;
2747 	}
2748 	dev->postcopy_listening = 1;
2749 
2750 	return RTE_VHOST_MSG_RESULT_OK;
2751 }
2752 
2753 static int
2754 vhost_user_postcopy_end(struct virtio_net **pdev,
2755 			struct vhu_msg_context *ctx,
2756 			int main_fd __rte_unused)
2757 {
2758 	struct virtio_net *dev = *pdev;
2759 
2760 	dev->postcopy_listening = 0;
2761 	if (dev->postcopy_ufd >= 0) {
2762 		close(dev->postcopy_ufd);
2763 		dev->postcopy_ufd = -1;
2764 	}
2765 
2766 	ctx->msg.payload.u64 = 0;
2767 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2768 	ctx->fd_num = 0;
2769 
2770 	return RTE_VHOST_MSG_RESULT_REPLY;
2771 }
2772 
2773 static int
2774 vhost_user_get_status(struct virtio_net **pdev,
2775 		      struct vhu_msg_context *ctx,
2776 		      int main_fd __rte_unused)
2777 {
2778 	struct virtio_net *dev = *pdev;
2779 
2780 	ctx->msg.payload.u64 = dev->status;
2781 	ctx->msg.size = sizeof(ctx->msg.payload.u64);
2782 	ctx->fd_num = 0;
2783 
2784 	return RTE_VHOST_MSG_RESULT_REPLY;
2785 }
2786 
2787 static int
2788 vhost_user_set_status(struct virtio_net **pdev,
2789 			struct vhu_msg_context *ctx,
2790 			int main_fd __rte_unused)
2791 {
2792 	struct virtio_net *dev = *pdev;
2793 
2794 	/* As per the Virtio specification, the device status field is 8 bits wide */
2795 	if (ctx->msg.payload.u64 > UINT8_MAX) {
2796 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2797 			"invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64,
2798 			ctx->msg.payload.u64);
2799 		return RTE_VHOST_MSG_RESULT_ERR;
2800 	}
2801 
2802 	dev->status = ctx->msg.payload.u64;
2803 
2804 	if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) &&
2805 	    (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) {
2806 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2807 			"FEATURES_OK bit is set but feature negotiation failed");
2808 		/*
2809 		 * Clear the bit to let the driver know about the feature
2810 		 * negotiation failure
2811 		 */
2812 		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;
2813 	}
2814 
2815 	VHOST_CONFIG_LOG(dev->ifname, INFO, "new device status(0x%08x):", dev->status);
2816 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2817 		"\t-RESET: %u",
2818 		(dev->status == VIRTIO_DEVICE_STATUS_RESET));
2819 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2820 		"\t-ACKNOWLEDGE: %u",
2821 		!!(dev->status & VIRTIO_DEVICE_STATUS_ACK));
2822 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2823 		"\t-DRIVER: %u",
2824 		!!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER));
2825 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2826 		"\t-FEATURES_OK: %u",
2827 		!!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK));
2828 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2829 		"\t-DRIVER_OK: %u",
2830 		!!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK));
2831 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2832 		"\t-DEVICE_NEED_RESET: %u",
2833 		!!(dev->status & VIRTIO_DEVICE_STATUS_DEV_NEED_RESET));
2834 	VHOST_CONFIG_LOG(dev->ifname, INFO,
2835 		"\t-FAILED: %u",
2836 		!!(dev->status & VIRTIO_DEVICE_STATUS_FAILED));
2837 
2838 	return RTE_VHOST_MSG_RESULT_OK;
2839 }
2840 
2841 #define VHOST_MESSAGE_HANDLERS \
2842 VHOST_MESSAGE_HANDLER(VHOST_USER_NONE, NULL, false) \
2843 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_FEATURES, vhost_user_get_features, false) \
2844 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_FEATURES, vhost_user_set_features, false) \
2845 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_OWNER, vhost_user_set_owner, false) \
2846 VHOST_MESSAGE_HANDLER(VHOST_USER_RESET_OWNER, vhost_user_reset_owner, false) \
2847 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_MEM_TABLE, vhost_user_set_mem_table, true) \
2848 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_BASE, vhost_user_set_log_base, true) \
2849 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_LOG_FD, vhost_user_set_log_fd, true) \
2850 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_NUM, vhost_user_set_vring_num, false) \
2851 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_ADDR, vhost_user_set_vring_addr, false) \
2852 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_BASE, vhost_user_set_vring_base, false) \
2853 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_VRING_BASE, vhost_user_get_vring_base, false) \
2854 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_KICK, vhost_user_set_vring_kick, true) \
2855 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_CALL, vhost_user_set_vring_call, true) \
2856 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_ERR, vhost_user_set_vring_err, true) \
2857 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_PROTOCOL_FEATURES, vhost_user_get_protocol_features, false) \
2858 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_PROTOCOL_FEATURES, vhost_user_set_protocol_features, false) \
2859 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_QUEUE_NUM, vhost_user_get_queue_num, false) \
2860 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_VRING_ENABLE, vhost_user_set_vring_enable, false) \
2861 VHOST_MESSAGE_HANDLER(VHOST_USER_SEND_RARP, vhost_user_send_rarp, false) \
2862 VHOST_MESSAGE_HANDLER(VHOST_USER_NET_SET_MTU, vhost_user_net_set_mtu, false) \
2863 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_BACKEND_REQ_FD, vhost_user_set_req_fd, true) \
2864 VHOST_MESSAGE_HANDLER(VHOST_USER_IOTLB_MSG, vhost_user_iotlb_msg, false) \
2865 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_CONFIG, vhost_user_get_config, false) \
2866 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_CONFIG, vhost_user_set_config, false) \
2867 VHOST_MESSAGE_HANDLER(VHOST_USER_POSTCOPY_ADVISE, vhost_user_set_postcopy_advise, false) \
2868 VHOST_MESSAGE_HANDLER(VHOST_USER_POSTCOPY_LISTEN, vhost_user_set_postcopy_listen, false) \
2869 VHOST_MESSAGE_HANDLER(VHOST_USER_POSTCOPY_END, vhost_user_postcopy_end, false) \
2870 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_INFLIGHT_FD, vhost_user_get_inflight_fd, false) \
2871 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_INFLIGHT_FD, vhost_user_set_inflight_fd, true) \
2872 VHOST_MESSAGE_HANDLER(VHOST_USER_SET_STATUS, vhost_user_set_status, false) \
2873 VHOST_MESSAGE_HANDLER(VHOST_USER_GET_STATUS, vhost_user_get_status, false)
2874 
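/*
 * Expand the X-macro list above into the frontend request handler table,
 * indexed by request ID. For example,
 *   VHOST_MESSAGE_HANDLER(VHOST_USER_SET_OWNER, vhost_user_set_owner, false)
 * expands to
 *   [VHOST_USER_SET_OWNER] = { "VHOST_USER_SET_OWNER", vhost_user_set_owner, false },
 * i.e. a description string, the callback, and whether the request may
 * legitimately carry file descriptors.
 */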
2875 #define VHOST_MESSAGE_HANDLER(id, handler, accepts_fd) \
2876 	[id] = { #id, handler, accepts_fd },
2877 static vhost_message_handler_t vhost_message_handlers[] = {
2878 	VHOST_MESSAGE_HANDLERS
2879 };
2880 #undef VHOST_MESSAGE_HANDLER
2881 
2882 /* Return the number of bytes read on success, 0 if the peer closed the connection, or a negative value on failure. */
2883 static int
2884 read_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx)
2885 {
2886 	int ret;
2887 
2888 	ret = read_fd_message(dev->ifname, sockfd, (char *)&ctx->msg, VHOST_USER_HDR_SIZE,
2889 		ctx->fds, VHOST_MEMORY_MAX_NREGIONS, &ctx->fd_num);
2890 	if (ret <= 0)
2891 		goto out;
2892 
2893 	if (ret != VHOST_USER_HDR_SIZE) {
2894 		VHOST_CONFIG_LOG(dev->ifname, ERR, "Unexpected header size read");
2895 		ret = -1;
2896 		goto out;
2897 	}
2898 
2899 	if (ctx->msg.size) {
2900 		if (ctx->msg.size > sizeof(ctx->msg.payload)) {
2901 			VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid msg size: %d",
2902 				ctx->msg.size);
2903 			ret = -1;
2904 			goto out;
2905 		}
2906 		ret = read(sockfd, &ctx->msg.payload, ctx->msg.size);
2907 		if (ret <= 0)
2908 			goto out;
2909 		if (ret != (int)ctx->msg.size) {
2910 			VHOST_CONFIG_LOG(dev->ifname, ERR, "read control message failed");
2911 			ret = -1;
2912 			goto out;
2913 		}
2914 	}
2915 
2916 out:
2917 	if (ret <= 0)
2918 		close_msg_fds(ctx);
2919 
2920 	return ret;
2921 }
2922 
2923 static int
2924 send_vhost_message(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx)
2925 {
2926 	if (!ctx)
2927 		return 0;
2928 
2929 	return send_fd_message(dev->ifname, sockfd, (char *)&ctx->msg,
2930 		VHOST_USER_HDR_SIZE + ctx->msg.size, ctx->fds, ctx->fd_num);
2931 }
2932 
2933 static int
2934 send_vhost_reply(struct virtio_net *dev, int sockfd, struct vhu_msg_context *ctx)
2935 {
2936 	if (!ctx)
2937 		return 0;
2938 
2939 	ctx->msg.flags &= ~VHOST_USER_VERSION_MASK;
2940 	ctx->msg.flags &= ~VHOST_USER_NEED_REPLY;
2941 	ctx->msg.flags |= VHOST_USER_VERSION;
2942 	ctx->msg.flags |= VHOST_USER_REPLY_MASK;
2943 
2944 	return send_vhost_message(dev, sockfd, ctx);
2945 }
2946 
2947 static int
2948 send_vhost_backend_message(struct virtio_net *dev, struct vhu_msg_context *ctx)
2949 {
2950 	return send_vhost_message(dev, dev->backend_req_fd, ctx);
2951 }
2952 
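/*
 * Send a message on the backend channel and synchronously wait for the reply,
 * serialized by backend_req_lock. Returns 0 when the reply's u64 payload
 * reports success, a negative value otherwise.
 */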
2953 static int
2954 send_vhost_backend_message_process_reply(struct virtio_net *dev, struct vhu_msg_context *ctx)
2955 {
2956 	struct vhu_msg_context msg_reply;
2957 	int ret;
2958 
2959 	rte_spinlock_lock(&dev->backend_req_lock);
2960 	ret = send_vhost_backend_message(dev, ctx);
2961 	if (ret < 0) {
2962 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to send config change (%d)", ret);
2963 		goto out;
2964 	}
2965 
2966 	ret = read_vhost_message(dev, dev->backend_req_fd, &msg_reply);
2967 	if (ret <= 0) {
2968 		if (ret < 0)
2969 			VHOST_CONFIG_LOG(dev->ifname, ERR,
2970 				"vhost read backend message reply failed");
2971 		else
2972 			VHOST_CONFIG_LOG(dev->ifname, INFO, "vhost peer closed");
2973 		ret = -1;
2974 		goto out;
2975 	}
2976 
2977 	if (msg_reply.msg.request.backend != ctx->msg.request.backend) {
2978 		VHOST_CONFIG_LOG(dev->ifname, ERR,
2979 			"received unexpected msg type (%u), expected %u",
2980 			msg_reply.msg.request.backend, ctx->msg.request.backend);
2981 		ret = -1;
2982 		goto out;
2983 	}
2984 
2985 	ret = msg_reply.msg.payload.u64 ? -1 : 0;
2986 out:
2987 	rte_spinlock_unlock(&dev->backend_req_lock);
2988 	return ret;
2989 }
2990 
2991 /*
2992  * Allocate a queue pair if it hasn't been allocated yet
2993  */
2994 static int
2995 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
2996 			struct vhu_msg_context *ctx)
2997 {
2998 	uint32_t vring_idx;
2999 
3000 	switch (ctx->msg.request.frontend) {
3001 	case VHOST_USER_SET_VRING_KICK:
3002 	case VHOST_USER_SET_VRING_CALL:
3003 	case VHOST_USER_SET_VRING_ERR:
3004 		vring_idx = ctx->msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
3005 		break;
3006 	case VHOST_USER_SET_VRING_NUM:
3007 	case VHOST_USER_SET_VRING_BASE:
3008 	case VHOST_USER_GET_VRING_BASE:
3009 	case VHOST_USER_SET_VRING_ENABLE:
3010 		vring_idx = ctx->msg.payload.state.index;
3011 		break;
3012 	case VHOST_USER_SET_VRING_ADDR:
3013 		vring_idx = ctx->msg.payload.addr.index;
3014 		break;
3015 	case VHOST_USER_SET_INFLIGHT_FD:
3016 		vring_idx = ctx->msg.payload.inflight.num_queues - 1;
3017 		break;
3018 	default:
3019 		return 0;
3020 	}
3021 
3022 	if (vring_idx >= VHOST_MAX_VRING) {
3023 		VHOST_CONFIG_LOG(dev->ifname, ERR, "invalid vring index: %u", vring_idx);
3024 		return -1;
3025 	}
3026 
3027 	if (dev->virtqueue[vring_idx])
3028 		return 0;
3029 
3030 	return alloc_vring_queue(dev, vring_idx);
3031 }
3032 
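/*
 * Write-lock every allocated virtqueue. The virtqueue[] array may be sparse,
 * hence the separate counters for slots visited and vrings locked.
 */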
3033 static void
3034 vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
3035 	__rte_no_thread_safety_analysis
3036 {
3037 	unsigned int i = 0;
3038 	unsigned int vq_num = 0;
3039 
3040 	while (vq_num < dev->nr_vring) {
3041 		struct vhost_virtqueue *vq = dev->virtqueue[i];
3042 
3043 		if (vq) {
3044 			rte_rwlock_write_lock(&vq->access_lock);
3045 			vq_num++;
3046 		}
3047 		i++;
3048 	}
3049 }
3050 
3051 static void
3052 vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
3053 	__rte_no_thread_safety_analysis
3054 {
3055 	unsigned int i = 0;
3056 	unsigned int vq_num = 0;
3057 
3058 	while (vq_num < dev->nr_vring) {
3059 		struct vhost_virtqueue *vq = dev->virtqueue[i];
3060 
3061 		if (vq) {
3062 			rte_rwlock_write_unlock(&vq->access_lock);
3063 			vq_num++;
3064 		}
3065 		i++;
3066 	}
3067 }
3068 
3069 int
3070 vhost_user_msg_handler(int vid, int fd)
3071 {
3072 	struct virtio_net *dev;
3073 	struct vhu_msg_context ctx;
3074 	vhost_message_handler_t *msg_handler;
3075 	struct rte_vdpa_device *vdpa_dev;
3076 	int msg_result = RTE_VHOST_MSG_RESULT_OK;
3077 	int ret;
3078 	int unlock_required = 0;
3079 	bool handled;
3080 	uint32_t request;
3081 	uint32_t i;
3082 	uint16_t blk_call_fd;
3083 
3084 	dev = get_device(vid);
3085 	if (dev == NULL)
3086 		return -1;
3087 
3088 	if (!dev->notify_ops) {
3089 		dev->notify_ops = vhost_driver_callback_get(dev->ifname);
3090 		if (!dev->notify_ops) {
3091 			VHOST_CONFIG_LOG(dev->ifname, ERR,
3092 				"failed to get callback ops for driver");
3093 			return -1;
3094 		}
3095 	}
3096 
3097 	ctx.msg.request.frontend = VHOST_USER_NONE;
3098 	ret = read_vhost_message(dev, fd, &ctx);
3099 	if (ret == 0) {
3100 		VHOST_CONFIG_LOG(dev->ifname, INFO, "vhost peer closed");
3101 		return -1;
3102 	}
3103 
3104 	request = ctx.msg.request.frontend;
3105 	if (request > VHOST_USER_NONE && request < RTE_DIM(vhost_message_handlers))
3106 		msg_handler = &vhost_message_handlers[request];
3107 	else
3108 		msg_handler = NULL;
3109 
3110 	if (ret < 0) {
3111 		VHOST_CONFIG_LOG(dev->ifname, ERR, "vhost read message %s%s%sfailed",
3112 				msg_handler != NULL ? "for " : "",
3113 				msg_handler != NULL ? msg_handler->description : "",
3114 				msg_handler != NULL ? " " : "");
3115 		return -1;
3116 	}
3117 
3118 	if (msg_handler != NULL && msg_handler->description != NULL) {
3119 		if (request != VHOST_USER_IOTLB_MSG)
3120 			VHOST_CONFIG_LOG(dev->ifname, INFO,
3121 				"read message %s",
3122 				msg_handler->description);
3123 		else
3124 			VHOST_CONFIG_LOG(dev->ifname, DEBUG,
3125 				"read message %s",
3126 				msg_handler->description);
3127 	} else {
3128 		VHOST_CONFIG_LOG(dev->ifname, DEBUG, "external request %d", request);
3129 	}
3130 
3131 	ret = vhost_user_check_and_alloc_queue_pair(dev, &ctx);
3132 	if (ret < 0) {
3133 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to alloc queue");
3134 		return -1;
3135 	}
3136 
3137 	/*
3138 	 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
3139 	 * and VHOST_USER_RESET_OWNER, since these are sent when virtio stops
3140 	 * and the device is destroyed. destroy_device() waits for the queues
3141 	 * to become inactive, so it is safe. Otherwise, taking the access_lock
3142 	 * would cause a deadlock.
3143 	 */
3144 	switch (request) {
3145 	case VHOST_USER_SET_FEATURES:
3146 	case VHOST_USER_SET_PROTOCOL_FEATURES:
3147 	case VHOST_USER_SET_OWNER:
3148 	case VHOST_USER_SET_MEM_TABLE:
3149 	case VHOST_USER_SET_LOG_BASE:
3150 	case VHOST_USER_SET_LOG_FD:
3151 	case VHOST_USER_SET_VRING_NUM:
3152 	case VHOST_USER_SET_VRING_ADDR:
3153 	case VHOST_USER_SET_VRING_BASE:
3154 	case VHOST_USER_SET_VRING_KICK:
3155 	case VHOST_USER_SET_VRING_CALL:
3156 	case VHOST_USER_SET_VRING_ERR:
3157 	case VHOST_USER_SET_VRING_ENABLE:
3158 	case VHOST_USER_SEND_RARP:
3159 	case VHOST_USER_NET_SET_MTU:
3160 	case VHOST_USER_SET_BACKEND_REQ_FD:
3161 		if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
3162 			vhost_user_lock_all_queue_pairs(dev);
3163 			unlock_required = 1;
3164 		}
3165 		break;
3166 	default:
3167 		break;
3168 
3169 	}
3170 
3171 	handled = false;
3172 	if (dev->extern_ops.pre_msg_handle) {
3173 		RTE_BUILD_BUG_ON(offsetof(struct vhu_msg_context, msg) != 0);
3174 		msg_result = (*dev->extern_ops.pre_msg_handle)(dev->vid, &ctx);
3175 		switch (msg_result) {
3176 		case RTE_VHOST_MSG_RESULT_REPLY:
3177 			send_vhost_reply(dev, fd, &ctx);
3178 			/* Fall-through */
3179 		case RTE_VHOST_MSG_RESULT_ERR:
3180 		case RTE_VHOST_MSG_RESULT_OK:
3181 			handled = true;
3182 			goto skip_to_post_handle;
3183 		case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
3184 		default:
3185 			break;
3186 		}
3187 	}
3188 
3189 	if (msg_handler == NULL || msg_handler->callback == NULL)
3190 		goto skip_to_post_handle;
3191 
3192 	if (!msg_handler->accepts_fd && validate_msg_fds(dev, &ctx, 0) != 0) {
3193 		msg_result = RTE_VHOST_MSG_RESULT_ERR;
3194 	} else {
3195 		msg_result = msg_handler->callback(&dev, &ctx, fd);
3196 	}
3197 
3198 	switch (msg_result) {
3199 	case RTE_VHOST_MSG_RESULT_ERR:
3200 		VHOST_CONFIG_LOG(dev->ifname, ERR,
3201 			"processing %s failed.",
3202 			msg_handler->description);
3203 		handled = true;
3204 		break;
3205 	case RTE_VHOST_MSG_RESULT_OK:
3206 		VHOST_CONFIG_LOG(dev->ifname, DEBUG,
3207 			"processing %s succeeded.",
3208 			msg_handler->description);
3209 		handled = true;
3210 		break;
3211 	case RTE_VHOST_MSG_RESULT_REPLY:
3212 		VHOST_CONFIG_LOG(dev->ifname, DEBUG,
3213 			"processing %s succeeded and needs reply.",
3214 			msg_handler->description);
3215 		send_vhost_reply(dev, fd, &ctx);
3216 		handled = true;
3217 		break;
3218 	default:
3219 		break;
3220 	}
3221 
3222 skip_to_post_handle:
3223 	if (msg_result != RTE_VHOST_MSG_RESULT_ERR &&
3224 			dev->extern_ops.post_msg_handle) {
3225 		RTE_BUILD_BUG_ON(offsetof(struct vhu_msg_context, msg) != 0);
3226 		msg_result = (*dev->extern_ops.post_msg_handle)(dev->vid, &ctx);
3227 		switch (msg_result) {
3228 		case RTE_VHOST_MSG_RESULT_REPLY:
3229 			send_vhost_reply(dev, fd, &ctx);
3230 			/* Fall-through */
3231 		case RTE_VHOST_MSG_RESULT_ERR:
3232 		case RTE_VHOST_MSG_RESULT_OK:
3233 			handled = true;
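			/* Fall-through */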
3234 		case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
3235 		default:
3236 			break;
3237 		}
3238 	}
3239 
3240 	/* If message was not handled at this stage, treat it as an error */
3241 	if (!handled) {
3242 		VHOST_CONFIG_LOG(dev->ifname, ERR,
3243 			"vhost message (req: %d) was not handled.",
3244 			request);
3245 		close_msg_fds(&ctx);
3246 		msg_result = RTE_VHOST_MSG_RESULT_ERR;
3247 	}
3248 
3249 	/*
3250 	 * If the request required a reply that was already sent,
3251 	 * this optional reply-ack won't be sent as the
3252 	 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply().
3253 	 */
3254 	if (ctx.msg.flags & VHOST_USER_NEED_REPLY) {
3255 		ctx.msg.payload.u64 = msg_result == RTE_VHOST_MSG_RESULT_ERR;
3256 		ctx.msg.size = sizeof(ctx.msg.payload.u64);
3257 		ctx.fd_num = 0;
3258 		send_vhost_reply(dev, fd, &ctx);
3259 	} else if (msg_result == RTE_VHOST_MSG_RESULT_ERR) {
3260 		VHOST_CONFIG_LOG(dev->ifname, ERR, "vhost message handling failed.");
3261 		ret = -1;
3262 		goto unlock;
3263 	}
3264 
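	/*
	 * Re-evaluate the ready state of every virtqueue now that the message
	 * has been processed and notify the application about any change.
	 */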
3265 	for (i = 0; i < dev->nr_vring; i++) {
3266 		struct vhost_virtqueue *vq = dev->virtqueue[i];
3267 		bool cur_ready = vq_is_ready(dev, vq);
3268 
3269 		if (cur_ready != (vq && vq->ready)) {
3270 			vq->ready = cur_ready;
3271 			vhost_user_notify_queue_state(dev, vq, cur_ready);
3272 		}
3273 	}
3274 
3275 unlock:
3276 	if (unlock_required)
3277 		vhost_user_unlock_all_queue_pairs(dev);
3278 
3279 	if (ret != 0 || !virtio_is_ready(dev))
3280 		goto out;
3281 
3282 	/*
3283 	 * Virtio is now ready. If not done already, it is time
3284 	 * to notify the application it can process the rings and
3285 	 * configure the vDPA device if present.
3286 	 */
3287 
3288 	if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
3289 		if (dev->notify_ops->new_device(dev->vid) == 0)
3290 			dev->flags |= VIRTIO_DEV_RUNNING;
3291 	}
3292 
3293 	vdpa_dev = dev->vdpa_dev;
3294 	if (!vdpa_dev)
3295 		goto out;
3296 
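	/*
	 * For block vDPA devices, only proceed to configure the device once the
	 * call FD of the last vring has been set; any other request skips the
	 * vDPA configuration below.
	 */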
3297 	if (vdpa_dev->type == RTE_VHOST_VDPA_DEVICE_TYPE_BLK) {
3298 		if (request == VHOST_USER_SET_VRING_CALL) {
3299 			blk_call_fd = ctx.msg.payload.u64 & VHOST_USER_VRING_IDX_MASK;
3300 			if (blk_call_fd != dev->nr_vring - 1)
3301 				goto out;
3302 		} else {
3303 			goto out;
3304 		}
3305 	}
3306 
3307 	if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
3308 		if (vdpa_dev->ops->dev_conf(dev->vid))
3309 			VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to configure vDPA device");
3310 		else
3311 			dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED;
3312 	}
3313 
3314 out:
3315 	return ret;
3316 }
3317 
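/*
 * Report an IOTLB miss to the frontend by sending a VHOST_IOTLB_MISS message
 * for the faulting IOVA and access permission over the backend channel.
 */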
3318 static int
3319 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
3320 {
3321 	int ret;
3322 	struct vhu_msg_context ctx = {
3323 		.msg = {
3324 			.request.backend = VHOST_USER_BACKEND_IOTLB_MSG,
3325 			.flags = VHOST_USER_VERSION,
3326 			.size = sizeof(ctx.msg.payload.iotlb),
3327 			.payload.iotlb = {
3328 				.iova = iova,
3329 				.perm = perm,
3330 				.type = VHOST_IOTLB_MISS,
3331 			},
3332 		},
3333 	};
3334 
3335 	ret = send_vhost_message(dev, dev->backend_req_fd, &ctx);
3336 	if (ret < 0) {
3337 		VHOST_CONFIG_LOG(dev->ifname, ERR,
3338 			"failed to send IOTLB miss message (%d)",
3339 			ret);
3340 		return ret;
3341 	}
3342 
3343 	return 0;
3344 }
3345 
3346 int
3347 rte_vhost_backend_config_change(int vid, bool need_reply)
3348 {
3349 	struct vhu_msg_context ctx = {
3350 		.msg = {
3351 			.request.backend = VHOST_USER_BACKEND_CONFIG_CHANGE_MSG,
3352 			.flags = VHOST_USER_VERSION,
3353 			.size = 0,
3354 		}
3355 	};
3356 	struct virtio_net *dev;
3357 	int ret;
3358 
3359 	dev = get_device(vid);
3360 	if (!dev)
3361 		return -ENODEV;
3362 
3363 	if (!need_reply) {
3364 		ret = send_vhost_backend_message(dev, &ctx);
3365 	} else {
3366 		ctx.msg.flags |= VHOST_USER_NEED_REPLY;
3367 		ret = send_vhost_backend_message_process_reply(dev, &ctx);
3368 	}
3369 
3370 	if (ret < 0)
3371 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to send config change (%d)", ret);
3372 	return ret;
3373 }
3374 
3375 static int vhost_user_backend_set_vring_host_notifier(struct virtio_net *dev,
3376 						    int index, int fd,
3377 						    uint64_t offset,
3378 						    uint64_t size)
3379 {
3380 	int ret;
3381 	struct vhu_msg_context ctx = {
3382 		.msg = {
3383 			.request.backend = VHOST_USER_BACKEND_VRING_HOST_NOTIFIER_MSG,
3384 			.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
3385 			.size = sizeof(ctx.msg.payload.area),
3386 			.payload.area = {
3387 				.u64 = index & VHOST_USER_VRING_IDX_MASK,
3388 				.size = size,
3389 				.offset = offset,
3390 			},
3391 		},
3392 	};
3393 
3394 	if (fd < 0)
3395 		ctx.msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
3396 	else {
3397 		ctx.fds[0] = fd;
3398 		ctx.fd_num = 1;
3399 	}
3400 
3401 	ret = send_vhost_backend_message_process_reply(dev, &ctx);
3402 	if (ret < 0)
3403 		VHOST_CONFIG_LOG(dev->ifname, ERR, "failed to set host notifier (%d)", ret);
3404 
3405 	return ret;
3406 }
3407 
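/*
 * Map (enable) or unmap (disable) the vDPA device's notify areas for the
 * requested queue(s) in the frontend via host notifier messages. If any queue
 * fails while enabling, fall back to disabling all requested queues.
 */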
3408 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable)
3409 {
3410 	struct virtio_net *dev;
3411 	struct rte_vdpa_device *vdpa_dev;
3412 	int vfio_device_fd, ret = 0;
3413 	uint64_t offset, size;
3414 	unsigned int i, q_start, q_last;
3415 
3416 	dev = get_device(vid);
3417 	if (!dev)
3418 		return -ENODEV;
3419 
3420 	vdpa_dev = dev->vdpa_dev;
3421 	if (vdpa_dev == NULL)
3422 		return -ENODEV;
3423 
3424 	if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
3425 	    !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) ||
3426 	    !(dev->protocol_features &
3427 			(1ULL << VHOST_USER_PROTOCOL_F_BACKEND_REQ)) ||
3428 	    !(dev->protocol_features &
3429 			(1ULL << VHOST_USER_PROTOCOL_F_BACKEND_SEND_FD)) ||
3430 	    !(dev->protocol_features &
3431 			(1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)))
3432 		return -ENOTSUP;
3433 
3434 	if (qid == RTE_VHOST_QUEUE_ALL) {
3435 		q_start = 0;
3436 		q_last = dev->nr_vring - 1;
3437 	} else {
3438 		if (qid >= dev->nr_vring)
3439 			return -EINVAL;
3440 		q_start = qid;
3441 		q_last = qid;
3442 	}
3443 
3444 	if (vdpa_dev->ops->get_vfio_device_fd == NULL)
3445 		return -ENOTSUP;
3446 	if (vdpa_dev->ops->get_notify_area == NULL)
3447 		return -ENOTSUP;
3448 
3449 	vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid);
3450 	if (vfio_device_fd < 0)
3451 		return -ENOTSUP;
3452 
3453 	if (enable) {
3454 		for (i = q_start; i <= q_last; i++) {
3455 			if (vdpa_dev->ops->get_notify_area(vid, i, &offset,
3456 					&size) < 0) {
3457 				ret = -ENOTSUP;
3458 				goto disable;
3459 			}
3460 
3461 			if (vhost_user_backend_set_vring_host_notifier(dev, i,
3462 					vfio_device_fd, offset, size) < 0) {
3463 				ret = -EFAULT;
3464 				goto disable;
3465 			}
3466 		}
3467 	} else {
3468 disable:
3469 		for (i = q_start; i <= q_last; i++) {
3470 			vhost_user_backend_set_vring_host_notifier(dev, i, -1,
3471 					0, 0);
3472 		}
3473 	}
3474 
3475 	return ret;
3476 }
3477 
3478 static int
3479 vhost_user_inject_irq(struct virtio_net *dev __rte_unused, struct vhost_virtqueue *vq)
3480 {
3481 	if (vq->callfd < 0)
3482 		return -1;
3483 
3484 	return eventfd_write(vq->callfd, (eventfd_t)1);
3485 }
3486 
3487 static struct vhost_backend_ops vhost_user_backend_ops = {
3488 	.iotlb_miss = vhost_user_iotlb_miss,
3489 	.inject_irq = vhost_user_inject_irq,
3490 };
3491 
3492 int
3493 vhost_user_new_device(void)
3494 {
3495 	return vhost_new_device(&vhost_user_backend_ops);
3496 }
3497