xref: /dpdk/lib/vhost/vhost_user.c (revision daa02b5cddbb8e11b31d41e2bf7bb1ae64dcae2f)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2018 Intel Corporation
3  */
4 
5 /* Security model
6  * --------------
7  * The vhost-user protocol connection is an external interface, so it must be
8  * robust against invalid inputs.
9  *
10  * This is important because the vhost-user master is only one step removed
11  * from the guest.  Malicious guests that have escaped may then launch further
12  * attacks from the vhost-user master.
13  *
14  * Even in deployments where guests are trusted, a bug in the vhost-user master
15  * can still cause invalid messages to be sent.  Such messages must not
16  * compromise the stability of the DPDK application by causing crashes, memory
17  * corruption, or other problematic behavior.
18  *
19  * Do not assume received VhostUserMsg fields contain sensible values!
20  */
21 
22 #include <stdint.h>
23 #include <stdio.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <unistd.h>
27 #include <fcntl.h>
28 #include <sys/ioctl.h>
29 #include <sys/mman.h>
30 #include <sys/types.h>
31 #include <sys/stat.h>
32 #include <sys/syscall.h>
33 #include <assert.h>
34 #ifdef RTE_LIBRTE_VHOST_NUMA
35 #include <numaif.h>
36 #endif
37 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
38 #include <linux/userfaultfd.h>
39 #endif
40 #ifdef F_ADD_SEALS /* if file sealing is supported, so is memfd */
41 #include <linux/memfd.h>
42 #define MEMFD_SUPPORTED
43 #endif
44 
45 #include <rte_common.h>
46 #include <rte_malloc.h>
47 #include <rte_log.h>
48 #include <rte_vfio.h>
49 #include <rte_errno.h>
50 
51 #include "iotlb.h"
52 #include "vhost.h"
53 #include "vhost_user.h"
54 
55 #define VIRTIO_MIN_MTU 68
56 #define VIRTIO_MAX_MTU 65535
57 
58 #define INFLIGHT_ALIGNMENT	64
59 #define INFLIGHT_VERSION	0x1
60 
61 static const char *vhost_message_str[VHOST_USER_MAX] = {
62 	[VHOST_USER_NONE] = "VHOST_USER_NONE",
63 	[VHOST_USER_GET_FEATURES] = "VHOST_USER_GET_FEATURES",
64 	[VHOST_USER_SET_FEATURES] = "VHOST_USER_SET_FEATURES",
65 	[VHOST_USER_SET_OWNER] = "VHOST_USER_SET_OWNER",
66 	[VHOST_USER_RESET_OWNER] = "VHOST_USER_RESET_OWNER",
67 	[VHOST_USER_SET_MEM_TABLE] = "VHOST_USER_SET_MEM_TABLE",
68 	[VHOST_USER_SET_LOG_BASE] = "VHOST_USER_SET_LOG_BASE",
69 	[VHOST_USER_SET_LOG_FD] = "VHOST_USER_SET_LOG_FD",
70 	[VHOST_USER_SET_VRING_NUM] = "VHOST_USER_SET_VRING_NUM",
71 	[VHOST_USER_SET_VRING_ADDR] = "VHOST_USER_SET_VRING_ADDR",
72 	[VHOST_USER_SET_VRING_BASE] = "VHOST_USER_SET_VRING_BASE",
73 	[VHOST_USER_GET_VRING_BASE] = "VHOST_USER_GET_VRING_BASE",
74 	[VHOST_USER_SET_VRING_KICK] = "VHOST_USER_SET_VRING_KICK",
75 	[VHOST_USER_SET_VRING_CALL] = "VHOST_USER_SET_VRING_CALL",
76 	[VHOST_USER_SET_VRING_ERR]  = "VHOST_USER_SET_VRING_ERR",
77 	[VHOST_USER_GET_PROTOCOL_FEATURES]  = "VHOST_USER_GET_PROTOCOL_FEATURES",
78 	[VHOST_USER_SET_PROTOCOL_FEATURES]  = "VHOST_USER_SET_PROTOCOL_FEATURES",
79 	[VHOST_USER_GET_QUEUE_NUM]  = "VHOST_USER_GET_QUEUE_NUM",
80 	[VHOST_USER_SET_VRING_ENABLE]  = "VHOST_USER_SET_VRING_ENABLE",
81 	[VHOST_USER_SEND_RARP]  = "VHOST_USER_SEND_RARP",
82 	[VHOST_USER_NET_SET_MTU]  = "VHOST_USER_NET_SET_MTU",
83 	[VHOST_USER_SET_SLAVE_REQ_FD]  = "VHOST_USER_SET_SLAVE_REQ_FD",
84 	[VHOST_USER_IOTLB_MSG]  = "VHOST_USER_IOTLB_MSG",
85 	[VHOST_USER_CRYPTO_CREATE_SESS] = "VHOST_USER_CRYPTO_CREATE_SESS",
86 	[VHOST_USER_CRYPTO_CLOSE_SESS] = "VHOST_USER_CRYPTO_CLOSE_SESS",
87 	[VHOST_USER_POSTCOPY_ADVISE]  = "VHOST_USER_POSTCOPY_ADVISE",
88 	[VHOST_USER_POSTCOPY_LISTEN]  = "VHOST_USER_POSTCOPY_LISTEN",
89 	[VHOST_USER_POSTCOPY_END]  = "VHOST_USER_POSTCOPY_END",
90 	[VHOST_USER_GET_INFLIGHT_FD] = "VHOST_USER_GET_INFLIGHT_FD",
91 	[VHOST_USER_SET_INFLIGHT_FD] = "VHOST_USER_SET_INFLIGHT_FD",
92 	[VHOST_USER_SET_STATUS] = "VHOST_USER_SET_STATUS",
93 	[VHOST_USER_GET_STATUS] = "VHOST_USER_GET_STATUS",
94 };
95 
96 static int send_vhost_reply(int sockfd, struct VhostUserMsg *msg);
97 static int read_vhost_message(int sockfd, struct VhostUserMsg *msg);
98 
99 static void
100 close_msg_fds(struct VhostUserMsg *msg)
101 {
102 	int i;
103 
104 	for (i = 0; i < msg->fd_num; i++) {
105 		int fd = msg->fds[i];
106 
107 		if (fd == -1)
108 			continue;
109 
110 		msg->fds[i] = -1;
111 		close(fd);
112 	}
113 }
114 
115 /*
116  * Ensure the expected number of FDs is received,
117  * close all FDs and return an error if this is not the case.
118  */
119 static int
120 validate_msg_fds(struct VhostUserMsg *msg, int expected_fds)
121 {
122 	if (msg->fd_num == expected_fds)
123 		return 0;
124 
125 	VHOST_LOG_CONFIG(ERR,
126 		"Expected %d FDs for request %s, received %d\n",
127 		expected_fds,
128 		vhost_message_str[msg->request.master],
129 		msg->fd_num);
130 
131 	close_msg_fds(msg);
132 
133 	return -1;
134 }
135 
136 static uint64_t
137 get_blk_size(int fd)
138 {
139 	struct stat stat;
140 	int ret;
141 
142 	ret = fstat(fd, &stat);
143 	return ret == -1 ? (uint64_t)-1 : (uint64_t)stat.st_blksize;
144 }
145 
146 static int
147 async_dma_map(struct rte_vhost_mem_region *region, bool *dma_map_success, bool do_map)
148 {
149 	uint64_t host_iova;
150 	int ret = 0;
151 
152 	host_iova = rte_mem_virt2iova((void *)(uintptr_t)region->host_user_addr);
153 	if (do_map) {
154 		/* Add mapped region into the default container of DPDK. */
155 		ret = rte_vfio_container_dma_map(RTE_VFIO_DEFAULT_CONTAINER_FD,
156 						 region->host_user_addr,
157 						 host_iova,
158 						 region->size);
159 		*dma_map_success = ret == 0;
160 
161 		if (ret) {
162 			/*
163 			 * The DMA device may be bound to a kernel driver, in which
164 			 * case we don't need to program the IOMMU manually. However,
165 			 * if no device is bound with vfio/uio in DPDK and the vfio
166 			 * kernel module is loaded, the API will still be called and
167 			 * return ENODEV/ENOTSUP.
168 			 *
169 			 * DPDK vfio only returns ENODEV/ENOTSUP in very similar
170 			 * situations (vfio either unsupported, or supported
171 			 * but no devices found). Either way, no mappings could be
172 			 * performed. We treat it as a normal case in the async path.
173 			 */
174 			if (rte_errno == ENODEV || rte_errno == ENOTSUP)
175 				return 0;
176 
177 			VHOST_LOG_CONFIG(ERR, "DMA engine map failed\n");
178 			return ret;
179 
180 		}
181 
182 	} else {
183 		/* No need to do vfio unmap if the map failed. */
184 		if (!*dma_map_success)
185 			return 0;
186 
187 		/* Remove mapped region from the default container of DPDK. */
188 		ret = rte_vfio_container_dma_unmap(RTE_VFIO_DEFAULT_CONTAINER_FD,
189 						   region->host_user_addr,
190 						   host_iova,
191 						   region->size);
192 		if (ret) {
193 			VHOST_LOG_CONFIG(ERR, "DMA engine unmap failed\n");
194 			return ret;
195 		}
196 		/* Clear the flag once the unmap succeeds. */
197 		*dma_map_success = 0;
198 	}
199 
200 	return ret;
201 }
202 
203 static void
204 free_mem_region(struct virtio_net *dev)
205 {
206 	uint32_t i;
207 	struct rte_vhost_mem_region *reg;
208 
209 	if (!dev || !dev->mem)
210 		return;
211 
212 	for (i = 0; i < dev->mem->nregions; i++) {
213 		reg = &dev->mem->regions[i];
214 		if (reg->host_user_addr) {
215 			if (dev->async_copy && rte_vfio_is_enabled("vfio"))
216 				async_dma_map(reg, &dev->async_map_status[i], false);
217 
218 			munmap(reg->mmap_addr, reg->mmap_size);
219 			close(reg->fd);
220 		}
221 	}
222 }
223 
224 void
225 vhost_backend_cleanup(struct virtio_net *dev)
226 {
227 	if (dev->mem) {
228 		free_mem_region(dev);
229 		rte_free(dev->mem);
230 		dev->mem = NULL;
231 
232 		if (dev->async_map_status) {
233 			rte_free(dev->async_map_status);
234 			dev->async_map_status = NULL;
235 		}
236 	}
237 
238 	rte_free(dev->guest_pages);
239 	dev->guest_pages = NULL;
240 
241 	if (dev->log_addr) {
242 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
243 		dev->log_addr = 0;
244 	}
245 
246 	if (dev->inflight_info) {
247 		if (dev->inflight_info->addr) {
248 			munmap(dev->inflight_info->addr,
249 			       dev->inflight_info->size);
250 			dev->inflight_info->addr = NULL;
251 		}
252 
253 		if (dev->inflight_info->fd >= 0) {
254 			close(dev->inflight_info->fd);
255 			dev->inflight_info->fd = -1;
256 		}
257 
258 		rte_free(dev->inflight_info);
259 		dev->inflight_info = NULL;
260 	}
261 
262 	if (dev->slave_req_fd >= 0) {
263 		close(dev->slave_req_fd);
264 		dev->slave_req_fd = -1;
265 	}
266 
267 	if (dev->postcopy_ufd >= 0) {
268 		close(dev->postcopy_ufd);
269 		dev->postcopy_ufd = -1;
270 	}
271 
272 	dev->postcopy_listening = 0;
273 }
274 
275 static void
276 vhost_user_notify_queue_state(struct virtio_net *dev, uint16_t index,
277 			      int enable)
278 {
279 	struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
280 	struct vhost_virtqueue *vq = dev->virtqueue[index];
281 
282 	/* Configure guest notifications on enable */
283 	if (enable && vq->notif_enable != VIRTIO_UNINITIALIZED_NOTIF)
284 		vhost_enable_guest_notification(dev, vq, vq->notif_enable);
285 
286 	if (vdpa_dev && vdpa_dev->ops->set_vring_state)
287 		vdpa_dev->ops->set_vring_state(dev->vid, index, enable);
288 
289 	if (dev->notify_ops->vring_state_changed)
290 		dev->notify_ops->vring_state_changed(dev->vid,
291 				index, enable);
292 }
293 
294 /*
295  * This function just returns success at the moment, once it has
296  * checked that no file descriptors were attached to the message.
297  */
298 static int
299 vhost_user_set_owner(struct virtio_net **pdev __rte_unused,
300 			struct VhostUserMsg *msg,
301 			int main_fd __rte_unused)
302 {
303 	if (validate_msg_fds(msg, 0) != 0)
304 		return RTE_VHOST_MSG_RESULT_ERR;
305 
306 	return RTE_VHOST_MSG_RESULT_OK;
307 }
308 
309 static int
310 vhost_user_reset_owner(struct virtio_net **pdev,
311 			struct VhostUserMsg *msg,
312 			int main_fd __rte_unused)
313 {
314 	struct virtio_net *dev = *pdev;
315 
316 	if (validate_msg_fds(msg, 0) != 0)
317 		return RTE_VHOST_MSG_RESULT_ERR;
318 
319 	vhost_destroy_device_notify(dev);
320 
321 	cleanup_device(dev, 0);
322 	reset_device(dev);
323 	return RTE_VHOST_MSG_RESULT_OK;
324 }
325 
326 /*
327  * The features that we support are requested.
328  */
329 static int
330 vhost_user_get_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
331 			int main_fd __rte_unused)
332 {
333 	struct virtio_net *dev = *pdev;
334 	uint64_t features = 0;
335 
336 	if (validate_msg_fds(msg, 0) != 0)
337 		return RTE_VHOST_MSG_RESULT_ERR;
338 
339 	rte_vhost_driver_get_features(dev->ifname, &features);
340 
341 	msg->payload.u64 = features;
342 	msg->size = sizeof(msg->payload.u64);
343 	msg->fd_num = 0;
344 
345 	return RTE_VHOST_MSG_RESULT_REPLY;
346 }
347 
348 /*
349  * The number of queues that we support is requested.
350  */
351 static int
352 vhost_user_get_queue_num(struct virtio_net **pdev, struct VhostUserMsg *msg,
353 			int main_fd __rte_unused)
354 {
355 	struct virtio_net *dev = *pdev;
356 	uint32_t queue_num = 0;
357 
358 	if (validate_msg_fds(msg, 0) != 0)
359 		return RTE_VHOST_MSG_RESULT_ERR;
360 
361 	rte_vhost_driver_get_queue_num(dev->ifname, &queue_num);
362 
363 	msg->payload.u64 = (uint64_t)queue_num;
364 	msg->size = sizeof(msg->payload.u64);
365 	msg->fd_num = 0;
366 
367 	return RTE_VHOST_MSG_RESULT_REPLY;
368 }
369 
370 /*
371  * We receive the negotiated features supported by us and the virtio device.
372  */
373 static int
374 vhost_user_set_features(struct virtio_net **pdev, struct VhostUserMsg *msg,
375 			int main_fd __rte_unused)
376 {
377 	struct virtio_net *dev = *pdev;
378 	uint64_t features = msg->payload.u64;
379 	uint64_t vhost_features = 0;
380 	struct rte_vdpa_device *vdpa_dev;
381 
382 	if (validate_msg_fds(msg, 0) != 0)
383 		return RTE_VHOST_MSG_RESULT_ERR;
384 
385 	rte_vhost_driver_get_features(dev->ifname, &vhost_features);
386 	if (features & ~vhost_features) {
387 		VHOST_LOG_CONFIG(ERR,
388 			"(%d) received invalid negotiated features.\n",
389 			dev->vid);
390 		dev->flags |= VIRTIO_DEV_FEATURES_FAILED;
391 		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;
392 
393 		return RTE_VHOST_MSG_RESULT_ERR;
394 	}
395 
396 	if (dev->flags & VIRTIO_DEV_RUNNING) {
397 		if (dev->features == features)
398 			return RTE_VHOST_MSG_RESULT_OK;
399 
400 		/*
401 		 * Error out if master tries to change features while device is
402 		 * in running state. The exception being VHOST_F_LOG_ALL, which
403 		 * is enabled when the live-migration starts.
404 		 */
405 		if ((dev->features ^ features) & ~(1ULL << VHOST_F_LOG_ALL)) {
406 			VHOST_LOG_CONFIG(ERR,
407 				"(%d) features changed while device is running.\n",
408 				dev->vid);
409 			return RTE_VHOST_MSG_RESULT_ERR;
410 		}
411 
412 		if (dev->notify_ops->features_changed)
413 			dev->notify_ops->features_changed(dev->vid, features);
414 	}
415 
416 	dev->features = features;
417 	if (dev->features &
418 		((1ULL << VIRTIO_NET_F_MRG_RXBUF) |
419 		 (1ULL << VIRTIO_F_VERSION_1) |
420 		 (1ULL << VIRTIO_F_RING_PACKED))) {
421 		dev->vhost_hlen = sizeof(struct virtio_net_hdr_mrg_rxbuf);
422 	} else {
423 		dev->vhost_hlen = sizeof(struct virtio_net_hdr);
424 	}
425 	VHOST_LOG_CONFIG(INFO,
426 		"negotiated Virtio features: 0x%" PRIx64 "\n", dev->features);
427 	VHOST_LOG_CONFIG(DEBUG,
428 		"(%d) mergeable RX buffers %s, virtio 1 %s\n",
429 		dev->vid,
430 		(dev->features & (1 << VIRTIO_NET_F_MRG_RXBUF)) ? "on" : "off",
431 		(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ? "on" : "off");
432 
433 	if ((dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) &&
434 	    !(dev->features & (1ULL << VIRTIO_NET_F_MQ))) {
435 		/*
436 		 * Remove all but first queue pair if MQ hasn't been
437 		 * negotiated. This is safe because the device is not
438 		 * running at this stage.
439 		 */
440 		while (dev->nr_vring > 2) {
441 			struct vhost_virtqueue *vq;
442 
443 			vq = dev->virtqueue[--dev->nr_vring];
444 			if (!vq)
445 				continue;
446 
447 			dev->virtqueue[dev->nr_vring] = NULL;
448 			cleanup_vq(vq, 1);
449 			cleanup_vq_inflight(dev, vq);
450 			free_vq(dev, vq);
451 		}
452 	}
453 
454 	vdpa_dev = dev->vdpa_dev;
455 	if (vdpa_dev)
456 		vdpa_dev->ops->set_features(dev->vid);
457 
458 	dev->flags &= ~VIRTIO_DEV_FEATURES_FAILED;
459 	return RTE_VHOST_MSG_RESULT_OK;
460 }
461 
462 /*
463  * The virtio device sends us the size of the descriptor ring.
464  */
465 static int
466 vhost_user_set_vring_num(struct virtio_net **pdev,
467 			struct VhostUserMsg *msg,
468 			int main_fd __rte_unused)
469 {
470 	struct virtio_net *dev = *pdev;
471 	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
472 
473 	if (validate_msg_fds(msg, 0) != 0)
474 		return RTE_VHOST_MSG_RESULT_ERR;
475 
476 	if (msg->payload.state.num > 32768) {
477 		VHOST_LOG_CONFIG(ERR, "invalid virtqueue size %u\n", msg->payload.state.num);
478 		return RTE_VHOST_MSG_RESULT_ERR;
479 	}
480 
481 	vq->size = msg->payload.state.num;
482 
483 	/* VIRTIO 1.0, 2.4 Virtqueues says:
484 	 *
485 	 *   Queue Size value is always a power of 2. The maximum Queue Size
486 	 *   value is 32768.
487 	 *
488 	 * VIRTIO 1.1 2.7 Virtqueues says:
489 	 *
490 	 *   Packed virtqueues support up to 2^15 entries each.
491 	 */
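	/* Illustrative example (values not from the spec): the power-of-2 check
	 * below relies on "x & (x - 1) == 0" holding only for powers of two,
	 * e.g. 256 & 255 == 0 (accepted) while 320 & 319 == 256 != 0 (rejected).
	 */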
492 	if (!vq_is_packed(dev)) {
493 		if (vq->size & (vq->size - 1)) {
494 			VHOST_LOG_CONFIG(ERR,
495 				"invalid virtqueue size %u\n", vq->size);
496 			return RTE_VHOST_MSG_RESULT_ERR;
497 		}
498 	}
499 
500 	if (vq_is_packed(dev)) {
501 		if (vq->shadow_used_packed)
502 			rte_free(vq->shadow_used_packed);
503 		vq->shadow_used_packed = rte_malloc_socket(NULL,
504 				vq->size *
505 				sizeof(struct vring_used_elem_packed),
506 				RTE_CACHE_LINE_SIZE, vq->numa_node);
507 		if (!vq->shadow_used_packed) {
508 			VHOST_LOG_CONFIG(ERR,
509 					"failed to allocate memory for shadow used ring.\n");
510 			return RTE_VHOST_MSG_RESULT_ERR;
511 		}
512 
513 	} else {
514 		if (vq->shadow_used_split)
515 			rte_free(vq->shadow_used_split);
516 
517 		vq->shadow_used_split = rte_malloc_socket(NULL,
518 				vq->size * sizeof(struct vring_used_elem),
519 				RTE_CACHE_LINE_SIZE, vq->numa_node);
520 
521 		if (!vq->shadow_used_split) {
522 			VHOST_LOG_CONFIG(ERR,
523 					"failed to allocate memory for vq internal data.\n");
524 			return RTE_VHOST_MSG_RESULT_ERR;
525 		}
526 	}
527 
528 	if (vq->batch_copy_elems)
529 		rte_free(vq->batch_copy_elems);
530 	vq->batch_copy_elems = rte_malloc_socket(NULL,
531 				vq->size * sizeof(struct batch_copy_elem),
532 				RTE_CACHE_LINE_SIZE, vq->numa_node);
533 	if (!vq->batch_copy_elems) {
534 		VHOST_LOG_CONFIG(ERR,
535 			"failed to allocate memory for batching copy.\n");
536 		return RTE_VHOST_MSG_RESULT_ERR;
537 	}
538 
539 	return RTE_VHOST_MSG_RESULT_OK;
540 }
541 
542 /*
543  * Reallocate virtio_dev, vhost_virtqueue and related data structures to
544  * make them on the same numa node as the memory of vring descriptor.
545  */
546 #ifdef RTE_LIBRTE_VHOST_NUMA
547 static struct virtio_net*
548 numa_realloc(struct virtio_net *dev, int index)
549 {
550 	int node, dev_node;
551 	struct virtio_net *old_dev;
552 	struct vhost_virtqueue *vq;
553 	struct batch_copy_elem *bce;
554 	struct guest_page *gp;
555 	struct rte_vhost_memory *mem;
556 	size_t mem_size;
557 	int ret;
558 
559 	old_dev = dev;
560 	vq = dev->virtqueue[index];
561 
562 	/*
563 	 * If the VQ is ready, it is too late to reallocate: it certainly already
564 	 * happened anyway on VHOST_USER_SET_VRING_ADDR.
565 	 */
566 	if (vq->ready)
567 		return dev;
568 
569 	ret = get_mempolicy(&node, NULL, 0, vq->desc, MPOL_F_NODE | MPOL_F_ADDR);
570 	if (ret) {
571 		VHOST_LOG_CONFIG(ERR, "Unable to get virtqueue %d numa information.\n", index);
572 		return dev;
573 	}
574 
575 	if (node == vq->numa_node)
576 		goto out_dev_realloc;
577 
578 	vq = rte_realloc_socket(vq, sizeof(*vq), 0, node);
579 	if (!vq) {
580 		VHOST_LOG_CONFIG(ERR, "Failed to realloc virtqueue %d on node %d\n",
581 				index, node);
582 		return dev;
583 	}
584 
585 	if (vq != dev->virtqueue[index]) {
586 		VHOST_LOG_CONFIG(INFO, "reallocated virtqueue on node %d\n", node);
587 		dev->virtqueue[index] = vq;
588 		vhost_user_iotlb_init(dev, index);
589 	}
590 
591 	if (vq_is_packed(dev)) {
592 		struct vring_used_elem_packed *sup;
593 
594 		sup = rte_realloc_socket(vq->shadow_used_packed, vq->size * sizeof(*sup),
595 				RTE_CACHE_LINE_SIZE, node);
596 		if (!sup) {
597 			VHOST_LOG_CONFIG(ERR, "Failed to realloc shadow packed on node %d\n", node);
598 			return dev;
599 		}
600 		vq->shadow_used_packed = sup;
601 	} else {
602 		struct vring_used_elem *sus;
603 
604 		sus = rte_realloc_socket(vq->shadow_used_split, vq->size * sizeof(*sus),
605 				RTE_CACHE_LINE_SIZE, node);
606 		if (!sus) {
607 			VHOST_LOG_CONFIG(ERR, "Failed to realloc shadow split on node %d\n", node);
608 			return dev;
609 		}
610 		vq->shadow_used_split = sus;
611 	}
612 
613 	bce = rte_realloc_socket(vq->batch_copy_elems, vq->size * sizeof(*bce),
614 			RTE_CACHE_LINE_SIZE, node);
615 	if (!bce) {
616 		VHOST_LOG_CONFIG(ERR, "Failed to realloc batch copy elem on node %d\n", node);
617 		return dev;
618 	}
619 	vq->batch_copy_elems = bce;
620 
621 	if (vq->log_cache) {
622 		struct log_cache_entry *lc;
623 
624 		lc = rte_realloc_socket(vq->log_cache, sizeof(*lc) * VHOST_LOG_CACHE_NR, 0, node);
625 		if (!lc) {
626 			VHOST_LOG_CONFIG(ERR, "Failed to realloc log cache on node %d\n", node);
627 			return dev;
628 		}
629 		vq->log_cache = lc;
630 	}
631 
632 	if (vq->resubmit_inflight) {
633 		struct rte_vhost_resubmit_info *ri;
634 
635 		ri = rte_realloc_socket(vq->resubmit_inflight, sizeof(*ri), 0, node);
636 		if (!ri) {
637 			VHOST_LOG_CONFIG(ERR, "Failed to realloc resubmit inflight on node %d\n",
638 					node);
639 			return dev;
640 		}
641 		vq->resubmit_inflight = ri;
642 
643 		if (ri->resubmit_list) {
644 			struct rte_vhost_resubmit_desc *rd;
645 
646 			rd = rte_realloc_socket(ri->resubmit_list, sizeof(*rd) * ri->resubmit_num,
647 					0, node);
648 			if (!rd) {
649 				VHOST_LOG_CONFIG(ERR, "Failed to realloc resubmit list on node %d\n",
650 						node);
651 				return dev;
652 			}
653 			ri->resubmit_list = rd;
654 		}
655 	}
656 
657 	vq->numa_node = node;
658 
659 out_dev_realloc:
660 
661 	if (dev->flags & VIRTIO_DEV_RUNNING)
662 		return dev;
663 
664 	ret = get_mempolicy(&dev_node, NULL, 0, dev, MPOL_F_NODE | MPOL_F_ADDR);
665 	if (ret) {
666 		VHOST_LOG_CONFIG(ERR, "Unable to get Virtio dev %d numa information.\n", dev->vid);
667 		return dev;
668 	}
669 
670 	if (dev_node == node)
671 		return dev;
672 
673 	dev = rte_realloc_socket(old_dev, sizeof(*dev), 0, node);
674 	if (!dev) {
675 		VHOST_LOG_CONFIG(ERR, "Failed to realloc dev on node %d\n", node);
676 		return old_dev;
677 	}
678 
679 	VHOST_LOG_CONFIG(INFO, "reallocated device on node %d\n", node);
680 	vhost_devices[dev->vid] = dev;
681 
682 	mem_size = sizeof(struct rte_vhost_memory) +
683 		sizeof(struct rte_vhost_mem_region) * dev->mem->nregions;
684 	mem = rte_realloc_socket(dev->mem, mem_size, 0, node);
685 	if (!mem) {
686 		VHOST_LOG_CONFIG(ERR, "Failed to realloc mem table on node %d\n", node);
687 		return dev;
688 	}
689 	dev->mem = mem;
690 
691 	if (dev->async_copy && rte_vfio_is_enabled("vfio")) {
692 		if (dev->async_map_status == NULL) {
693 			dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
694 					sizeof(bool) * dev->mem->nregions, 0, node);
695 			if (!dev->async_map_status) {
696 				VHOST_LOG_CONFIG(ERR,
697 					"(%d) failed to realloc dma mapping status on node %d\n",
698 					dev->vid, node);
699 				return dev;
700 			}
701 		}
702 	}
703 
704 	gp = rte_realloc_socket(dev->guest_pages, dev->max_guest_pages * sizeof(*gp),
705 			RTE_CACHE_LINE_SIZE, node);
706 	if (!gp) {
707 		VHOST_LOG_CONFIG(ERR, "Failed to realloc guest pages on node %d\n", node);
708 		return dev;
709 	}
710 	dev->guest_pages = gp;
711 
712 	return dev;
713 }
714 #else
715 static struct virtio_net*
716 numa_realloc(struct virtio_net *dev, int index __rte_unused)
717 {
718 	return dev;
719 }
720 #endif
721 
722 /* Converts QEMU virtual address to Vhost virtual address. */
723 static uint64_t
724 qva_to_vva(struct virtio_net *dev, uint64_t qva, uint64_t *len)
725 {
726 	struct rte_vhost_mem_region *r;
727 	uint32_t i;
728 
729 	if (unlikely(!dev || !dev->mem))
730 		goto out_error;
731 
732 	/* Find the region where the address lives. */
733 	for (i = 0; i < dev->mem->nregions; i++) {
734 		r = &dev->mem->regions[i];
735 
736 		if (qva >= r->guest_user_addr &&
737 		    qva <  r->guest_user_addr + r->size) {
738 
739 			if (unlikely(*len > r->guest_user_addr + r->size - qva))
740 				*len = r->guest_user_addr + r->size - qva;
741 
742 			return qva - r->guest_user_addr +
743 			       r->host_user_addr;
744 		}
745 	}
746 out_error:
747 	*len = 0;
748 
749 	return 0;
750 }
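
/* Worked example (illustrative values, not taken from a real guest): with a
 * region where guest_user_addr = 0x7f0000000000, host_user_addr =
 * 0x7fab00000000 and size = 0x40000000, a QVA of 0x7f0000001000 falls inside
 * the region and translates to 0x7f0000001000 - 0x7f0000000000 +
 * 0x7fab00000000 = 0x7fab00001000; *len is additionally clamped so the
 * returned range does not cross the region end.
 */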
751 
752 
753 /*
754  * Converts ring address to Vhost virtual address.
755  * If IOMMU is enabled, the ring address is a guest IO virtual address,
756  * else it is a QEMU virtual address.
757  */
758 static uint64_t
759 ring_addr_to_vva(struct virtio_net *dev, struct vhost_virtqueue *vq,
760 		uint64_t ra, uint64_t *size)
761 {
762 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM)) {
763 		uint64_t vva;
764 
765 		vhost_user_iotlb_rd_lock(vq);
766 		vva = vhost_iova_to_vva(dev, vq, ra,
767 					size, VHOST_ACCESS_RW);
768 		vhost_user_iotlb_rd_unlock(vq);
769 
770 		return vva;
771 	}
772 
773 	return qva_to_vva(dev, ra, size);
774 }
775 
776 static uint64_t
777 log_addr_to_gpa(struct virtio_net *dev, struct vhost_virtqueue *vq)
778 {
779 	uint64_t log_gpa;
780 
781 	vhost_user_iotlb_rd_lock(vq);
782 	log_gpa = translate_log_addr(dev, vq, vq->ring_addrs.log_guest_addr);
783 	vhost_user_iotlb_rd_unlock(vq);
784 
785 	return log_gpa;
786 }
787 
788 static struct virtio_net *
789 translate_ring_addresses(struct virtio_net *dev, int vq_index)
790 {
791 	struct vhost_virtqueue *vq = dev->virtqueue[vq_index];
792 	struct vhost_vring_addr *addr = &vq->ring_addrs;
793 	uint64_t len, expected_len;
794 
795 	if (addr->flags & (1 << VHOST_VRING_F_LOG)) {
796 		vq->log_guest_addr =
797 			log_addr_to_gpa(dev, vq);
798 		if (vq->log_guest_addr == 0) {
799 			VHOST_LOG_CONFIG(DEBUG,
800 				"(%d) failed to map log_guest_addr.\n",
801 				dev->vid);
802 			return dev;
803 		}
804 	}
805 
806 	if (vq_is_packed(dev)) {
807 		len = sizeof(struct vring_packed_desc) * vq->size;
808 		vq->desc_packed = (struct vring_packed_desc *)(uintptr_t)
809 			ring_addr_to_vva(dev, vq, addr->desc_user_addr, &len);
810 		if (vq->desc_packed == NULL ||
811 				len != sizeof(struct vring_packed_desc) *
812 				vq->size) {
813 			VHOST_LOG_CONFIG(DEBUG,
814 				"(%d) failed to map desc_packed ring.\n",
815 				dev->vid);
816 			return dev;
817 		}
818 
819 		dev = numa_realloc(dev, vq_index);
820 		vq = dev->virtqueue[vq_index];
821 		addr = &vq->ring_addrs;
822 
823 		len = sizeof(struct vring_packed_desc_event);
824 		vq->driver_event = (struct vring_packed_desc_event *)
825 					(uintptr_t)ring_addr_to_vva(dev,
826 					vq, addr->avail_user_addr, &len);
827 		if (vq->driver_event == NULL ||
828 				len != sizeof(struct vring_packed_desc_event)) {
829 			VHOST_LOG_CONFIG(DEBUG,
830 				"(%d) failed to find driver area address.\n",
831 				dev->vid);
832 			return dev;
833 		}
834 
835 		len = sizeof(struct vring_packed_desc_event);
836 		vq->device_event = (struct vring_packed_desc_event *)
837 					(uintptr_t)ring_addr_to_vva(dev,
838 					vq, addr->used_user_addr, &len);
839 		if (vq->device_event == NULL ||
840 				len != sizeof(struct vring_packed_desc_event)) {
841 			VHOST_LOG_CONFIG(DEBUG,
842 				"(%d) failed to find device area address.\n",
843 				dev->vid);
844 			return dev;
845 		}
846 
847 		vq->access_ok = true;
848 		return dev;
849 	}
850 
851 	/* The addresses are converted from QEMU virtual to Vhost virtual. */
852 	if (vq->desc && vq->avail && vq->used)
853 		return dev;
854 
855 	len = sizeof(struct vring_desc) * vq->size;
856 	vq->desc = (struct vring_desc *)(uintptr_t)ring_addr_to_vva(dev,
857 			vq, addr->desc_user_addr, &len);
858 	if (vq->desc == 0 || len != sizeof(struct vring_desc) * vq->size) {
859 		VHOST_LOG_CONFIG(DEBUG,
860 			"(%d) failed to map desc ring.\n",
861 			dev->vid);
862 		return dev;
863 	}
864 
865 	dev = numa_realloc(dev, vq_index);
866 	vq = dev->virtqueue[vq_index];
867 	addr = &vq->ring_addrs;
868 
869 	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
870 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
871 		len += sizeof(uint16_t);
872 	expected_len = len;
873 	vq->avail = (struct vring_avail *)(uintptr_t)ring_addr_to_vva(dev,
874 			vq, addr->avail_user_addr, &len);
875 	if (vq->avail == 0 || len != expected_len) {
876 		VHOST_LOG_CONFIG(DEBUG,
877 			"(%d) failed to map avail ring.\n",
878 			dev->vid);
879 		return dev;
880 	}
881 
882 	len = sizeof(struct vring_used) +
883 		sizeof(struct vring_used_elem) * vq->size;
884 	if (dev->features & (1ULL << VIRTIO_RING_F_EVENT_IDX))
885 		len += sizeof(uint16_t);
886 	expected_len = len;
887 	vq->used = (struct vring_used *)(uintptr_t)ring_addr_to_vva(dev,
888 			vq, addr->used_user_addr, &len);
889 	if (vq->used == 0 || len != expected_len) {
890 		VHOST_LOG_CONFIG(DEBUG,
891 			"(%d) failed to map used ring.\n",
892 			dev->vid);
893 		return dev;
894 	}
895 
896 	if (vq->last_used_idx != vq->used->idx) {
897 		VHOST_LOG_CONFIG(WARNING,
898 			"last_used_idx (%u) and vq->used->idx (%u) mismatch; "
899 			"some packets may be resent for Tx and dropped for Rx\n",
900 			vq->last_used_idx, vq->used->idx);
901 		vq->last_used_idx  = vq->used->idx;
902 		vq->last_avail_idx = vq->used->idx;
903 	}
904 
905 	vq->access_ok = true;
906 
907 	VHOST_LOG_CONFIG(DEBUG, "(%d) mapped address desc: %p\n",
908 			dev->vid, vq->desc);
909 	VHOST_LOG_CONFIG(DEBUG, "(%d) mapped address avail: %p\n",
910 			dev->vid, vq->avail);
911 	VHOST_LOG_CONFIG(DEBUG, "(%d) mapped address used: %p\n",
912 			dev->vid, vq->used);
913 	VHOST_LOG_CONFIG(DEBUG, "(%d) log_guest_addr: %" PRIx64 "\n",
914 			dev->vid, vq->log_guest_addr);
915 
916 	return dev;
917 }
918 
919 /*
920  * The virtio device sends us the desc, used and avail ring addresses.
921  * This function then converts these to our address space.
922  */
923 static int
924 vhost_user_set_vring_addr(struct virtio_net **pdev, struct VhostUserMsg *msg,
925 			int main_fd __rte_unused)
926 {
927 	struct virtio_net *dev = *pdev;
928 	struct vhost_virtqueue *vq;
929 	struct vhost_vring_addr *addr = &msg->payload.addr;
930 	bool access_ok;
931 
932 	if (validate_msg_fds(msg, 0) != 0)
933 		return RTE_VHOST_MSG_RESULT_ERR;
934 
935 	if (dev->mem == NULL)
936 		return RTE_VHOST_MSG_RESULT_ERR;
937 
938 	/* addr->index refers to the queue index. The txq is 1, the rxq is 0. */
939 	vq = dev->virtqueue[msg->payload.addr.index];
940 
941 	access_ok = vq->access_ok;
942 
943 	/*
944 	 * Ring addresses should not be interpreted as long as the ring is not
945 	 * started and enabled
946 	 */
947 	memcpy(&vq->ring_addrs, addr, sizeof(*addr));
948 
949 	vring_invalidate(dev, vq);
950 
951 	if ((vq->enabled && (dev->features &
952 				(1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) ||
953 			access_ok) {
954 		dev = translate_ring_addresses(dev, msg->payload.addr.index);
955 		if (!dev)
956 			return RTE_VHOST_MSG_RESULT_ERR;
957 
958 		*pdev = dev;
959 	}
960 
961 	return RTE_VHOST_MSG_RESULT_OK;
962 }
963 
964 /*
965  * The virtio device sends us the available ring last used index.
966  */
967 static int
968 vhost_user_set_vring_base(struct virtio_net **pdev,
969 			struct VhostUserMsg *msg,
970 			int main_fd __rte_unused)
971 {
972 	struct virtio_net *dev = *pdev;
973 	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
974 	uint64_t val = msg->payload.state.num;
975 
976 	if (validate_msg_fds(msg, 0) != 0)
977 		return RTE_VHOST_MSG_RESULT_ERR;
978 
979 	if (vq_is_packed(dev)) {
980 		/*
981 		 * Bit[0:14]: avail index
982 		 * Bit[15]: avail wrap counter
983 		 */
984 		vq->last_avail_idx = val & 0x7fff;
985 		vq->avail_wrap_counter = !!(val & (0x1 << 15));
986 		/*
987 		 * Set the used index to the same value as the available one,
988 		 * as their values should be equal since ring processing was
989 		 * stopped at GET_VRING_BASE time.
990 		 */
991 		vq->last_used_idx = vq->last_avail_idx;
992 		vq->used_wrap_counter = vq->avail_wrap_counter;
993 	} else {
994 		vq->last_used_idx = msg->payload.state.num;
995 		vq->last_avail_idx = msg->payload.state.num;
996 	}
997 
998 	return RTE_VHOST_MSG_RESULT_OK;
999 }
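
/* Illustrative decoding of the packed-ring base value above (made-up number):
 * val = 0x8005 gives last_avail_idx = 0x8005 & 0x7fff = 5 and
 * avail_wrap_counter = !!(0x8005 & 0x8000) = 1; the used index and wrap
 * counter are then seeded with the same values.
 */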
1000 
1001 static int
1002 add_one_guest_page(struct virtio_net *dev, uint64_t guest_phys_addr,
1003 		   uint64_t host_phys_addr, uint64_t size)
1004 {
1005 	struct guest_page *page, *last_page;
1006 	struct guest_page *old_pages;
1007 
1008 	if (dev->nr_guest_pages == dev->max_guest_pages) {
1009 		dev->max_guest_pages *= 2;
1010 		old_pages = dev->guest_pages;
1011 		dev->guest_pages = rte_realloc(dev->guest_pages,
1012 					dev->max_guest_pages * sizeof(*page),
1013 					RTE_CACHE_LINE_SIZE);
1014 		if (dev->guest_pages == NULL) {
1015 			VHOST_LOG_CONFIG(ERR, "cannot realloc guest_pages\n");
1016 			rte_free(old_pages);
1017 			return -1;
1018 		}
1019 	}
1020 
1021 	if (dev->nr_guest_pages > 0) {
1022 		last_page = &dev->guest_pages[dev->nr_guest_pages - 1];
1023 		/* merge if the two pages are physically contiguous */
1024 		if (host_phys_addr == last_page->host_phys_addr +
1025 				      last_page->size) {
1026 			last_page->size += size;
1027 			return 0;
1028 		}
1029 	}
1030 
1031 	page = &dev->guest_pages[dev->nr_guest_pages++];
1032 	page->guest_phys_addr = guest_phys_addr;
1033 	page->host_phys_addr  = host_phys_addr;
1034 	page->size = size;
1035 
1036 	return 0;
1037 }
1038 
1039 static int
1040 add_guest_pages(struct virtio_net *dev, struct rte_vhost_mem_region *reg,
1041 		uint64_t page_size)
1042 {
1043 	uint64_t reg_size = reg->size;
1044 	uint64_t host_user_addr  = reg->host_user_addr;
1045 	uint64_t guest_phys_addr = reg->guest_phys_addr;
1046 	uint64_t host_phys_addr;
1047 	uint64_t size;
1048 
1049 	host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)host_user_addr);
1050 	size = page_size - (guest_phys_addr & (page_size - 1));
1051 	size = RTE_MIN(size, reg_size);
1052 
1053 	if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr, size) < 0)
1054 		return -1;
1055 
1056 	host_user_addr  += size;
1057 	guest_phys_addr += size;
1058 	reg_size -= size;
1059 
1060 	while (reg_size > 0) {
1061 		size = RTE_MIN(reg_size, page_size);
1062 		host_phys_addr = rte_mem_virt2iova((void *)(uintptr_t)
1063 						  host_user_addr);
1064 		if (add_one_guest_page(dev, guest_phys_addr, host_phys_addr,
1065 				size) < 0)
1066 			return -1;
1067 
1068 		host_user_addr  += size;
1069 		guest_phys_addr += size;
1070 		reg_size -= size;
1071 	}
1072 
1073 	/* sort guest page array if over binary search threshold */
1074 	if (dev->nr_guest_pages >= VHOST_BINARY_SEARCH_THRESH) {
1075 		qsort((void *)dev->guest_pages, dev->nr_guest_pages,
1076 			sizeof(struct guest_page), guest_page_addrcmp);
1077 	}
1078 
1079 	return 0;
1080 }
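
/* Worked example for the page walk above (illustrative numbers only): with
 * page_size = 0x200000 (2 MB), guest_phys_addr = 0x140000 and a region size
 * of 0x400000, the first chunk covers 0x200000 - 0x140000 = 0xc0000 bytes up
 * to the next page boundary, followed by a full 0x200000 page and a final
 * 0x140000 tail; add_one_guest_page() may further merge chunks whose host
 * physical addresses happen to be contiguous.
 */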
1081 
1082 #ifdef RTE_LIBRTE_VHOST_DEBUG
1083 /* TODO: enable it only in debug mode? */
1084 static void
1085 dump_guest_pages(struct virtio_net *dev)
1086 {
1087 	uint32_t i;
1088 	struct guest_page *page;
1089 
1090 	for (i = 0; i < dev->nr_guest_pages; i++) {
1091 		page = &dev->guest_pages[i];
1092 
1093 		VHOST_LOG_CONFIG(INFO,
1094 			"guest physical page region %u\n"
1095 			"\t guest_phys_addr: %" PRIx64 "\n"
1096 			"\t host_phys_addr : %" PRIx64 "\n"
1097 			"\t size           : %" PRIx64 "\n",
1098 			i,
1099 			page->guest_phys_addr,
1100 			page->host_phys_addr,
1101 			page->size);
1102 	}
1103 }
1104 #else
1105 #define dump_guest_pages(dev)
1106 #endif
1107 
1108 static bool
1109 vhost_memory_changed(struct VhostUserMemory *new,
1110 		     struct rte_vhost_memory *old)
1111 {
1112 	uint32_t i;
1113 
1114 	if (new->nregions != old->nregions)
1115 		return true;
1116 
1117 	for (i = 0; i < new->nregions; ++i) {
1118 		VhostUserMemoryRegion *new_r = &new->regions[i];
1119 		struct rte_vhost_mem_region *old_r = &old->regions[i];
1120 
1121 		if (new_r->guest_phys_addr != old_r->guest_phys_addr)
1122 			return true;
1123 		if (new_r->memory_size != old_r->size)
1124 			return true;
1125 		if (new_r->userspace_addr != old_r->guest_user_addr)
1126 			return true;
1127 	}
1128 
1129 	return false;
1130 }
1131 
1132 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
1133 static int
1134 vhost_user_postcopy_region_register(struct virtio_net *dev,
1135 		struct rte_vhost_mem_region *reg)
1136 {
1137 	struct uffdio_register reg_struct;
1138 
1139 	/*
1140 	 * Let's register the whole mmap'd area to ensure
1141 	 * alignment on a page boundary.
1142 	 */
1143 	reg_struct.range.start = (uint64_t)(uintptr_t)reg->mmap_addr;
1144 	reg_struct.range.len = reg->mmap_size;
1145 	reg_struct.mode = UFFDIO_REGISTER_MODE_MISSING;
1146 
1147 	if (ioctl(dev->postcopy_ufd, UFFDIO_REGISTER,
1148 				&reg_struct)) {
1149 		VHOST_LOG_CONFIG(ERR, "Failed to register ufd for region "
1150 				"%" PRIx64 " - %" PRIx64 " (ufd = %d) %s\n",
1151 				(uint64_t)reg_struct.range.start,
1152 				(uint64_t)reg_struct.range.start +
1153 				(uint64_t)reg_struct.range.len - 1,
1154 				dev->postcopy_ufd,
1155 				strerror(errno));
1156 		return -1;
1157 	}
1158 
1159 	VHOST_LOG_CONFIG(INFO, "\t userfaultfd registered for range : %" PRIx64 " - %" PRIx64 "\n",
1160 			(uint64_t)reg_struct.range.start,
1161 			(uint64_t)reg_struct.range.start +
1162 			(uint64_t)reg_struct.range.len - 1);
1163 
1164 	return 0;
1165 }
1166 #else
1167 static int
1168 vhost_user_postcopy_region_register(struct virtio_net *dev __rte_unused,
1169 		struct rte_vhost_mem_region *reg __rte_unused)
1170 {
1171 	return -1;
1172 }
1173 #endif
1174 
1175 static int
1176 vhost_user_postcopy_register(struct virtio_net *dev, int main_fd,
1177 		struct VhostUserMsg *msg)
1178 {
1179 	struct VhostUserMemory *memory;
1180 	struct rte_vhost_mem_region *reg;
1181 	VhostUserMsg ack_msg;
1182 	uint32_t i;
1183 
1184 	if (!dev->postcopy_listening)
1185 		return 0;
1186 
1187 	/*
1188 	 * We don't have a better way right now than sharing
1189 	 * DPDK's virtual addresses with QEMU, so that QEMU can
1190 	 * retrieve the region offset when handling userfaults.
1191 	 */
1192 	memory = &msg->payload.memory;
1193 	for (i = 0; i < memory->nregions; i++) {
1194 		reg = &dev->mem->regions[i];
1195 		memory->regions[i].userspace_addr = reg->host_user_addr;
1196 	}
1197 
1198 	/* Send the addresses back to qemu */
1199 	msg->fd_num = 0;
1200 	send_vhost_reply(main_fd, msg);
1201 
1202 	/* Wait for QEMU to acknowledge it has got the addresses;
1203 	 * we have to wait before we're allowed to generate faults.
1204 	 */
1205 	if (read_vhost_message(main_fd, &ack_msg) <= 0) {
1206 		VHOST_LOG_CONFIG(ERR,
1207 				"Failed to read qemu ack on postcopy set-mem-table\n");
1208 		return -1;
1209 	}
1210 
1211 	if (validate_msg_fds(&ack_msg, 0) != 0)
1212 		return -1;
1213 
1214 	if (ack_msg.request.master != VHOST_USER_SET_MEM_TABLE) {
1215 		VHOST_LOG_CONFIG(ERR,
1216 				"Bad qemu ack on postcopy set-mem-table (%d)\n",
1217 				ack_msg.request.master);
1218 		return -1;
1219 	}
1220 
1221 	/* Now register with userfaultfd so we can use the memory */
1222 	for (i = 0; i < memory->nregions; i++) {
1223 		reg = &dev->mem->regions[i];
1224 		if (vhost_user_postcopy_region_register(dev, reg) < 0)
1225 			return -1;
1226 	}
1227 
1228 	return 0;
1229 }
1230 
1231 static int
1232 vhost_user_mmap_region(struct virtio_net *dev,
1233 		struct rte_vhost_mem_region *region,
1234 		uint32_t region_index,
1235 		uint64_t mmap_offset)
1236 {
1237 	void *mmap_addr;
1238 	uint64_t mmap_size;
1239 	uint64_t alignment;
1240 	int populate;
1241 	int ret;
1242 
1243 	/* Check for memory_size + mmap_offset overflow */
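	/* In unsigned 64-bit arithmetic, -region->size equals 2^64 - size, so
	 * "mmap_offset >= -region->size" is true exactly when
	 * mmap_offset + size would wrap past 2^64. Illustrative 8-bit analogue:
	 * with size = 0x10, -size = 0xf0, and any offset >= 0xf0 makes
	 * offset + size overflow.
	 */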
1244 	if (mmap_offset >= -region->size) {
1245 		VHOST_LOG_CONFIG(ERR,
1246 				"mmap_offset (%#"PRIx64") and memory_size "
1247 				"(%#"PRIx64") overflow\n",
1248 				mmap_offset, region->size);
1249 		return -1;
1250 	}
1251 
1252 	mmap_size = region->size + mmap_offset;
1253 
1254 	/* On older long-term Linux versions, such as 2.6.32 and 3.2.72, mmap()
1255 	 * without the MAP_ANONYMOUS flag must be called with a length argument
1256 	 * aligned with the hugepage size, or it will fail with EINVAL.
1257 	 *
1258 	 * To avoid failure, make sure the caller keeps the length aligned.
1259 	 */
1260 	alignment = get_blk_size(region->fd);
1261 	if (alignment == (uint64_t)-1) {
1262 		VHOST_LOG_CONFIG(ERR,
1263 				"couldn't get hugepage size through fstat\n");
1264 		return -1;
1265 	}
1266 	mmap_size = RTE_ALIGN_CEIL(mmap_size, alignment);
1267 	if (mmap_size == 0) {
1268 		/*
1269 		 * This can happen if the initial mmap_size + alignment overflows
1270 		 * uint64_t, which in turn can happen if either the mmap_size or
1271 		 * the alignment value is wrong.
1272 		 *
1273 		 * The kernel's mmap() implementation would return an error, but
1274 		 * it is better to catch it here and provide useful info in the logs.
1275 		 */
1276 		VHOST_LOG_CONFIG(ERR, "mmap size (0x%" PRIx64 ") "
1277 				"or alignment (0x%" PRIx64 ") is invalid\n",
1278 				region->size + mmap_offset, alignment);
1279 		return -1;
1280 	}
1281 
1282 	populate = dev->async_copy ? MAP_POPULATE : 0;
1283 	mmap_addr = mmap(NULL, mmap_size, PROT_READ | PROT_WRITE,
1284 			MAP_SHARED | populate, region->fd, 0);
1285 
1286 	if (mmap_addr == MAP_FAILED) {
1287 		VHOST_LOG_CONFIG(ERR, "mmap failed (%s).\n", strerror(errno));
1288 		return -1;
1289 	}
1290 
1291 	region->mmap_addr = mmap_addr;
1292 	region->mmap_size = mmap_size;
1293 	region->host_user_addr = (uint64_t)(uintptr_t)mmap_addr + mmap_offset;
1294 
1295 	if (dev->async_copy) {
1296 		if (add_guest_pages(dev, region, alignment) < 0) {
1297 			VHOST_LOG_CONFIG(ERR,
1298 					"adding guest pages to region failed.\n");
1299 			return -1;
1300 		}
1301 
1302 		if (rte_vfio_is_enabled("vfio")) {
1303 			ret = async_dma_map(region, &dev->async_map_status[region_index], true);
1304 			if (ret) {
1305 				VHOST_LOG_CONFIG(ERR, "Configure IOMMU for DMA "
1306 							"engine failed\n");
1307 				return -1;
1308 			}
1309 		}
1310 	}
1311 
1312 	VHOST_LOG_CONFIG(INFO,
1313 			"guest memory region size: 0x%" PRIx64 "\n"
1314 			"\t guest physical addr: 0x%" PRIx64 "\n"
1315 			"\t guest virtual  addr: 0x%" PRIx64 "\n"
1316 			"\t host  virtual  addr: 0x%" PRIx64 "\n"
1317 			"\t mmap addr : 0x%" PRIx64 "\n"
1318 			"\t mmap size : 0x%" PRIx64 "\n"
1319 			"\t mmap align: 0x%" PRIx64 "\n"
1320 			"\t mmap off  : 0x%" PRIx64 "\n",
1321 			region->size,
1322 			region->guest_phys_addr,
1323 			region->guest_user_addr,
1324 			region->host_user_addr,
1325 			(uint64_t)(uintptr_t)mmap_addr,
1326 			mmap_size,
1327 			alignment,
1328 			mmap_offset);
1329 
1330 	return 0;
1331 }
1332 
1333 static int
1334 vhost_user_set_mem_table(struct virtio_net **pdev, struct VhostUserMsg *msg,
1335 			int main_fd)
1336 {
1337 	struct virtio_net *dev = *pdev;
1338 	struct VhostUserMemory *memory = &msg->payload.memory;
1339 	struct rte_vhost_mem_region *reg;
1340 	int numa_node = SOCKET_ID_ANY;
1341 	uint64_t mmap_offset;
1342 	uint32_t i;
1343 	bool async_notify = false;
1344 
1345 	if (validate_msg_fds(msg, memory->nregions) != 0)
1346 		return RTE_VHOST_MSG_RESULT_ERR;
1347 
1348 	if (memory->nregions > VHOST_MEMORY_MAX_NREGIONS) {
1349 		VHOST_LOG_CONFIG(ERR,
1350 			"too many memory regions (%u)\n", memory->nregions);
1351 		goto close_msg_fds;
1352 	}
1353 
1354 	if (dev->mem && !vhost_memory_changed(memory, dev->mem)) {
1355 		VHOST_LOG_CONFIG(INFO,
1356 			"(%d) memory regions not changed\n", dev->vid);
1357 
1358 		close_msg_fds(msg);
1359 
1360 		return RTE_VHOST_MSG_RESULT_OK;
1361 	}
1362 
1363 	if (dev->mem) {
1364 		if (dev->flags & VIRTIO_DEV_VDPA_CONFIGURED) {
1365 			struct rte_vdpa_device *vdpa_dev = dev->vdpa_dev;
1366 
1367 			if (vdpa_dev && vdpa_dev->ops->dev_close)
1368 				vdpa_dev->ops->dev_close(dev->vid);
1369 			dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
1370 		}
1371 
1372 		/* notify the vhost application to stop DMA transfers */
1373 		if (dev->async_copy && dev->notify_ops->vring_state_changed) {
1374 			for (i = 0; i < dev->nr_vring; i++) {
1375 				dev->notify_ops->vring_state_changed(dev->vid,
1376 						i, 0);
1377 			}
1378 			async_notify = true;
1379 		}
1380 
1381 		free_mem_region(dev);
1382 		rte_free(dev->mem);
1383 		dev->mem = NULL;
1384 
1385 		if (dev->async_map_status) {
1386 			rte_free(dev->async_map_status);
1387 			dev->async_map_status = NULL;
1388 		}
1389 	}
1390 
1391 	/* Flush IOTLB cache as previous HVAs are now invalid */
1392 	if (dev->features & (1ULL << VIRTIO_F_IOMMU_PLATFORM))
1393 		for (i = 0; i < dev->nr_vring; i++)
1394 			vhost_user_iotlb_flush_all(dev->virtqueue[i]);
1395 
1396 	/*
1397 	 * If VQ 0 has already been allocated, try to allocate on the same
1398 	 * NUMA node. It can be reallocated later in numa_realloc().
1399 	 */
1400 	if (dev->nr_vring > 0)
1401 		numa_node = dev->virtqueue[0]->numa_node;
1402 
1403 	dev->nr_guest_pages = 0;
1404 	if (dev->guest_pages == NULL) {
1405 		dev->max_guest_pages = 8;
1406 		dev->guest_pages = rte_zmalloc_socket(NULL,
1407 					dev->max_guest_pages *
1408 					sizeof(struct guest_page),
1409 					RTE_CACHE_LINE_SIZE,
1410 					numa_node);
1411 		if (dev->guest_pages == NULL) {
1412 			VHOST_LOG_CONFIG(ERR,
1413 				"(%d) failed to allocate memory "
1414 				"for dev->guest_pages\n",
1415 				dev->vid);
1416 			goto close_msg_fds;
1417 		}
1418 	}
1419 
1420 	dev->mem = rte_zmalloc_socket("vhost-mem-table", sizeof(struct rte_vhost_memory) +
1421 		sizeof(struct rte_vhost_mem_region) * memory->nregions, 0, numa_node);
1422 	if (dev->mem == NULL) {
1423 		VHOST_LOG_CONFIG(ERR,
1424 			"(%d) failed to allocate memory for dev->mem\n",
1425 			dev->vid);
1426 		goto free_guest_pages;
1427 	}
1428 
1429 	if (dev->async_copy) {
1430 		dev->async_map_status = rte_zmalloc_socket("async-dma-map-status",
1431 					sizeof(bool) * memory->nregions, 0, numa_node);
1432 		if (!dev->async_map_status) {
1433 			VHOST_LOG_CONFIG(ERR,
1434 				"(%d) failed to allocate memory for dma mapping status\n",
1435 				dev->vid);
1436 			goto free_mem_table;
1437 		}
1438 	}
1439 
1440 	for (i = 0; i < memory->nregions; i++) {
1441 		reg = &dev->mem->regions[i];
1442 
1443 		reg->guest_phys_addr = memory->regions[i].guest_phys_addr;
1444 		reg->guest_user_addr = memory->regions[i].userspace_addr;
1445 		reg->size            = memory->regions[i].memory_size;
1446 		reg->fd              = msg->fds[i];
1447 
1448 		/*
1449 		 * Assign invalid file descriptor value to avoid double
1450 		 * closing on error path.
1451 		 */
1452 		msg->fds[i] = -1;
1453 
1454 		mmap_offset = memory->regions[i].mmap_offset;
1455 
1456 		if (vhost_user_mmap_region(dev, reg, i, mmap_offset) < 0) {
1457 			VHOST_LOG_CONFIG(ERR, "Failed to mmap region %u\n", i);
1458 			goto free_mem_table;
1459 		}
1460 
1461 		dev->mem->nregions++;
1462 	}
1463 
1464 	if (vhost_user_postcopy_register(dev, main_fd, msg) < 0)
1465 		goto free_mem_table;
1466 
1467 	for (i = 0; i < dev->nr_vring; i++) {
1468 		struct vhost_virtqueue *vq = dev->virtqueue[i];
1469 
1470 		if (!vq)
1471 			continue;
1472 
1473 		if (vq->desc || vq->avail || vq->used) {
1474 			/*
1475 			 * If the memory table got updated, the ring addresses
1476 			 * need to be translated again as virtual addresses have
1477 			 * changed.
1478 			 */
1479 			vring_invalidate(dev, vq);
1480 
1481 			dev = translate_ring_addresses(dev, i);
1482 			if (!dev) {
1483 				dev = *pdev;
1484 				goto free_mem_table;
1485 			}
1486 
1487 			*pdev = dev;
1488 		}
1489 	}
1490 
1491 	dump_guest_pages(dev);
1492 
1493 	if (async_notify) {
1494 		for (i = 0; i < dev->nr_vring; i++)
1495 			dev->notify_ops->vring_state_changed(dev->vid, i, 1);
1496 	}
1497 
1498 	return RTE_VHOST_MSG_RESULT_OK;
1499 
1500 free_mem_table:
1501 	free_mem_region(dev);
1502 	rte_free(dev->mem);
1503 	dev->mem = NULL;
1504 	if (dev->async_map_status) {
1505 		rte_free(dev->async_map_status);
1506 		dev->async_map_status = NULL;
1507 	}
1508 free_guest_pages:
1509 	rte_free(dev->guest_pages);
1510 	dev->guest_pages = NULL;
1511 close_msg_fds:
1512 	close_msg_fds(msg);
1513 	return RTE_VHOST_MSG_RESULT_ERR;
1514 }
1515 
1516 static bool
1517 vq_is_ready(struct virtio_net *dev, struct vhost_virtqueue *vq)
1518 {
1519 	bool rings_ok;
1520 
1521 	if (!vq)
1522 		return false;
1523 
1524 	if (vq_is_packed(dev))
1525 		rings_ok = vq->desc_packed && vq->driver_event &&
1526 			vq->device_event;
1527 	else
1528 		rings_ok = vq->desc && vq->avail && vq->used;
1529 
1530 	return rings_ok &&
1531 	       vq->kickfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1532 	       vq->callfd != VIRTIO_UNINITIALIZED_EVENTFD &&
1533 	       vq->enabled;
1534 }
1535 
1536 #define VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY 2u
1537 
1538 static int
1539 virtio_is_ready(struct virtio_net *dev)
1540 {
1541 	struct vhost_virtqueue *vq;
1542 	uint32_t i, nr_vring = dev->nr_vring;
1543 
1544 	if (dev->flags & VIRTIO_DEV_READY)
1545 		return 1;
1546 
1547 	if (!dev->nr_vring)
1548 		return 0;
1549 
1550 	if (dev->flags & VIRTIO_DEV_BUILTIN_VIRTIO_NET) {
1551 		nr_vring = VIRTIO_BUILTIN_NUM_VQS_TO_BE_READY;
1552 
1553 		if (dev->nr_vring < nr_vring)
1554 			return 0;
1555 	}
1556 
1557 	for (i = 0; i < nr_vring; i++) {
1558 		vq = dev->virtqueue[i];
1559 
1560 		if (!vq_is_ready(dev, vq))
1561 			return 0;
1562 	}
1563 
1564 	/* If supported, ensure the frontend is really done with config */
1565 	if (dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_STATUS))
1566 		if (!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK))
1567 			return 0;
1568 
1569 	dev->flags |= VIRTIO_DEV_READY;
1570 
1571 	if (!(dev->flags & VIRTIO_DEV_RUNNING))
1572 		VHOST_LOG_CONFIG(INFO,
1573 			"virtio is now ready for processing.\n");
1574 	return 1;
1575 }
1576 
1577 static void *
1578 inflight_mem_alloc(const char *name, size_t size, int *fd)
1579 {
1580 	void *ptr;
1581 	int mfd = -1;
1582 	char fname[20] = "/tmp/memfd-XXXXXX";
1583 
1584 	*fd = -1;
1585 #ifdef MEMFD_SUPPORTED
1586 	mfd = memfd_create(name, MFD_CLOEXEC);
1587 #else
1588 	RTE_SET_USED(name);
1589 #endif
1590 	if (mfd == -1) {
1591 		mfd = mkstemp(fname);
1592 		if (mfd == -1) {
1593 			VHOST_LOG_CONFIG(ERR,
1594 				"failed to get inflight buffer fd\n");
1595 			return NULL;
1596 		}
1597 
1598 		unlink(fname);
1599 	}
1600 
1601 	if (ftruncate(mfd, size) == -1) {
1602 		VHOST_LOG_CONFIG(ERR,
1603 			"failed to alloc inflight buffer\n");
1604 		close(mfd);
1605 		return NULL;
1606 	}
1607 
1608 	ptr = mmap(0, size, PROT_READ | PROT_WRITE, MAP_SHARED, mfd, 0);
1609 	if (ptr == MAP_FAILED) {
1610 		VHOST_LOG_CONFIG(ERR,
1611 			"failed to mmap inflight buffer\n");
1612 		close(mfd);
1613 		return NULL;
1614 	}
1615 
1616 	*fd = mfd;
1617 	return ptr;
1618 }
1619 
1620 static uint32_t
1621 get_pervq_shm_size_split(uint16_t queue_size)
1622 {
1623 	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_split) *
1624 				  queue_size + sizeof(uint64_t) +
1625 				  sizeof(uint16_t) * 4, INFLIGHT_ALIGNMENT);
1626 }
1627 
1628 static uint32_t
1629 get_pervq_shm_size_packed(uint16_t queue_size)
1630 {
1631 	return RTE_ALIGN_MUL_CEIL(sizeof(struct rte_vhost_inflight_desc_packed)
1632 				  * queue_size + sizeof(uint64_t) +
1633 				  sizeof(uint16_t) * 6 + sizeof(uint8_t) * 9,
1634 				  INFLIGHT_ALIGNMENT);
1635 }
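
/* Sizing note (illustrative): the fixed parts of the formulas above correspond
 * to the header fields of the split/packed inflight layouts (a 64-bit features
 * field plus a handful of 16-bit and 8-bit bookkeeping fields), and
 * RTE_ALIGN_MUL_CEIL() rounds the per-queue total up to the next multiple of
 * INFLIGHT_ALIGNMENT (64); e.g. a raw size of 4112 bytes is padded to 4160.
 */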
1636 
1637 static int
1638 vhost_user_get_inflight_fd(struct virtio_net **pdev,
1639 			   VhostUserMsg *msg,
1640 			   int main_fd __rte_unused)
1641 {
1642 	struct rte_vhost_inflight_info_packed *inflight_packed;
1643 	uint64_t pervq_inflight_size, mmap_size;
1644 	uint16_t num_queues, queue_size;
1645 	struct virtio_net *dev = *pdev;
1646 	int fd, i, j;
1647 	int numa_node = SOCKET_ID_ANY;
1648 	void *addr;
1649 
1650 	if (msg->size != sizeof(msg->payload.inflight)) {
1651 		VHOST_LOG_CONFIG(ERR,
1652 			"invalid get_inflight_fd message size is %d\n",
1653 			msg->size);
1654 		return RTE_VHOST_MSG_RESULT_ERR;
1655 	}
1656 
1657 	/*
1658 	 * If VQ 0 has already been allocated, try to allocate on the same
1659 	 * NUMA node. It can be reallocated later in numa_realloc().
1660 	 */
1661 	if (dev->nr_vring > 0)
1662 		numa_node = dev->virtqueue[0]->numa_node;
1663 
1664 	if (dev->inflight_info == NULL) {
1665 		dev->inflight_info = rte_zmalloc_socket("inflight_info",
1666 				sizeof(struct inflight_mem_info), 0, numa_node);
1667 		if (!dev->inflight_info) {
1668 			VHOST_LOG_CONFIG(ERR,
1669 				"failed to alloc dev inflight area\n");
1670 			return RTE_VHOST_MSG_RESULT_ERR;
1671 		}
1672 		dev->inflight_info->fd = -1;
1673 	}
1674 
1675 	num_queues = msg->payload.inflight.num_queues;
1676 	queue_size = msg->payload.inflight.queue_size;
1677 
1678 	VHOST_LOG_CONFIG(INFO, "get_inflight_fd num_queues: %u\n",
1679 		msg->payload.inflight.num_queues);
1680 	VHOST_LOG_CONFIG(INFO, "get_inflight_fd queue_size: %u\n",
1681 		msg->payload.inflight.queue_size);
1682 
1683 	if (vq_is_packed(dev))
1684 		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
1685 	else
1686 		pervq_inflight_size = get_pervq_shm_size_split(queue_size);
1687 
1688 	mmap_size = num_queues * pervq_inflight_size;
1689 	addr = inflight_mem_alloc("vhost-inflight", mmap_size, &fd);
1690 	if (!addr) {
1691 		VHOST_LOG_CONFIG(ERR,
1692 			"failed to alloc vhost inflight area\n");
1693 		msg->payload.inflight.mmap_size = 0;
1694 		return RTE_VHOST_MSG_RESULT_ERR;
1695 	}
1696 	memset(addr, 0, mmap_size);
1697 
1698 	if (dev->inflight_info->addr) {
1699 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
1700 		dev->inflight_info->addr = NULL;
1701 	}
1702 
1703 	if (dev->inflight_info->fd >= 0) {
1704 		close(dev->inflight_info->fd);
1705 		dev->inflight_info->fd = -1;
1706 	}
1707 
1708 	dev->inflight_info->addr = addr;
1709 	dev->inflight_info->size = msg->payload.inflight.mmap_size = mmap_size;
1710 	dev->inflight_info->fd = msg->fds[0] = fd;
1711 	msg->payload.inflight.mmap_offset = 0;
1712 	msg->fd_num = 1;
1713 
1714 	if (vq_is_packed(dev)) {
1715 		for (i = 0; i < num_queues; i++) {
1716 			inflight_packed =
1717 				(struct rte_vhost_inflight_info_packed *)addr;
1718 			inflight_packed->used_wrap_counter = 1;
1719 			inflight_packed->old_used_wrap_counter = 1;
1720 			for (j = 0; j < queue_size; j++)
1721 				inflight_packed->desc[j].next = j + 1;
1722 			addr = (void *)((char *)addr + pervq_inflight_size);
1723 		}
1724 	}
1725 
1726 	VHOST_LOG_CONFIG(INFO,
1727 		"send inflight mmap_size: %"PRIu64"\n",
1728 		msg->payload.inflight.mmap_size);
1729 	VHOST_LOG_CONFIG(INFO,
1730 		"send inflight mmap_offset: %"PRIu64"\n",
1731 		msg->payload.inflight.mmap_offset);
1732 	VHOST_LOG_CONFIG(INFO,
1733 		"send inflight fd: %d\n", msg->fds[0]);
1734 
1735 	return RTE_VHOST_MSG_RESULT_REPLY;
1736 }
1737 
1738 static int
1739 vhost_user_set_inflight_fd(struct virtio_net **pdev, VhostUserMsg *msg,
1740 			   int main_fd __rte_unused)
1741 {
1742 	uint64_t mmap_size, mmap_offset;
1743 	uint16_t num_queues, queue_size;
1744 	struct virtio_net *dev = *pdev;
1745 	uint32_t pervq_inflight_size;
1746 	struct vhost_virtqueue *vq;
1747 	void *addr;
1748 	int fd, i;
1749 	int numa_node = SOCKET_ID_ANY;
1750 
1751 	fd = msg->fds[0];
1752 	if (msg->size != sizeof(msg->payload.inflight) || fd < 0) {
1753 		VHOST_LOG_CONFIG(ERR,
1754 			"invalid set_inflight_fd message size is %d, fd is %d\n",
1755 			msg->size, fd);
1756 		return RTE_VHOST_MSG_RESULT_ERR;
1757 	}
1758 
1759 	mmap_size = msg->payload.inflight.mmap_size;
1760 	mmap_offset = msg->payload.inflight.mmap_offset;
1761 	num_queues = msg->payload.inflight.num_queues;
1762 	queue_size = msg->payload.inflight.queue_size;
1763 
1764 	if (vq_is_packed(dev))
1765 		pervq_inflight_size = get_pervq_shm_size_packed(queue_size);
1766 	else
1767 		pervq_inflight_size = get_pervq_shm_size_split(queue_size);
1768 
1769 	VHOST_LOG_CONFIG(INFO,
1770 		"set_inflight_fd mmap_size: %"PRIu64"\n", mmap_size);
1771 	VHOST_LOG_CONFIG(INFO,
1772 		"set_inflight_fd mmap_offset: %"PRIu64"\n", mmap_offset);
1773 	VHOST_LOG_CONFIG(INFO,
1774 		"set_inflight_fd num_queues: %u\n", num_queues);
1775 	VHOST_LOG_CONFIG(INFO,
1776 		"set_inflight_fd queue_size: %u\n", queue_size);
1777 	VHOST_LOG_CONFIG(INFO,
1778 		"set_inflight_fd fd: %d\n", fd);
1779 	VHOST_LOG_CONFIG(INFO,
1780 		"set_inflight_fd pervq_inflight_size: %d\n",
1781 		pervq_inflight_size);
1782 
1783 	/*
1784 	 * If VQ 0 has already been allocated, try to allocate on the same
1785 	 * NUMA node. It can be reallocated later in numa_realloc().
1786 	 */
1787 	if (dev->nr_vring > 0)
1788 		numa_node = dev->virtqueue[0]->numa_node;
1789 
1790 	if (!dev->inflight_info) {
1791 		dev->inflight_info = rte_zmalloc_socket("inflight_info",
1792 				sizeof(struct inflight_mem_info), 0, numa_node);
1793 		if (dev->inflight_info == NULL) {
1794 			VHOST_LOG_CONFIG(ERR,
1795 				"failed to alloc dev inflight area\n");
1796 			return RTE_VHOST_MSG_RESULT_ERR;
1797 		}
1798 		dev->inflight_info->fd = -1;
1799 	}
1800 
1801 	if (dev->inflight_info->addr) {
1802 		munmap(dev->inflight_info->addr, dev->inflight_info->size);
1803 		dev->inflight_info->addr = NULL;
1804 	}
1805 
1806 	addr = mmap(0, mmap_size, PROT_READ | PROT_WRITE, MAP_SHARED,
1807 		    fd, mmap_offset);
1808 	if (addr == MAP_FAILED) {
1809 		VHOST_LOG_CONFIG(ERR, "failed to mmap shared memory.\n");
1810 		return RTE_VHOST_MSG_RESULT_ERR;
1811 	}
1812 
1813 	if (dev->inflight_info->fd >= 0) {
1814 		close(dev->inflight_info->fd);
1815 		dev->inflight_info->fd = -1;
1816 	}
1817 
1818 	dev->inflight_info->fd = fd;
1819 	dev->inflight_info->addr = addr;
1820 	dev->inflight_info->size = mmap_size;
1821 
1822 	for (i = 0; i < num_queues; i++) {
1823 		vq = dev->virtqueue[i];
1824 		if (!vq)
1825 			continue;
1826 
1827 		if (vq_is_packed(dev)) {
1828 			vq->inflight_packed = addr;
1829 			vq->inflight_packed->desc_num = queue_size;
1830 		} else {
1831 			vq->inflight_split = addr;
1832 			vq->inflight_split->desc_num = queue_size;
1833 		}
1834 		addr = (void *)((char *)addr + pervq_inflight_size);
1835 	}
1836 
1837 	return RTE_VHOST_MSG_RESULT_OK;
1838 }
1839 
1840 static int
1841 vhost_user_set_vring_call(struct virtio_net **pdev, struct VhostUserMsg *msg,
1842 			int main_fd __rte_unused)
1843 {
1844 	struct virtio_net *dev = *pdev;
1845 	struct vhost_vring_file file;
1846 	struct vhost_virtqueue *vq;
1847 	int expected_fds;
1848 
1849 	expected_fds = (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
1850 	if (validate_msg_fds(msg, expected_fds) != 0)
1851 		return RTE_VHOST_MSG_RESULT_ERR;
1852 
1853 	file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
1854 	if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
1855 		file.fd = VIRTIO_INVALID_EVENTFD;
1856 	else
1857 		file.fd = msg->fds[0];
1858 	VHOST_LOG_CONFIG(INFO,
1859 		"vring call idx:%d file:%d\n", file.index, file.fd);
1860 
1861 	vq = dev->virtqueue[file.index];
1862 
1863 	if (vq->ready) {
1864 		vq->ready = false;
1865 		vhost_user_notify_queue_state(dev, file.index, 0);
1866 	}
1867 
1868 	if (vq->callfd >= 0)
1869 		close(vq->callfd);
1870 
1871 	vq->callfd = file.fd;
1872 
1873 	return RTE_VHOST_MSG_RESULT_OK;
1874 }
1875 
1876 static int vhost_user_set_vring_err(struct virtio_net **pdev __rte_unused,
1877 			struct VhostUserMsg *msg,
1878 			int main_fd __rte_unused)
1879 {
1880 	int expected_fds;
1881 
1882 	expected_fds = (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
1883 	if (validate_msg_fds(msg, expected_fds) != 0)
1884 		return RTE_VHOST_MSG_RESULT_ERR;
1885 
1886 	if (!(msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK))
1887 		close(msg->fds[0]);
1888 	VHOST_LOG_CONFIG(INFO, "not implemented\n");
1889 
1890 	return RTE_VHOST_MSG_RESULT_OK;
1891 }
1892 
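/*
 * qsort() comparator sorting resubmit descriptors by descending counter, so
 * that resubmit_list[0] ends up holding the most recent in-flight entry,
 * from which the new global_counter is derived.
 */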
1893 static int
1894 resubmit_desc_compare(const void *a, const void *b)
1895 {
1896 	const struct rte_vhost_resubmit_desc *desc0 = a;
1897 	const struct rte_vhost_resubmit_desc *desc1 = b;
1898 
1899 	if (desc1->counter > desc0->counter)
1900 		return 1;
1901 
1902 	return -1;
1903 }
1904 
1905 static int
1906 vhost_check_queue_inflights_split(struct virtio_net *dev,
1907 				  struct vhost_virtqueue *vq)
1908 {
1909 	uint16_t i;
1910 	uint16_t resubmit_num = 0, last_io, num;
1911 	struct vring_used *used = vq->used;
1912 	struct rte_vhost_resubmit_info *resubmit;
1913 	struct rte_vhost_inflight_info_split *inflight_split;
1914 
1915 	if (!(dev->protocol_features &
1916 	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
1917 		return RTE_VHOST_MSG_RESULT_OK;
1918 
1919 	/* The frontend may still not support the inflight feature
1920 	 * even though the protocol feature was negotiated.
1921 	 */
1922 	if (!vq->inflight_split)
1923 		return RTE_VHOST_MSG_RESULT_OK;
1924 
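	/*
	 * A zero version means the shared region was freshly allocated and
	 * carries no previous in-flight state: stamp it and skip resubmission.
	 */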
1925 	if (!vq->inflight_split->version) {
1926 		vq->inflight_split->version = INFLIGHT_VERSION;
1927 		return RTE_VHOST_MSG_RESULT_OK;
1928 	}
1929 
1930 	if (vq->resubmit_inflight)
1931 		return RTE_VHOST_MSG_RESULT_OK;
1932 
1933 	inflight_split = vq->inflight_split;
1934 	vq->global_counter = 0;
1935 	last_io = inflight_split->last_inflight_io;
1936 
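	/*
	 * If the shadow used index lags the real used ring, the previous
	 * backend most likely crashed after updating the used ring but before
	 * clearing the in-flight flag: treat that last in-flight entry as
	 * completed and resync the shadow index.
	 */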
1937 	if (inflight_split->used_idx != used->idx) {
1938 		inflight_split->desc[last_io].inflight = 0;
1939 		rte_atomic_thread_fence(__ATOMIC_SEQ_CST);
1940 		inflight_split->used_idx = used->idx;
1941 	}
1942 
1943 	for (i = 0; i < inflight_split->desc_num; i++) {
1944 		if (inflight_split->desc[i].inflight == 1)
1945 			resubmit_num++;
1946 	}
1947 
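	/*
	 * Descriptors still marked in-flight were presumably already popped
	 * from the avail ring before the crash; advance last_avail_idx past
	 * them so they are handled via the resubmit list built below rather
	 * than fetched from the avail ring again.
	 */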
1948 	vq->last_avail_idx += resubmit_num;
1949 
1950 	if (resubmit_num) {
1951 		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
1952 				0, vq->numa_node);
1953 		if (!resubmit) {
1954 			VHOST_LOG_CONFIG(ERR,
1955 				"failed to allocate memory for resubmit info.\n");
1956 			return RTE_VHOST_MSG_RESULT_ERR;
1957 		}
1958 
1959 		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
1960 				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
1961 				0, vq->numa_node);
1962 		if (!resubmit->resubmit_list) {
1963 			VHOST_LOG_CONFIG(ERR,
1964 				"failed to allocate memory for inflight desc.\n");
1965 			rte_free(resubmit);
1966 			return RTE_VHOST_MSG_RESULT_ERR;
1967 		}
1968 
1969 		num = 0;
1970 		for (i = 0; i < vq->inflight_split->desc_num; i++) {
1971 			if (vq->inflight_split->desc[i].inflight == 1) {
1972 				resubmit->resubmit_list[num].index = i;
1973 				resubmit->resubmit_list[num].counter =
1974 					inflight_split->desc[i].counter;
1975 				num++;
1976 			}
1977 		}
1978 		resubmit->resubmit_num = num;
1979 
1980 		if (resubmit->resubmit_num > 1)
1981 			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
1982 			      sizeof(struct rte_vhost_resubmit_desc),
1983 			      resubmit_desc_compare);
1984 
1985 		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
1986 		vq->resubmit_inflight = resubmit;
1987 	}
1988 
1989 	return RTE_VHOST_MSG_RESULT_OK;
1990 }
1991 
1992 static int
1993 vhost_check_queue_inflights_packed(struct virtio_net *dev,
1994 				   struct vhost_virtqueue *vq)
1995 {
1996 	uint16_t i;
1997 	uint16_t resubmit_num = 0, old_used_idx, num;
1998 	struct rte_vhost_resubmit_info *resubmit;
1999 	struct rte_vhost_inflight_info_packed *inflight_packed;
2000 
2001 	if (!(dev->protocol_features &
2002 	    (1ULL << VHOST_USER_PROTOCOL_F_INFLIGHT_SHMFD)))
2003 		return RTE_VHOST_MSG_RESULT_OK;
2004 
2005 	/* The frontend may still not support the inflight feature
2006 	 * even though the protocol feature was negotiated.
2007 	 */
2008 	if (!vq->inflight_packed)
2009 		return RTE_VHOST_MSG_RESULT_OK;
2010 
2011 	if (!vq->inflight_packed->version) {
2012 		vq->inflight_packed->version = INFLIGHT_VERSION;
2013 		return RTE_VHOST_MSG_RESULT_OK;
2014 	}
2015 
2016 	if (vq->resubmit_inflight)
2017 		return RTE_VHOST_MSG_RESULT_OK;
2018 
2019 	inflight_packed = vq->inflight_packed;
2020 	vq->global_counter = 0;
2021 	old_used_idx = inflight_packed->old_used_idx;
2022 
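	/*
	 * A mismatch means the previous backend most likely crashed while
	 * completing a descriptor. If that descriptor is no longer marked
	 * in-flight, the update finished: roll the old_* snapshot forward.
	 * Otherwise the update was torn: roll the live fields back to the
	 * snapshot.
	 */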
2023 	if (inflight_packed->used_idx != old_used_idx) {
2024 		if (inflight_packed->desc[old_used_idx].inflight == 0) {
2025 			inflight_packed->old_used_idx =
2026 				inflight_packed->used_idx;
2027 			inflight_packed->old_used_wrap_counter =
2028 				inflight_packed->used_wrap_counter;
2029 			inflight_packed->old_free_head =
2030 				inflight_packed->free_head;
2031 		} else {
2032 			inflight_packed->used_idx =
2033 				inflight_packed->old_used_idx;
2034 			inflight_packed->used_wrap_counter =
2035 				inflight_packed->old_used_wrap_counter;
2036 			inflight_packed->free_head =
2037 				inflight_packed->old_free_head;
2038 		}
2039 	}
2040 
2041 	for (i = 0; i < inflight_packed->desc_num; i++) {
2042 		if (inflight_packed->desc[i].inflight == 1)
2043 			resubmit_num++;
2044 	}
2045 
2046 	if (resubmit_num) {
2047 		resubmit = rte_zmalloc_socket("resubmit", sizeof(struct rte_vhost_resubmit_info),
2048 				0, vq->numa_node);
2049 		if (resubmit == NULL) {
2050 			VHOST_LOG_CONFIG(ERR,
2051 				"failed to allocate memory for resubmit info.\n");
2052 			return RTE_VHOST_MSG_RESULT_ERR;
2053 		}
2054 
2055 		resubmit->resubmit_list = rte_zmalloc_socket("resubmit_list",
2056 				resubmit_num * sizeof(struct rte_vhost_resubmit_desc),
2057 				0, vq->numa_node);
2058 		if (resubmit->resubmit_list == NULL) {
2059 			VHOST_LOG_CONFIG(ERR,
2060 				"failed to allocate memory for resubmit desc.\n");
2061 			rte_free(resubmit);
2062 			return RTE_VHOST_MSG_RESULT_ERR;
2063 		}
2064 
2065 		num = 0;
2066 		for (i = 0; i < inflight_packed->desc_num; i++) {
2067 			if (vq->inflight_packed->desc[i].inflight == 1) {
2068 				resubmit->resubmit_list[num].index = i;
2069 				resubmit->resubmit_list[num].counter =
2070 					inflight_packed->desc[i].counter;
2071 				num++;
2072 			}
2073 		}
2074 		resubmit->resubmit_num = num;
2075 
2076 		if (resubmit->resubmit_num > 1)
2077 			qsort(resubmit->resubmit_list, resubmit->resubmit_num,
2078 			      sizeof(struct rte_vhost_resubmit_desc),
2079 			      resubmit_desc_compare);
2080 
2081 		vq->global_counter = resubmit->resubmit_list[0].counter + 1;
2082 		vq->resubmit_inflight = resubmit;
2083 	}
2084 
2085 	return RTE_VHOST_MSG_RESULT_OK;
2086 }
2087 
2088 static int
2089 vhost_user_set_vring_kick(struct virtio_net **pdev, struct VhostUserMsg *msg,
2090 			int main_fd __rte_unused)
2091 {
2092 	struct virtio_net *dev = *pdev;
2093 	struct vhost_vring_file file;
2094 	struct vhost_virtqueue *vq;
2095 	int expected_fds;
2096 
2097 	expected_fds = (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK) ? 0 : 1;
2098 	if (validate_msg_fds(msg, expected_fds) != 0)
2099 		return RTE_VHOST_MSG_RESULT_ERR;
2100 
2101 	file.index = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
2102 	if (msg->payload.u64 & VHOST_USER_VRING_NOFD_MASK)
2103 		file.fd = VIRTIO_INVALID_EVENTFD;
2104 	else
2105 		file.fd = msg->fds[0];
2106 	VHOST_LOG_CONFIG(INFO,
2107 		"vring kick idx:%d file:%d\n", file.index, file.fd);
2108 
2109 	/* Interpret ring addresses only when ring is started. */
2110 	dev = translate_ring_addresses(dev, file.index);
2111 	if (!dev) {
2112 		if (file.fd != VIRTIO_INVALID_EVENTFD)
2113 			close(file.fd);
2114 
2115 		return RTE_VHOST_MSG_RESULT_ERR;
2116 	}
2117 
2118 	*pdev = dev;
2119 
2120 	vq = dev->virtqueue[file.index];
2121 
2122 	/*
2123 	 * When VHOST_USER_F_PROTOCOL_FEATURES is not negotiated,
2124 	 * the ring starts already enabled. Otherwise, it is enabled via
2125 	 * the SET_VRING_ENABLE message.
2126 	 */
2127 	if (!(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
2128 		vq->enabled = true;
2129 	}
2130 
2131 	if (vq->ready) {
2132 		vq->ready = false;
2133 		vhost_user_notify_queue_state(dev, file.index, 0);
2134 	}
2135 
2136 	if (vq->kickfd >= 0)
2137 		close(vq->kickfd);
2138 	vq->kickfd = file.fd;
2139 
2140 	if (vq_is_packed(dev)) {
2141 		if (vhost_check_queue_inflights_packed(dev, vq)) {
2142 			VHOST_LOG_CONFIG(ERR,
2143 				"failed to check in-flight descriptors for vq: %d\n", file.index);
2144 			return RTE_VHOST_MSG_RESULT_ERR;
2145 		}
2146 	} else {
2147 		if (vhost_check_queue_inflights_split(dev, vq)) {
2148 			VHOST_LOG_CONFIG(ERR,
2149 				"failed to check in-flight descriptors for vq: %d\n", file.index);
2150 			return RTE_VHOST_MSG_RESULT_ERR;
2151 		}
2152 	}
2153 
2154 	return RTE_VHOST_MSG_RESULT_OK;
2155 }
2156 
2157 /*
2158  * When virtio is stopped, QEMU sends us the GET_VRING_BASE message.
2159  */
2160 static int
2161 vhost_user_get_vring_base(struct virtio_net **pdev,
2162 			struct VhostUserMsg *msg,
2163 			int main_fd __rte_unused)
2164 {
2165 	struct virtio_net *dev = *pdev;
2166 	struct vhost_virtqueue *vq = dev->virtqueue[msg->payload.state.index];
2167 	uint64_t val;
2168 
2169 	if (validate_msg_fds(msg, 0) != 0)
2170 		return RTE_VHOST_MSG_RESULT_ERR;
2171 
2172 	/* We have to stop the queue (virtio) if it is running. */
2173 	vhost_destroy_device_notify(dev);
2174 
2175 	dev->flags &= ~VIRTIO_DEV_READY;
2176 	dev->flags &= ~VIRTIO_DEV_VDPA_CONFIGURED;
2177 
2178 	/* Here we are safe to get the indexes */
2179 	if (vq_is_packed(dev)) {
2180 		/*
2181 		 * Bit[0:14]: avail index
2182 		 * Bit[15]: avail wrap counter
2183 		 */
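		/* e.g. last_avail_idx 0x12 with the wrap counter set encodes as 0x8012 */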
2184 		val = vq->last_avail_idx & 0x7fff;
2185 		val |= vq->avail_wrap_counter << 15;
2186 		msg->payload.state.num = val;
2187 	} else {
2188 		msg->payload.state.num = vq->last_avail_idx;
2189 	}
2190 
2191 	VHOST_LOG_CONFIG(INFO,
2192 		"vring base idx:%d file:%d\n", msg->payload.state.index,
2193 		msg->payload.state.num);
2194 	/*
2195 	 * Based on the current QEMU vhost-user implementation, this message
2196 	 * is only ever sent from vhost_vring_stop.
2197 	 * TODO: clean up the vring, it is not usable from this point on.
2198 	 */
2199 	if (vq->kickfd >= 0)
2200 		close(vq->kickfd);
2201 
2202 	vq->kickfd = VIRTIO_UNINITIALIZED_EVENTFD;
2203 
2204 	if (vq->callfd >= 0)
2205 		close(vq->callfd);
2206 
2207 	vq->callfd = VIRTIO_UNINITIALIZED_EVENTFD;
2208 
2209 	vq->signalled_used_valid = false;
2210 
2211 	if (vq_is_packed(dev)) {
2212 		rte_free(vq->shadow_used_packed);
2213 		vq->shadow_used_packed = NULL;
2214 	} else {
2215 		rte_free(vq->shadow_used_split);
2216 		vq->shadow_used_split = NULL;
2217 	}
2218 
2219 	rte_free(vq->batch_copy_elems);
2220 	vq->batch_copy_elems = NULL;
2221 
2222 	rte_free(vq->log_cache);
2223 	vq->log_cache = NULL;
2224 
2225 	msg->size = sizeof(msg->payload.state);
2226 	msg->fd_num = 0;
2227 
2228 	vhost_user_iotlb_flush_all(vq);
2229 
2230 	vring_invalidate(dev, vq);
2231 
2232 	return RTE_VHOST_MSG_RESULT_REPLY;
2233 }
2234 
2235 /*
2236  * When the virtio queues are ready to work, QEMU sends us a message to
2237  * enable the virtio queue pair.
2238  */
2239 static int
2240 vhost_user_set_vring_enable(struct virtio_net **pdev,
2241 			struct VhostUserMsg *msg,
2242 			int main_fd __rte_unused)
2243 {
2244 	struct virtio_net *dev = *pdev;
2245 	bool enable = !!msg->payload.state.num;
2246 	int index = (int)msg->payload.state.index;
2247 
2248 	if (validate_msg_fds(msg, 0) != 0)
2249 		return RTE_VHOST_MSG_RESULT_ERR;
2250 
2251 	VHOST_LOG_CONFIG(INFO,
2252 		"set queue enable: %d to qp idx: %d\n",
2253 		enable, index);
2254 
2255 	if (enable && dev->virtqueue[index]->async_registered) {
2256 		if (dev->virtqueue[index]->async_pkts_inflight_n) {
2257 			VHOST_LOG_CONFIG(ERR, "failed to enable vring. "
2258 			"async inflight packets must be completed first\n");
2259 			return RTE_VHOST_MSG_RESULT_ERR;
2260 		}
2261 	}
2262 
2263 	dev->virtqueue[index]->enabled = enable;
2264 
2265 	return RTE_VHOST_MSG_RESULT_OK;
2266 }
2267 
2268 static int
2269 vhost_user_get_protocol_features(struct virtio_net **pdev,
2270 			struct VhostUserMsg *msg,
2271 			int main_fd __rte_unused)
2272 {
2273 	struct virtio_net *dev = *pdev;
2274 	uint64_t features, protocol_features;
2275 
2276 	if (validate_msg_fds(msg, 0) != 0)
2277 		return RTE_VHOST_MSG_RESULT_ERR;
2278 
2279 	rte_vhost_driver_get_features(dev->ifname, &features);
2280 	rte_vhost_driver_get_protocol_features(dev->ifname, &protocol_features);
2281 
2282 	msg->payload.u64 = protocol_features;
2283 	msg->size = sizeof(msg->payload.u64);
2284 	msg->fd_num = 0;
2285 
2286 	return RTE_VHOST_MSG_RESULT_REPLY;
2287 }
2288 
2289 static int
2290 vhost_user_set_protocol_features(struct virtio_net **pdev,
2291 			struct VhostUserMsg *msg,
2292 			int main_fd __rte_unused)
2293 {
2294 	struct virtio_net *dev = *pdev;
2295 	uint64_t protocol_features = msg->payload.u64;
2296 	uint64_t slave_protocol_features = 0;
2297 
2298 	if (validate_msg_fds(msg, 0) != 0)
2299 		return RTE_VHOST_MSG_RESULT_ERR;
2300 
2301 	rte_vhost_driver_get_protocol_features(dev->ifname,
2302 			&slave_protocol_features);
2303 	if (protocol_features & ~slave_protocol_features) {
2304 		VHOST_LOG_CONFIG(ERR,
2305 			"(%d) received invalid protocol features.\n",
2306 			dev->vid);
2307 		return RTE_VHOST_MSG_RESULT_ERR;
2308 	}
2309 
2310 	dev->protocol_features = protocol_features;
2311 	VHOST_LOG_CONFIG(INFO,
2312 		"negotiated Vhost-user protocol features: 0x%" PRIx64 "\n",
2313 		dev->protocol_features);
2314 
2315 	return RTE_VHOST_MSG_RESULT_OK;
2316 }
2317 
2318 static int
2319 vhost_user_set_log_base(struct virtio_net **pdev, struct VhostUserMsg *msg,
2320 			int main_fd __rte_unused)
2321 {
2322 	struct virtio_net *dev = *pdev;
2323 	int fd = msg->fds[0];
2324 	uint64_t size, off;
2325 	void *addr;
2326 	uint32_t i;
2327 
2328 	if (validate_msg_fds(msg, 1) != 0)
2329 		return RTE_VHOST_MSG_RESULT_ERR;
2330 
2331 	if (fd < 0) {
2332 		VHOST_LOG_CONFIG(ERR, "invalid log fd: %d\n", fd);
2333 		return RTE_VHOST_MSG_RESULT_ERR;
2334 	}
2335 
2336 	if (msg->size != sizeof(VhostUserLog)) {
2337 		VHOST_LOG_CONFIG(ERR,
2338 			"invalid log base msg size: %"PRIu32" != %d\n",
2339 			msg->size, (int)sizeof(VhostUserLog));
2340 		goto close_msg_fds;
2341 	}
2342 
2343 	size = msg->payload.log.mmap_size;
2344 	off  = msg->payload.log.mmap_offset;
2345 
2346 	/* Check for mmap size and offset overflow. */
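	/*
	 * In uint64_t arithmetic, -size equals 2^64 - size, so the test below
	 * fires exactly when size + off would wrap around (it also rejects
	 * size == 0), keeping the size + off passed to mmap() well defined.
	 */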
2347 	if (off >= -size) {
2348 		VHOST_LOG_CONFIG(ERR,
2349 			"log offset %#"PRIx64" and log size %#"PRIx64" overflow\n",
2350 			off, size);
2351 		goto close_msg_fds;
2352 	}
2353 
2354 	VHOST_LOG_CONFIG(INFO,
2355 		"log mmap size: %"PRIu64", offset: %"PRIu64"\n",
2356 		size, off);
2357 
2358 	/*
2359 	 * Map from offset 0 to work around a hugepage mmap bug: mmap()
2360 	 * fails when the offset is not page-size aligned.
2361 	 */
2362 	addr = mmap(0, size + off, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
2363 	close(fd);
2364 	if (addr == MAP_FAILED) {
2365 		VHOST_LOG_CONFIG(ERR, "mmap log base failed!\n");
2366 		return RTE_VHOST_MSG_RESULT_ERR;
2367 	}
2368 
2369 	/*
2370 	 * Free any previously mapped log memory in case
2371 	 * VHOST_USER_SET_LOG_BASE is received multiple times.
2372 	 */
2373 	if (dev->log_addr) {
2374 		munmap((void *)(uintptr_t)dev->log_addr, dev->log_size);
2375 	}
2376 	dev->log_addr = (uint64_t)(uintptr_t)addr;
2377 	dev->log_base = dev->log_addr + off;
2378 	dev->log_size = size;
2379 
2380 	for (i = 0; i < dev->nr_vring; i++) {
2381 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2382 
2383 		rte_free(vq->log_cache);
2384 		vq->log_cache = NULL;
2385 		vq->log_cache_nb_elem = 0;
2386 		vq->log_cache = rte_malloc_socket("vq log cache",
2387 				sizeof(struct log_cache_entry) * VHOST_LOG_CACHE_NR,
2388 				0, vq->numa_node);
2389 		/*
2390 		 * If the log cache allocation fails, don't fail the migration;
2391 		 * logging is simply done without caching, which hurts performance.
2392 		 */
2393 		if (!vq->log_cache)
2394 			VHOST_LOG_CONFIG(ERR, "Failed to allocate VQ logging cache\n");
2395 	}
2396 
2397 	/*
2398 	 * The spec is not clear about it (yet), but QEMU doesn't expect
2399 	 * any payload in the reply.
2400 	 */
2401 	msg->size = 0;
2402 	msg->fd_num = 0;
2403 
2404 	return RTE_VHOST_MSG_RESULT_REPLY;
2405 
2406 close_msg_fds:
2407 	close_msg_fds(msg);
2408 	return RTE_VHOST_MSG_RESULT_ERR;
2409 }
2410 
2411 static int vhost_user_set_log_fd(struct virtio_net **pdev __rte_unused,
2412 			struct VhostUserMsg *msg,
2413 			int main_fd __rte_unused)
2414 {
2415 	if (validate_msg_fds(msg, 1) != 0)
2416 		return RTE_VHOST_MSG_RESULT_ERR;
2417 
2418 	close(msg->fds[0]);
2419 	VHOST_LOG_CONFIG(INFO, "not implemented.\n");
2420 
2421 	return RTE_VHOST_MSG_RESULT_OK;
2422 }
2423 
2424 /*
2425  * An RARP packet is constructed and broadcast to notify switches about
2426  * the new location of the migrated VM, so that packets from outside will
2427  * not be lost after migration.
2428  *
2429  * However, we don't actually "send" an RARP packet here; instead, we set
2430  * a flag 'broadcast_rarp' to let rte_vhost_dequeue_burst() inject it.
2431  */
2432 static int
2433 vhost_user_send_rarp(struct virtio_net **pdev, struct VhostUserMsg *msg,
2434 			int main_fd __rte_unused)
2435 {
2436 	struct virtio_net *dev = *pdev;
2437 	uint8_t *mac = (uint8_t *)&msg->payload.u64;
2438 	struct rte_vdpa_device *vdpa_dev;
2439 
2440 	if (validate_msg_fds(msg, 0) != 0)
2441 		return RTE_VHOST_MSG_RESULT_ERR;
2442 
2443 	VHOST_LOG_CONFIG(DEBUG,
2444 		":: mac: " RTE_ETHER_ADDR_PRT_FMT "\n",
2445 		mac[0], mac[1], mac[2], mac[3], mac[4], mac[5]);
2446 	memcpy(dev->mac.addr_bytes, mac, 6);
2447 
2448 	/*
2449 	 * Set the flag to inject a RARP broadcast packet at
2450 	 * rte_vhost_dequeue_burst().
2451 	 *
2452 	 * __ATOMIC_RELEASE ordering is for making sure the mac is
2453 	 * copied before the flag is set.
2454 	 */
2455 	__atomic_store_n(&dev->broadcast_rarp, 1, __ATOMIC_RELEASE);
2456 	vdpa_dev = dev->vdpa_dev;
2457 	if (vdpa_dev && vdpa_dev->ops->migration_done)
2458 		vdpa_dev->ops->migration_done(dev->vid);
2459 
2460 	return RTE_VHOST_MSG_RESULT_OK;
2461 }
2462 
2463 static int
2464 vhost_user_net_set_mtu(struct virtio_net **pdev, struct VhostUserMsg *msg,
2465 			int main_fd __rte_unused)
2466 {
2467 	struct virtio_net *dev = *pdev;
2468 
2469 	if (validate_msg_fds(msg, 0) != 0)
2470 		return RTE_VHOST_MSG_RESULT_ERR;
2471 
2472 	if (msg->payload.u64 < VIRTIO_MIN_MTU ||
2473 			msg->payload.u64 > VIRTIO_MAX_MTU) {
2474 		VHOST_LOG_CONFIG(ERR, "Invalid MTU size (%"PRIu64")\n",
2475 				msg->payload.u64);
2476 
2477 		return RTE_VHOST_MSG_RESULT_ERR;
2478 	}
2479 
2480 	dev->mtu = msg->payload.u64;
2481 
2482 	return RTE_VHOST_MSG_RESULT_OK;
2483 }
2484 
2485 static int
2486 vhost_user_set_req_fd(struct virtio_net **pdev, struct VhostUserMsg *msg,
2487 			int main_fd __rte_unused)
2488 {
2489 	struct virtio_net *dev = *pdev;
2490 	int fd = msg->fds[0];
2491 
2492 	if (validate_msg_fds(msg, 1) != 0)
2493 		return RTE_VHOST_MSG_RESULT_ERR;
2494 
2495 	if (fd < 0) {
2496 		VHOST_LOG_CONFIG(ERR,
2497 				"Invalid file descriptor for slave channel (%d)\n",
2498 				fd);
2499 		return RTE_VHOST_MSG_RESULT_ERR;
2500 	}
2501 
2502 	if (dev->slave_req_fd >= 0)
2503 		close(dev->slave_req_fd);
2504 
2505 	dev->slave_req_fd = fd;
2506 
2507 	return RTE_VHOST_MSG_RESULT_OK;
2508 }
2509 
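/*
 * Check whether the IOTLB entry [iova, iova + size) overlaps the split ring's
 * descriptor table, avail ring, used ring, or log area (when logging is
 * enabled). A hit means the cached translations of this virtqueue are
 * affected.
 */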
2510 static int
2511 is_vring_iotlb_split(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
2512 {
2513 	struct vhost_vring_addr *ra;
2514 	uint64_t start, end, len;
2515 
2516 	start = imsg->iova;
2517 	end = start + imsg->size;
2518 
2519 	ra = &vq->ring_addrs;
2520 	len = sizeof(struct vring_desc) * vq->size;
2521 	if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start)
2522 		return 1;
2523 
2524 	len = sizeof(struct vring_avail) + sizeof(uint16_t) * vq->size;
2525 	if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start)
2526 		return 1;
2527 
2528 	len = sizeof(struct vring_used) +
2529 	       sizeof(struct vring_used_elem) * vq->size;
2530 	if (ra->used_user_addr < end && (ra->used_user_addr + len) > start)
2531 		return 1;
2532 
2533 	if (ra->flags & (1 << VHOST_VRING_F_LOG)) {
2534 		len = sizeof(uint64_t);
2535 		if (ra->log_guest_addr < end &&
2536 		    (ra->log_guest_addr + len) > start)
2537 			return 1;
2538 	}
2539 
2540 	return 0;
2541 }
2542 
2543 static int
2544 is_vring_iotlb_packed(struct vhost_virtqueue *vq, struct vhost_iotlb_msg *imsg)
2545 {
2546 	struct vhost_vring_addr *ra;
2547 	uint64_t start, end, len;
2548 
2549 	start = imsg->iova;
2550 	end = start + imsg->size;
2551 
2552 	ra = &vq->ring_addrs;
2553 	len = sizeof(struct vring_packed_desc) * vq->size;
2554 	if (ra->desc_user_addr < end && (ra->desc_user_addr + len) > start)
2555 		return 1;
2556 
2557 	len = sizeof(struct vring_packed_desc_event);
2558 	if (ra->avail_user_addr < end && (ra->avail_user_addr + len) > start)
2559 		return 1;
2560 
2561 	len = sizeof(struct vring_packed_desc_event);
2562 	if (ra->used_user_addr < end && (ra->used_user_addr + len) > start)
2563 		return 1;
2564 
2565 	if (ra->flags & (1 << VHOST_VRING_F_LOG)) {
2566 		len = sizeof(uint64_t);
2567 		if (ra->log_guest_addr < end &&
2568 		    (ra->log_guest_addr + len) > start)
2569 			return 1;
2570 	}
2571 
2572 	return 0;
2573 }
2574 
2575 static int is_vring_iotlb(struct virtio_net *dev,
2576 			  struct vhost_virtqueue *vq,
2577 			  struct vhost_iotlb_msg *imsg)
2578 {
2579 	if (vq_is_packed(dev))
2580 		return is_vring_iotlb_packed(vq, imsg);
2581 	else
2582 		return is_vring_iotlb_split(vq, imsg);
2583 }
2584 
2585 static int
2586 vhost_user_iotlb_msg(struct virtio_net **pdev, struct VhostUserMsg *msg,
2587 			int main_fd __rte_unused)
2588 {
2589 	struct virtio_net *dev = *pdev;
2590 	struct vhost_iotlb_msg *imsg = &msg->payload.iotlb;
2591 	uint16_t i;
2592 	uint64_t vva, len;
2593 
2594 	if (validate_msg_fds(msg, 0) != 0)
2595 		return RTE_VHOST_MSG_RESULT_ERR;
2596 
2597 	switch (imsg->type) {
2598 	case VHOST_IOTLB_UPDATE:
2599 		len = imsg->size;
2600 		vva = qva_to_vva(dev, imsg->uaddr, &len);
2601 		if (!vva)
2602 			return RTE_VHOST_MSG_RESULT_ERR;
2603 
2604 		for (i = 0; i < dev->nr_vring; i++) {
2605 			struct vhost_virtqueue *vq = dev->virtqueue[i];
2606 
2607 			if (!vq)
2608 				continue;
2609 
2610 			vhost_user_iotlb_cache_insert(vq, imsg->iova, vva,
2611 					len, imsg->perm);
2612 
2613 			if (is_vring_iotlb(dev, vq, imsg))
2614 				*pdev = dev = translate_ring_addresses(dev, i);
2615 		}
2616 		break;
2617 	case VHOST_IOTLB_INVALIDATE:
2618 		for (i = 0; i < dev->nr_vring; i++) {
2619 			struct vhost_virtqueue *vq = dev->virtqueue[i];
2620 
2621 			if (!vq)
2622 				continue;
2623 
2624 			vhost_user_iotlb_cache_remove(vq, imsg->iova,
2625 					imsg->size);
2626 
2627 			if (is_vring_iotlb(dev, vq, imsg))
2628 				vring_invalidate(dev, vq);
2629 		}
2630 		break;
2631 	default:
2632 		VHOST_LOG_CONFIG(ERR, "Invalid IOTLB message type (%d)\n",
2633 				imsg->type);
2634 		return RTE_VHOST_MSG_RESULT_ERR;
2635 	}
2636 
2637 	return RTE_VHOST_MSG_RESULT_OK;
2638 }
2639 
2640 static int
2641 vhost_user_set_postcopy_advise(struct virtio_net **pdev,
2642 			struct VhostUserMsg *msg,
2643 			int main_fd __rte_unused)
2644 {
2645 	struct virtio_net *dev = *pdev;
2646 #ifdef RTE_LIBRTE_VHOST_POSTCOPY
2647 	struct uffdio_api api_struct;
2648 
2649 	if (validate_msg_fds(msg, 0) != 0)
2650 		return RTE_VHOST_MSG_RESULT_ERR;
2651 
2652 	dev->postcopy_ufd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
2653 
2654 	if (dev->postcopy_ufd == -1) {
2655 		VHOST_LOG_CONFIG(ERR, "Userfaultfd not available: %s\n",
2656 			strerror(errno));
2657 		return RTE_VHOST_MSG_RESULT_ERR;
2658 	}
2659 	api_struct.api = UFFD_API;
2660 	api_struct.features = 0;
2661 	if (ioctl(dev->postcopy_ufd, UFFDIO_API, &api_struct)) {
2662 		VHOST_LOG_CONFIG(ERR, "UFFDIO_API ioctl failure: %s\n",
2663 			strerror(errno));
2664 		close(dev->postcopy_ufd);
2665 		dev->postcopy_ufd = -1;
2666 		return RTE_VHOST_MSG_RESULT_ERR;
2667 	}
2668 	msg->fds[0] = dev->postcopy_ufd;
2669 	msg->fd_num = 1;
2670 
2671 	return RTE_VHOST_MSG_RESULT_REPLY;
2672 #else
2673 	dev->postcopy_ufd = -1;
2674 	msg->fd_num = 0;
2675 
2676 	return RTE_VHOST_MSG_RESULT_ERR;
2677 #endif
2678 }
2679 
2680 static int
2681 vhost_user_set_postcopy_listen(struct virtio_net **pdev,
2682 			struct VhostUserMsg *msg __rte_unused,
2683 			int main_fd __rte_unused)
2684 {
2685 	struct virtio_net *dev = *pdev;
2686 
2687 	if (validate_msg_fds(msg, 0) != 0)
2688 		return RTE_VHOST_MSG_RESULT_ERR;
2689 
2690 	if (dev->mem && dev->mem->nregions) {
2691 		VHOST_LOG_CONFIG(ERR,
2692 			"Regions already registered at postcopy-listen\n");
2693 		return RTE_VHOST_MSG_RESULT_ERR;
2694 	}
2695 	dev->postcopy_listening = 1;
2696 
2697 	return RTE_VHOST_MSG_RESULT_OK;
2698 }
2699 
2700 static int
2701 vhost_user_postcopy_end(struct virtio_net **pdev, struct VhostUserMsg *msg,
2702 			int main_fd __rte_unused)
2703 {
2704 	struct virtio_net *dev = *pdev;
2705 
2706 	if (validate_msg_fds(msg, 0) != 0)
2707 		return RTE_VHOST_MSG_RESULT_ERR;
2708 
2709 	dev->postcopy_listening = 0;
2710 	if (dev->postcopy_ufd >= 0) {
2711 		close(dev->postcopy_ufd);
2712 		dev->postcopy_ufd = -1;
2713 	}
2714 
2715 	msg->payload.u64 = 0;
2716 	msg->size = sizeof(msg->payload.u64);
2717 	msg->fd_num = 0;
2718 
2719 	return RTE_VHOST_MSG_RESULT_REPLY;
2720 }
2721 
2722 static int
2723 vhost_user_get_status(struct virtio_net **pdev, struct VhostUserMsg *msg,
2724 		      int main_fd __rte_unused)
2725 {
2726 	struct virtio_net *dev = *pdev;
2727 
2728 	if (validate_msg_fds(msg, 0) != 0)
2729 		return RTE_VHOST_MSG_RESULT_ERR;
2730 
2731 	msg->payload.u64 = dev->status;
2732 	msg->size = sizeof(msg->payload.u64);
2733 	msg->fd_num = 0;
2734 
2735 	return RTE_VHOST_MSG_RESULT_REPLY;
2736 }
2737 
2738 static int
2739 vhost_user_set_status(struct virtio_net **pdev, struct VhostUserMsg *msg,
2740 			int main_fd __rte_unused)
2741 {
2742 	struct virtio_net *dev = *pdev;
2743 
2744 	if (validate_msg_fds(msg, 0) != 0)
2745 		return RTE_VHOST_MSG_RESULT_ERR;
2746 
2747 	/* As per the Virtio specification, the device status is 8 bits wide */
2748 	if (msg->payload.u64 > UINT8_MAX) {
2749 		VHOST_LOG_CONFIG(ERR, "Invalid VHOST_USER_SET_STATUS payload 0x%" PRIx64 "\n",
2750 				msg->payload.u64);
2751 		return RTE_VHOST_MSG_RESULT_ERR;
2752 	}
2753 
2754 	dev->status = msg->payload.u64;
2755 
2756 	if ((dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK) &&
2757 	    (dev->flags & VIRTIO_DEV_FEATURES_FAILED)) {
2758 		VHOST_LOG_CONFIG(ERR, "FEATURES_OK bit is set but feature negotiation failed\n");
2759 		/*
2760 		 * Clear the bit to let the driver know about the feature
2761 		 * negotiation failure
2762 		 */
2763 		dev->status &= ~VIRTIO_DEVICE_STATUS_FEATURES_OK;
2764 	}
2765 
2766 	VHOST_LOG_CONFIG(INFO, "New device status (0x%08x):\n"
2767 			"\t-RESET: %u\n"
2768 			"\t-ACKNOWLEDGE: %u\n"
2769 			"\t-DRIVER: %u\n"
2770 			"\t-FEATURES_OK: %u\n"
2771 			"\t-DRIVER_OK: %u\n"
2772 			"\t-DEVICE_NEED_RESET: %u\n"
2773 			"\t-FAILED: %u\n",
2774 			dev->status,
2775 			(dev->status == VIRTIO_DEVICE_STATUS_RESET),
2776 			!!(dev->status & VIRTIO_DEVICE_STATUS_ACK),
2777 			!!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER),
2778 			!!(dev->status & VIRTIO_DEVICE_STATUS_FEATURES_OK),
2779 			!!(dev->status & VIRTIO_DEVICE_STATUS_DRIVER_OK),
2780 			!!(dev->status & VIRTIO_DEVICE_STATUS_DEV_NEED_RESET),
2781 			!!(dev->status & VIRTIO_DEVICE_STATUS_FAILED));
2782 
2783 	return RTE_VHOST_MSG_RESULT_OK;
2784 }
2785 
2786 typedef int (*vhost_message_handler_t)(struct virtio_net **pdev,
2787 					struct VhostUserMsg *msg,
2788 					int main_fd);
2789 static vhost_message_handler_t vhost_message_handlers[VHOST_USER_MAX] = {
2790 	[VHOST_USER_NONE] = NULL,
2791 	[VHOST_USER_GET_FEATURES] = vhost_user_get_features,
2792 	[VHOST_USER_SET_FEATURES] = vhost_user_set_features,
2793 	[VHOST_USER_SET_OWNER] = vhost_user_set_owner,
2794 	[VHOST_USER_RESET_OWNER] = vhost_user_reset_owner,
2795 	[VHOST_USER_SET_MEM_TABLE] = vhost_user_set_mem_table,
2796 	[VHOST_USER_SET_LOG_BASE] = vhost_user_set_log_base,
2797 	[VHOST_USER_SET_LOG_FD] = vhost_user_set_log_fd,
2798 	[VHOST_USER_SET_VRING_NUM] = vhost_user_set_vring_num,
2799 	[VHOST_USER_SET_VRING_ADDR] = vhost_user_set_vring_addr,
2800 	[VHOST_USER_SET_VRING_BASE] = vhost_user_set_vring_base,
2801 	[VHOST_USER_GET_VRING_BASE] = vhost_user_get_vring_base,
2802 	[VHOST_USER_SET_VRING_KICK] = vhost_user_set_vring_kick,
2803 	[VHOST_USER_SET_VRING_CALL] = vhost_user_set_vring_call,
2804 	[VHOST_USER_SET_VRING_ERR] = vhost_user_set_vring_err,
2805 	[VHOST_USER_GET_PROTOCOL_FEATURES] = vhost_user_get_protocol_features,
2806 	[VHOST_USER_SET_PROTOCOL_FEATURES] = vhost_user_set_protocol_features,
2807 	[VHOST_USER_GET_QUEUE_NUM] = vhost_user_get_queue_num,
2808 	[VHOST_USER_SET_VRING_ENABLE] = vhost_user_set_vring_enable,
2809 	[VHOST_USER_SEND_RARP] = vhost_user_send_rarp,
2810 	[VHOST_USER_NET_SET_MTU] = vhost_user_net_set_mtu,
2811 	[VHOST_USER_SET_SLAVE_REQ_FD] = vhost_user_set_req_fd,
2812 	[VHOST_USER_IOTLB_MSG] = vhost_user_iotlb_msg,
2813 	[VHOST_USER_POSTCOPY_ADVISE] = vhost_user_set_postcopy_advise,
2814 	[VHOST_USER_POSTCOPY_LISTEN] = vhost_user_set_postcopy_listen,
2815 	[VHOST_USER_POSTCOPY_END] = vhost_user_postcopy_end,
2816 	[VHOST_USER_GET_INFLIGHT_FD] = vhost_user_get_inflight_fd,
2817 	[VHOST_USER_SET_INFLIGHT_FD] = vhost_user_set_inflight_fd,
2818 	[VHOST_USER_SET_STATUS] = vhost_user_set_status,
2819 	[VHOST_USER_GET_STATUS] = vhost_user_get_status,
2820 };
2821 
2822 /* Return the number of bytes read on success, or a negative value on failure. */
2823 static int
2824 read_vhost_message(int sockfd, struct VhostUserMsg *msg)
2825 {
2826 	int ret;
2827 
2828 	ret = read_fd_message(sockfd, (char *)msg, VHOST_USER_HDR_SIZE,
2829 		msg->fds, VHOST_MEMORY_MAX_NREGIONS, &msg->fd_num);
2830 	if (ret <= 0) {
2831 		return ret;
2832 	} else if (ret != VHOST_USER_HDR_SIZE) {
2833 		VHOST_LOG_CONFIG(ERR, "Unexpected header size read\n");
2834 		close_msg_fds(msg);
2835 		return -1;
2836 	}
2837 
2838 	if (msg->size) {
2839 		if (msg->size > sizeof(msg->payload)) {
2840 			VHOST_LOG_CONFIG(ERR,
2841 				"invalid msg size: %d\n", msg->size);
2842 			return -1;
2843 		}
2844 		ret = read(sockfd, &msg->payload, msg->size);
2845 		if (ret <= 0)
2846 			return ret;
2847 		if (ret != (int)msg->size) {
2848 			VHOST_LOG_CONFIG(ERR,
2849 				"read control message failed\n");
2850 			return -1;
2851 		}
2852 	}
2853 
2854 	return ret;
2855 }
2856 
2857 static int
2858 send_vhost_message(int sockfd, struct VhostUserMsg *msg)
2859 {
2860 	if (!msg)
2861 		return 0;
2862 
2863 	return send_fd_message(sockfd, (char *)msg,
2864 		VHOST_USER_HDR_SIZE + msg->size, msg->fds, msg->fd_num);
2865 }
2866 
2867 static int
2868 send_vhost_reply(int sockfd, struct VhostUserMsg *msg)
2869 {
2870 	if (!msg)
2871 		return 0;
2872 
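	/*
	 * Rebuild the flags for a reply: keep only our protocol version, set
	 * the reply bit, and clear NEED_REPLY since this message is itself
	 * the reply.
	 */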
2873 	msg->flags &= ~VHOST_USER_VERSION_MASK;
2874 	msg->flags &= ~VHOST_USER_NEED_REPLY;
2875 	msg->flags |= VHOST_USER_VERSION;
2876 	msg->flags |= VHOST_USER_REPLY_MASK;
2877 
2878 	return send_vhost_message(sockfd, msg);
2879 }
2880 
2881 static int
2882 send_vhost_slave_message(struct virtio_net *dev, struct VhostUserMsg *msg)
2883 {
2884 	int ret;
2885 
2886 	if (msg->flags & VHOST_USER_NEED_REPLY)
2887 		rte_spinlock_lock(&dev->slave_req_lock);
2888 
2889 	ret = send_vhost_message(dev->slave_req_fd, msg);
2890 	if (ret < 0 && (msg->flags & VHOST_USER_NEED_REPLY))
2891 		rte_spinlock_unlock(&dev->slave_req_lock);
2892 
2893 	return ret;
2894 }
2895 
2896 /*
2897  * Allocate a queue pair if it hasn't been allocated yet
2898  */
2899 static int
2900 vhost_user_check_and_alloc_queue_pair(struct virtio_net *dev,
2901 			struct VhostUserMsg *msg)
2902 {
2903 	uint32_t vring_idx;
2904 
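	/*
	 * The vring index is carried in a different payload field depending
	 * on the request type.
	 */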
2905 	switch (msg->request.master) {
2906 	case VHOST_USER_SET_VRING_KICK:
2907 	case VHOST_USER_SET_VRING_CALL:
2908 	case VHOST_USER_SET_VRING_ERR:
2909 		vring_idx = msg->payload.u64 & VHOST_USER_VRING_IDX_MASK;
2910 		break;
2911 	case VHOST_USER_SET_VRING_NUM:
2912 	case VHOST_USER_SET_VRING_BASE:
2913 	case VHOST_USER_GET_VRING_BASE:
2914 	case VHOST_USER_SET_VRING_ENABLE:
2915 		vring_idx = msg->payload.state.index;
2916 		break;
2917 	case VHOST_USER_SET_VRING_ADDR:
2918 		vring_idx = msg->payload.addr.index;
2919 		break;
2920 	default:
2921 		return 0;
2922 	}
2923 
2924 	if (vring_idx >= VHOST_MAX_VRING) {
2925 		VHOST_LOG_CONFIG(ERR,
2926 			"invalid vring index: %u\n", vring_idx);
2927 		return -1;
2928 	}
2929 
2930 	if (dev->virtqueue[vring_idx])
2931 		return 0;
2932 
2933 	return alloc_vring_queue(dev, vring_idx);
2934 }
2935 
2936 static void
2937 vhost_user_lock_all_queue_pairs(struct virtio_net *dev)
2938 {
2939 	unsigned int i = 0;
2940 	unsigned int vq_num = 0;
2941 
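	/*
	 * The virtqueue array may be sparse: walk it until nr_vring allocated
	 * queues have been locked, skipping NULL slots.
	 */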
2942 	while (vq_num < dev->nr_vring) {
2943 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2944 
2945 		if (vq) {
2946 			rte_spinlock_lock(&vq->access_lock);
2947 			vq_num++;
2948 		}
2949 		i++;
2950 	}
2951 }
2952 
2953 static void
2954 vhost_user_unlock_all_queue_pairs(struct virtio_net *dev)
2955 {
2956 	unsigned int i = 0;
2957 	unsigned int vq_num = 0;
2958 
2959 	while (vq_num < dev->nr_vring) {
2960 		struct vhost_virtqueue *vq = dev->virtqueue[i];
2961 
2962 		if (vq) {
2963 			rte_spinlock_unlock(&vq->access_lock);
2964 			vq_num++;
2965 		}
2966 		i++;
2967 	}
2968 }
2969 
2970 int
2971 vhost_user_msg_handler(int vid, int fd)
2972 {
2973 	struct virtio_net *dev;
2974 	struct VhostUserMsg msg;
2975 	struct rte_vdpa_device *vdpa_dev;
2976 	int ret;
2977 	int unlock_required = 0;
2978 	bool handled;
2979 	int request;
2980 	uint32_t i;
2981 
2982 	dev = get_device(vid);
2983 	if (dev == NULL)
2984 		return -1;
2985 
2986 	if (!dev->notify_ops) {
2987 		dev->notify_ops = vhost_driver_callback_get(dev->ifname);
2988 		if (!dev->notify_ops) {
2989 			VHOST_LOG_CONFIG(ERR,
2990 				"failed to get callback ops for driver %s\n",
2991 				dev->ifname);
2992 			return -1;
2993 		}
2994 	}
2995 
2996 	ret = read_vhost_message(fd, &msg);
2997 	if (ret <= 0) {
2998 		if (ret < 0)
2999 			VHOST_LOG_CONFIG(ERR,
3000 				"vhost read message failed\n");
3001 		else
3002 			VHOST_LOG_CONFIG(INFO,
3003 				"vhost peer closed\n");
3004 
3005 		return -1;
3006 	}
3007 
3008 	ret = 0;
3009 	request = msg.request.master;
3010 	if (request > VHOST_USER_NONE && request < VHOST_USER_MAX &&
3011 			vhost_message_str[request]) {
3012 		if (request != VHOST_USER_IOTLB_MSG)
3013 			VHOST_LOG_CONFIG(INFO, "read message %s\n",
3014 				vhost_message_str[request]);
3015 		else
3016 			VHOST_LOG_CONFIG(DEBUG, "read message %s\n",
3017 				vhost_message_str[request]);
3018 	} else {
3019 		VHOST_LOG_CONFIG(DEBUG, "External request %d\n", request);
3020 	}
3021 
3022 	ret = vhost_user_check_and_alloc_queue_pair(dev, &msg);
3023 	if (ret < 0) {
3024 		VHOST_LOG_CONFIG(ERR,
3025 			"failed to alloc queue\n");
3026 		return -1;
3027 	}
3028 
3029 	/*
3030 	 * Note: we don't lock all queues on VHOST_USER_GET_VRING_BASE
3031 	 * and VHOST_USER_RESET_OWNER, since they are sent when virtio stops
3032 	 * and the device is destroyed. destroy_device waits for the queues to
3033 	 * become inactive, so it is safe. Otherwise, taking the access_lock
3034 	 * would cause a deadlock.
3035 	 */
3036 	switch (request) {
3037 	case VHOST_USER_SET_FEATURES:
3038 	case VHOST_USER_SET_PROTOCOL_FEATURES:
3039 	case VHOST_USER_SET_OWNER:
3040 	case VHOST_USER_SET_MEM_TABLE:
3041 	case VHOST_USER_SET_LOG_BASE:
3042 	case VHOST_USER_SET_LOG_FD:
3043 	case VHOST_USER_SET_VRING_NUM:
3044 	case VHOST_USER_SET_VRING_ADDR:
3045 	case VHOST_USER_SET_VRING_BASE:
3046 	case VHOST_USER_SET_VRING_KICK:
3047 	case VHOST_USER_SET_VRING_CALL:
3048 	case VHOST_USER_SET_VRING_ERR:
3049 	case VHOST_USER_SET_VRING_ENABLE:
3050 	case VHOST_USER_SEND_RARP:
3051 	case VHOST_USER_NET_SET_MTU:
3052 	case VHOST_USER_SET_SLAVE_REQ_FD:
3053 		if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
3054 			vhost_user_lock_all_queue_pairs(dev);
3055 			unlock_required = 1;
3056 		}
3057 		break;
3058 	default:
3059 		break;
3060 
3061 	}
3062 
3063 	handled = false;
3064 	if (dev->extern_ops.pre_msg_handle) {
3065 		ret = (*dev->extern_ops.pre_msg_handle)(dev->vid,
3066 				(void *)&msg);
3067 		switch (ret) {
3068 		case RTE_VHOST_MSG_RESULT_REPLY:
3069 			send_vhost_reply(fd, &msg);
3070 			/* Fall-through */
3071 		case RTE_VHOST_MSG_RESULT_ERR:
3072 		case RTE_VHOST_MSG_RESULT_OK:
3073 			handled = true;
3074 			goto skip_to_post_handle;
3075 		case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
3076 		default:
3077 			break;
3078 		}
3079 	}
3080 
3081 	if (request > VHOST_USER_NONE && request < VHOST_USER_MAX) {
3082 		if (!vhost_message_handlers[request])
3083 			goto skip_to_post_handle;
3084 		ret = vhost_message_handlers[request](&dev, &msg, fd);
3085 
3086 		switch (ret) {
3087 		case RTE_VHOST_MSG_RESULT_ERR:
3088 			VHOST_LOG_CONFIG(ERR,
3089 				"Processing %s failed.\n",
3090 				vhost_message_str[request]);
3091 			handled = true;
3092 			break;
3093 		case RTE_VHOST_MSG_RESULT_OK:
3094 			VHOST_LOG_CONFIG(DEBUG,
3095 				"Processing %s succeeded.\n",
3096 				vhost_message_str[request]);
3097 			handled = true;
3098 			break;
3099 		case RTE_VHOST_MSG_RESULT_REPLY:
3100 			VHOST_LOG_CONFIG(DEBUG,
3101 				"Processing %s succeeded and needs reply.\n",
3102 				vhost_message_str[request]);
3103 			send_vhost_reply(fd, &msg);
3104 			handled = true;
3105 			break;
3106 		default:
3107 			break;
3108 		}
3109 	}
3110 
3111 skip_to_post_handle:
3112 	if (ret != RTE_VHOST_MSG_RESULT_ERR &&
3113 			dev->extern_ops.post_msg_handle) {
3114 		ret = (*dev->extern_ops.post_msg_handle)(dev->vid,
3115 				(void *)&msg);
3116 		switch (ret) {
3117 		case RTE_VHOST_MSG_RESULT_REPLY:
3118 			send_vhost_reply(fd, &msg);
3119 			/* Fall-through */
3120 		case RTE_VHOST_MSG_RESULT_ERR:
3121 		case RTE_VHOST_MSG_RESULT_OK:
3122 			handled = true;
			break;
3123 		case RTE_VHOST_MSG_RESULT_NOT_HANDLED:
3124 		default:
3125 			break;
3126 		}
3127 	}
3128 
3129 	/* If message was not handled at this stage, treat it as an error */
3130 	if (!handled) {
3131 		VHOST_LOG_CONFIG(ERR,
3132 			"vhost message (req: %d) was not handled.\n", request);
3133 		close_msg_fds(&msg);
3134 		ret = RTE_VHOST_MSG_RESULT_ERR;
3135 	}
3136 
3137 	/*
3138 	 * If the request required a reply that was already sent,
3139 	 * this optional reply-ack won't be sent as the
3140 	 * VHOST_USER_NEED_REPLY was cleared in send_vhost_reply().
3141 	 */
3142 	if (msg.flags & VHOST_USER_NEED_REPLY) {
3143 		msg.payload.u64 = ret == RTE_VHOST_MSG_RESULT_ERR;
3144 		msg.size = sizeof(msg.payload.u64);
3145 		msg.fd_num = 0;
3146 		send_vhost_reply(fd, &msg);
3147 	} else if (ret == RTE_VHOST_MSG_RESULT_ERR) {
3148 		VHOST_LOG_CONFIG(ERR,
3149 			"vhost message handling failed.\n");
3150 		return -1;
3151 	}
3152 
3153 	for (i = 0; i < dev->nr_vring; i++) {
3154 		struct vhost_virtqueue *vq = dev->virtqueue[i];
3155 		bool cur_ready = vq_is_ready(dev, vq);
3156 
3157 		if (cur_ready != (vq && vq->ready)) {
3158 			vq->ready = cur_ready;
3159 			vhost_user_notify_queue_state(dev, i, cur_ready);
3160 		}
3161 	}
3162 
3163 	if (unlock_required)
3164 		vhost_user_unlock_all_queue_pairs(dev);
3165 
3166 	if (!virtio_is_ready(dev))
3167 		goto out;
3168 
3169 	/*
3170 	 * Virtio is now ready. If not done already, it is time
3171 	 * to notify the application it can process the rings and
3172 	 * configure the vDPA device if present.
3173 	 */
3174 
3175 	if (!(dev->flags & VIRTIO_DEV_RUNNING)) {
3176 		if (dev->notify_ops->new_device(dev->vid) == 0)
3177 			dev->flags |= VIRTIO_DEV_RUNNING;
3178 	}
3179 
3180 	vdpa_dev = dev->vdpa_dev;
3181 	if (!vdpa_dev)
3182 		goto out;
3183 
3184 	if (!(dev->flags & VIRTIO_DEV_VDPA_CONFIGURED)) {
3185 		if (vdpa_dev->ops->dev_conf(dev->vid))
3186 			VHOST_LOG_CONFIG(ERR,
3187 					 "Failed to configure vDPA device\n");
3188 		else
3189 			dev->flags |= VIRTIO_DEV_VDPA_CONFIGURED;
3190 	}
3191 
3192 out:
3193 	return 0;
3194 }
3195 
3196 static int process_slave_message_reply(struct virtio_net *dev,
3197 				       const struct VhostUserMsg *msg)
3198 {
3199 	struct VhostUserMsg msg_reply;
3200 	int ret;
3201 
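	/*
	 * slave_req_lock was taken in send_vhost_slave_message() when
	 * NEED_REPLY was set; it is released on the out: path below once the
	 * reply has been consumed (or reading it failed).
	 */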
3202 	if ((msg->flags & VHOST_USER_NEED_REPLY) == 0)
3203 		return 0;
3204 
3205 	ret = read_vhost_message(dev->slave_req_fd, &msg_reply);
3206 	if (ret <= 0) {
3207 		if (ret < 0)
3208 			VHOST_LOG_CONFIG(ERR,
3209 				"vhost read slave message reply failed\n");
3210 		else
3211 			VHOST_LOG_CONFIG(INFO,
3212 				"vhost peer closed\n");
3213 		ret = -1;
3214 		goto out;
3215 	}
3216 
3217 	ret = 0;
3218 	if (msg_reply.request.slave != msg->request.slave) {
3219 		VHOST_LOG_CONFIG(ERR,
3220 			"Received unexpected msg type (%u), expected %u\n",
3221 			msg_reply.request.slave, msg->request.slave);
3222 		ret = -1;
3223 		goto out;
3224 	}
3225 
3226 	ret = msg_reply.payload.u64 ? -1 : 0;
3227 
3228 out:
3229 	rte_spinlock_unlock(&dev->slave_req_lock);
3230 	return ret;
3231 }
3232 
3233 int
3234 vhost_user_iotlb_miss(struct virtio_net *dev, uint64_t iova, uint8_t perm)
3235 {
3236 	int ret;
3237 	struct VhostUserMsg msg = {
3238 		.request.slave = VHOST_USER_SLAVE_IOTLB_MSG,
3239 		.flags = VHOST_USER_VERSION,
3240 		.size = sizeof(msg.payload.iotlb),
3241 		.payload.iotlb = {
3242 			.iova = iova,
3243 			.perm = perm,
3244 			.type = VHOST_IOTLB_MISS,
3245 		},
3246 	};
3247 
3248 	ret = send_vhost_message(dev->slave_req_fd, &msg);
3249 	if (ret < 0) {
3250 		VHOST_LOG_CONFIG(ERR,
3251 				"Failed to send IOTLB miss message (%d)\n",
3252 				ret);
3253 		return ret;
3254 	}
3255 
3256 	return 0;
3257 }
3258 
3259 static int
3260 vhost_user_slave_config_change(struct virtio_net *dev, bool need_reply)
3261 {
3262 	int ret;
3263 	struct VhostUserMsg msg = {
3264 		.request.slave = VHOST_USER_SLAVE_CONFIG_CHANGE_MSG,
3265 		.flags = VHOST_USER_VERSION,
3266 		.size = 0,
3267 	};
3268 
3269 	if (need_reply)
3270 		msg.flags |= VHOST_USER_NEED_REPLY;
3271 
3272 	ret = send_vhost_slave_message(dev, &msg);
3273 	if (ret < 0) {
3274 		VHOST_LOG_CONFIG(ERR,
3275 				"Failed to send config change (%d)\n",
3276 				ret);
3277 		return ret;
3278 	}
3279 
3280 	return process_slave_message_reply(dev, &msg);
3281 }
3282 
3283 int
3284 rte_vhost_slave_config_change(int vid, bool need_reply)
3285 {
3286 	struct virtio_net *dev;
3287 
3288 	dev = get_device(vid);
3289 	if (!dev)
3290 		return -ENODEV;
3291 
3292 	return vhost_user_slave_config_change(dev, need_reply);
3293 }
3294 
3295 static int vhost_user_slave_set_vring_host_notifier(struct virtio_net *dev,
3296 						    int index, int fd,
3297 						    uint64_t offset,
3298 						    uint64_t size)
3299 {
3300 	int ret;
3301 	struct VhostUserMsg msg = {
3302 		.request.slave = VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG,
3303 		.flags = VHOST_USER_VERSION | VHOST_USER_NEED_REPLY,
3304 		.size = sizeof(msg.payload.area),
3305 		.payload.area = {
3306 			.u64 = index & VHOST_USER_VRING_IDX_MASK,
3307 			.size = size,
3308 			.offset = offset,
3309 		},
3310 	};
3311 
3312 	if (fd < 0)
3313 		msg.payload.area.u64 |= VHOST_USER_VRING_NOFD_MASK;
3314 	else {
3315 		msg.fds[0] = fd;
3316 		msg.fd_num = 1;
3317 	}
3318 
3319 	ret = send_vhost_slave_message(dev, &msg);
3320 	if (ret < 0) {
3321 		VHOST_LOG_CONFIG(ERR,
3322 			"Failed to set host notifier (%d)\n", ret);
3323 		return ret;
3324 	}
3325 
3326 	return process_slave_message_reply(dev, &msg);
3327 }
3328 
3329 int rte_vhost_host_notifier_ctrl(int vid, uint16_t qid, bool enable)
3330 {
3331 	struct virtio_net *dev;
3332 	struct rte_vdpa_device *vdpa_dev;
3333 	int vfio_device_fd, ret = 0;
3334 	uint64_t offset, size;
3335 	unsigned int i, q_start, q_last;
3336 
3337 	dev = get_device(vid);
3338 	if (!dev)
3339 		return -ENODEV;
3340 
3341 	vdpa_dev = dev->vdpa_dev;
3342 	if (vdpa_dev == NULL)
3343 		return -ENODEV;
3344 
3345 	if (!(dev->features & (1ULL << VIRTIO_F_VERSION_1)) ||
3346 	    !(dev->features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES)) ||
3347 	    !(dev->protocol_features &
3348 			(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_REQ)) ||
3349 	    !(dev->protocol_features &
3350 			(1ULL << VHOST_USER_PROTOCOL_F_SLAVE_SEND_FD)) ||
3351 	    !(dev->protocol_features &
3352 			(1ULL << VHOST_USER_PROTOCOL_F_HOST_NOTIFIER)))
3353 		return -ENOTSUP;
3354 
3355 	if (qid == RTE_VHOST_QUEUE_ALL) {
3356 		q_start = 0;
3357 		q_last = dev->nr_vring - 1;
3358 	} else {
3359 		if (qid >= dev->nr_vring)
3360 			return -EINVAL;
3361 		q_start = qid;
3362 		q_last = qid;
3363 	}
3364 
3365 	RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_vfio_device_fd, -ENOTSUP);
3366 	RTE_FUNC_PTR_OR_ERR_RET(vdpa_dev->ops->get_notify_area, -ENOTSUP);
3367 
3368 	vfio_device_fd = vdpa_dev->ops->get_vfio_device_fd(vid);
3369 	if (vfio_device_fd < 0)
3370 		return -ENOTSUP;
3371 
3372 	if (enable) {
3373 		for (i = q_start; i <= q_last; i++) {
3374 			if (vdpa_dev->ops->get_notify_area(vid, i, &offset,
3375 					&size) < 0) {
3376 				ret = -ENOTSUP;
3377 				goto disable;
3378 			}
3379 
3380 			if (vhost_user_slave_set_vring_host_notifier(dev, i,
3381 					vfio_device_fd, offset, size) < 0) {
3382 				ret = -EFAULT;
3383 				goto disable;
3384 			}
3385 		}
3386 	} else {
3387 disable:
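		/*
		 * Reached either because the caller asked to disable the
		 * notifiers, or as rollback when enabling any queue's
		 * notifier failed above.
		 */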
3388 		for (i = q_start; i <= q_last; i++) {
3389 			vhost_user_slave_set_vring_host_notifier(dev, i, -1,
3390 					0, 0);
3391 		}
3392 	}
3393 
3394 	return ret;
3395 }
3396