/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"

#include <sys/eventfd.h>

#include "spdk/string.h"
#include "spdk/config.h"
#include "spdk/util.h"

#include "spdk_internal/virtio.h"
#include "spdk_internal/vhost_user.h"

/* The version of the protocol we support */
#define VHOST_USER_VERSION    0x1

#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \
	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
	(1ULL << VHOST_USER_PROTOCOL_F_CONFIG))

struct virtio_user_backend_ops;

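/* Per-device state for a virtio device attached over vhost-user.
 * callfds/kickfds hold one eventfd pair per virtqueue; -1 marks a slot
 * that has not been set up yet.
 */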
struct virtio_user_dev {
	int		vhostfd;

	int		callfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
	int		kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
	uint32_t	queue_size;

	uint8_t		status;
	char		path[PATH_MAX];
	uint64_t	protocol_features;
	struct vring	vrings[SPDK_VIRTIO_MAX_VIRTQUEUES];
	struct virtio_user_backend_ops *ops;
	struct spdk_mem_map *mem_map;
};

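/* Backend transport hooks: setup() establishes the connection to the
 * vhost-user backend, send_request() issues a single vhost-user message
 * (and consumes the reply, if the request expects one).
 */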
struct virtio_user_backend_ops {
	int (*setup)(struct virtio_user_dev *dev);
	int (*send_request)(struct virtio_user_dev *dev,
			    enum vhost_user_request req,
			    void *arg);
};

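/* Send one vhost-user message over the Unix domain socket. Any file
 * descriptors in fds[] are attached as SCM_RIGHTS ancillary data to a
 * single sendmsg() call.
 */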
static int
vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
{
	int r;
	struct msghdr msgh;
	struct iovec iov;
	size_t fd_size = fd_num * sizeof(int);
	char control[CMSG_SPACE(fd_size)];
	struct cmsghdr *cmsg;

	memset(&msgh, 0, sizeof(msgh));
	memset(control, 0, sizeof(control));

	iov.iov_base = (uint8_t *)buf;
	iov.iov_len = len;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;

	if (fds && fd_num > 0) {
		msgh.msg_control = control;
		msgh.msg_controllen = sizeof(control);
		cmsg = CMSG_FIRSTHDR(&msgh);
		cmsg->cmsg_len = CMSG_LEN(fd_size);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), fds, fd_size);
	} else {
		msgh.msg_control = NULL;
		msgh.msg_controllen = 0;
	}

	do {
		r = sendmsg(fd, &msgh, 0);
	} while (r < 0 && errno == EINTR);

	if (r == -1) {
		return -errno;
	}

	return 0;
}

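/* Receive a reply: the fixed-size header first, then msg->size payload
 * bytes. Replies with unexpected flags or an oversized payload are
 * rejected.
 */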
static int
vhost_user_read(int fd, struct vhost_user_msg *msg)
{
	uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
	ssize_t ret;
	size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;

	ret = recv(fd, (void *)msg, sz_hdr, 0);
	if ((size_t)ret != sz_hdr) {
		SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n",
			     ret, sz_hdr);
		if (ret == -1) {
			return -errno;
		} else {
			return -EBUSY;
		}
	}

	/* validate msg flags */
	if (msg->flags != valid_flags) {
		SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n",
			     msg->flags, valid_flags);
		return -EIO;
	}

	sz_payload = msg->size;

	if (sz_payload > VHOST_USER_PAYLOAD_SIZE) {
		SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n",
			     sz_payload, VHOST_USER_PAYLOAD_SIZE);
		return -EIO;
	}

	if (sz_payload) {
		ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
		if ((size_t)ret != sz_payload) {
			SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n",
				     ret, msg->size);
			if (ret == -1) {
				return -errno;
			} else {
				return -EBUSY;
			}
		}
	}

	return 0;
}

struct hugepage_file_info {
	uint64_t addr;            /**< virtual addr */
	size_t   size;            /**< the file size */
	char     path[PATH_MAX];  /**< path to backing file */
};

/* Two possible options:
 * 1. Match HUGEPAGE_INFO_FMT to find the file storing the struct
 * hugepage_file array. This is simple but cannot be used in a secondary
 * process, because the secondary process will close and munmap that file.
 * 2. Match HUGEFILE_FMT to find hugepage files directly.
 *
 * We choose option 2.
 */
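/* Scan /proc/self/maps for hugepage mappings and coalesce adjacent
 * mappings of the same backing file. Returns the number of entries
 * filled into hugepages[], or a negative errno on failure.
 */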
static int
get_hugepage_file_info(struct hugepage_file_info hugepages[], int max)
{
	int idx, rc;
	FILE *f;
	char buf[BUFSIZ], *tmp, *tail;
	char *str_underline, *str_start;
	int huge_index;
	uint64_t v_start, v_end;

	f = fopen("/proc/self/maps", "r");
	if (!f) {
		SPDK_ERRLOG("cannot open /proc/self/maps\n");
		rc = -errno;
		assert(rc < 0); /* scan-build hack */
		return rc;
	}

	idx = 0;
	while (fgets(buf, sizeof(buf), f) != NULL) {
		if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
			SPDK_ERRLOG("Failed to parse address\n");
			rc = -EIO;
			goto out;
		}

		tmp = strchr(buf, ' ') + 1; /** skip address */
		tmp = strchr(tmp, ' ') + 1; /** skip perm */
		tmp = strchr(tmp, ' ') + 1; /** skip offset */
		tmp = strchr(tmp, ' ') + 1; /** skip dev */
		tmp = strchr(tmp, ' ') + 1; /** skip inode */
		while (*tmp == ' ') {       /** skip spaces */
			tmp++;
		}
		tail = strrchr(tmp, '\n');  /** remove newline if exists */
		if (tail) {
			*tail = '\0';
		}

		/* Match HUGEFILE_FMT, aka "%s/%smap_%d",
		 * which is defined in eal_filesystem.h
		 */
		str_underline = strrchr(tmp, '_');
		if (!str_underline) {
			continue;
		}

		str_start = str_underline - strlen("map");
		if (str_start < tmp) {
			continue;
		}

		if (sscanf(str_start, "map_%d", &huge_index) != 1) {
			continue;
		}

		if (idx >= max) {
			SPDK_ERRLOG("Exceed maximum of %d\n", max);
			rc = -ENOSPC;
			goto out;
		}

		if (idx > 0 &&
		    strncmp(tmp, hugepages[idx - 1].path, PATH_MAX) == 0 &&
		    v_start == hugepages[idx - 1].addr + hugepages[idx - 1].size) {
			hugepages[idx - 1].size += (v_end - v_start);
			continue;
		}

		hugepages[idx].addr = v_start;
		hugepages[idx].size = v_end - v_start;
		snprintf(hugepages[idx].path, PATH_MAX, "%s", tmp);
		idx++;
	}

	rc = idx;
out:
	fclose(f);
	return rc;
}

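/* Build the VHOST_USER_SET_MEM_TABLE payload: one region per hugepage
 * mapping, with our virtual address doubling as the "guest physical"
 * address, plus one open fd per region for the SCM_RIGHTS transfer.
 */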
static int
prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
{
	int i, num;
	struct hugepage_file_info hugepages[VHOST_USER_MEMORY_MAX_NREGIONS];

	num = get_hugepage_file_info(hugepages, VHOST_USER_MEMORY_MAX_NREGIONS);
	if (num < 0) {
		SPDK_ERRLOG("Failed to prepare memory for vhost-user\n");
		return num;
	}

	for (i = 0; i < num; ++i) {
		/* the memory regions are unaligned */
		msg->payload.memory.regions[i].guest_phys_addr = hugepages[i].addr; /* use vaddr! */
		msg->payload.memory.regions[i].userspace_addr = hugepages[i].addr;
		msg->payload.memory.regions[i].memory_size = hugepages[i].size;
		msg->payload.memory.regions[i].flags_padding = 0;
		fds[i] = open(hugepages[i].path, O_RDWR);
		if (fds[i] < 0) {
			int rc = -errno;

			SPDK_ERRLOG("Failed to open %s: %s\n",
				    hugepages[i].path, spdk_strerror(-rc));
			while (--i >= 0) {
				close(fds[i]);
			}
			return rc;
		}
	}

	msg->payload.memory.nregions = num;
	msg->payload.memory.padding = 0;

	return 0;
}

static const char *const vhost_msg_strings[VHOST_USER_MAX] = {
	[VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER",
	[VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES",
	[VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
	[VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
};

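/* Marshal one vhost-user request, send it (together with any fds), and,
 * for GET_* requests, wait for the reply and validate it. The socket
 * carries at most one outstanding request at a time.
 */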
static int
vhost_user_sock(struct virtio_user_dev *dev,
		enum vhost_user_request req,
		void *arg)
{
	struct vhost_user_msg msg;
	struct vhost_vring_file *file = NULL;
	int need_reply = 0;
	int fds[VHOST_USER_MEMORY_MAX_NREGIONS];
	int fd_num = 0;
	int i, len, rc;
	int vhostfd = dev->vhostfd;

	SPDK_DEBUGLOG(virtio_user, "sending message %d = %s\n", req, vhost_msg_strings[req]);

	msg.request = req;
	msg.flags = VHOST_USER_VERSION;
	msg.size = 0;

	switch (req) {
	case VHOST_USER_GET_FEATURES:
	case VHOST_USER_GET_PROTOCOL_FEATURES:
	case VHOST_USER_GET_QUEUE_NUM:
		need_reply = 1;
		break;

	case VHOST_USER_SET_FEATURES:
	case VHOST_USER_SET_LOG_BASE:
	case VHOST_USER_SET_PROTOCOL_FEATURES:
		msg.payload.u64 = *((__u64 *)arg);
		msg.size = sizeof(msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
	case VHOST_USER_RESET_OWNER:
		break;

	case VHOST_USER_SET_MEM_TABLE:
		rc = prepare_vhost_memory_user(&msg, fds);
		if (rc < 0) {
			return rc;
		}
		fd_num = msg.payload.memory.nregions;
		msg.size = sizeof(msg.payload.memory.nregions);
		msg.size += sizeof(msg.payload.memory.padding);
		msg.size += fd_num * sizeof(struct vhost_memory_region);
		break;

	case VHOST_USER_SET_LOG_FD:
		fds[fd_num++] = *((int *)arg);
		break;

	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		break;

	case VHOST_USER_GET_VRING_BASE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		need_reply = 1;
		break;

	case VHOST_USER_SET_VRING_ADDR:
		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
		msg.size = sizeof(msg.payload.addr);
		break;

	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		file = arg;
		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
		msg.size = sizeof(msg.payload.u64);
		if (file->fd > 0) {
			fds[fd_num++] = file->fd;
		} else {
			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
		}
		break;

	case VHOST_USER_GET_CONFIG:
		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
		msg.size = sizeof(msg.payload.cfg);
		need_reply = 1;
		break;

	case VHOST_USER_SET_CONFIG:
		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
		msg.size = sizeof(msg.payload.cfg);
		break;

	default:
		SPDK_ERRLOG("trying to send unknown msg\n");
		return -EINVAL;
	}

	len = VHOST_USER_HDR_SIZE + msg.size;
	rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num);
	if (rc < 0) {
		SPDK_ERRLOG("%s failed: %s\n",
			    vhost_msg_strings[req], spdk_strerror(-rc));
		return rc;
	}

	if (req == VHOST_USER_SET_MEM_TABLE) {
		for (i = 0; i < fd_num; ++i) {
			close(fds[i]);
		}
	}

	if (need_reply) {
		rc = vhost_user_read(vhostfd, &msg);
		if (rc < 0) {
			SPDK_WARNLOG("Failed to receive msg: %s\n", spdk_strerror(-rc));
			return rc;
		}

		if (req != msg.request) {
			SPDK_WARNLOG("Received unexpected msg type\n");
			return -EIO;
		}

		switch (req) {
		case VHOST_USER_GET_FEATURES:
		case VHOST_USER_GET_PROTOCOL_FEATURES:
		case VHOST_USER_GET_QUEUE_NUM:
			if (msg.size != sizeof(msg.payload.u64)) {
				SPDK_WARNLOG("Received bad msg size\n");
				return -EIO;
			}
			*((__u64 *)arg) = msg.payload.u64;
			break;
		case VHOST_USER_GET_VRING_BASE:
			if (msg.size != sizeof(msg.payload.state)) {
				SPDK_WARNLOG("Received bad msg size\n");
				return -EIO;
			}
			memcpy(arg, &msg.payload.state,
			       sizeof(struct vhost_vring_state));
			break;
		case VHOST_USER_GET_CONFIG:
			if (msg.size != sizeof(msg.payload.cfg)) {
				SPDK_WARNLOG("Received bad msg size\n");
				return -EIO;
			}
			memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg));
			break;
		default:
			SPDK_WARNLOG("Received unexpected msg type\n");
			return -EBADMSG;
		}
	}

	return 0;
}

/**
 * Set up environment to talk with a vhost user backend.
 *
 * @return
 *   - (0) on success;
 *   - negative errno on failure.
 */
static int
vhost_user_setup(struct virtio_user_dev *dev)
{
	int fd;
	int flag;
	struct sockaddr_un un;
	ssize_t rc;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0) {
		SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno));
		return -errno;
	}

	flag = fcntl(fd, F_GETFD);
	if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) {
		SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno));
	}

	memset(&un, 0, sizeof(un));
	un.sun_family = AF_UNIX;
	rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path);
	if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) {
		SPDK_ERRLOG("socket path too long\n");
		close(fd);
		if (rc < 0) {
			return -errno;
		} else {
			return -EINVAL;
		}
	}
	if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
		SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno));
		close(fd);
		return -errno;
	}

	dev->vhostfd = fd;
	return 0;
}

struct virtio_user_backend_ops ops_user = {
	.setup = vhost_user_setup,
	.send_request = vhost_user_sock,
};

static int
virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;

	/* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_CALL
	 * comes first, because vhost depends on this msg to allocate the
	 * virtqueue pair.
	 */
	struct vhost_vring_file file;

	file.index = queue_sel;
	file.fd = dev->callfds[queue_sel];
	return dev->ops->send_request(dev, VHOST_USER_SET_VRING_CALL, &file);
}

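/* Tell the backend where this vring lives. The addresses are process
 * virtual addresses; the backend translates them through the memory
 * table we sent with VHOST_USER_SET_MEM_TABLE.
 */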
static int
virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vring *vring = &dev->vrings[queue_sel];
	struct vhost_vring_addr addr = {
		.index = queue_sel,
		.desc_user_addr = (uint64_t)(uintptr_t)vring->desc,
		.avail_user_addr = (uint64_t)(uintptr_t)vring->avail,
		.used_user_addr = (uint64_t)(uintptr_t)vring->used,
		.log_guest_addr = 0,
		.flags = 0, /* disable log */
	};

	return dev->ops->send_request(dev, VHOST_USER_SET_VRING_ADDR, &addr);
}

static int
virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_vring_file file;
	struct vhost_vring_state state;
	struct vring *vring = &dev->vrings[queue_sel];
	int rc;

	state.index = queue_sel;
	state.num = vring->num;
	rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_NUM, &state);
	if (rc < 0) {
		return rc;
	}

	state.index = queue_sel;
	state.num = 0; /* no reservation */
	rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_BASE, &state);
	if (rc < 0) {
		return rc;
	}

	rc = virtio_user_set_vring_addr(vdev, queue_sel);
	if (rc < 0) {
		return rc;
	}

	/* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK
	 * comes last, because vhost uses this msg to judge whether virtio
	 * is ready.
	 */
	file.index = queue_sel;
	file.fd = dev->kickfds[queue_sel];
	return dev->ops->send_request(dev, VHOST_USER_SET_VRING_KICK, &file);
}

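/* Per the vhost-user spec, VHOST_USER_GET_VRING_BASE has the side effect
 * of stopping the ring: the backend must stop processing it before
 * replying with the last avail index.
 */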
static int
virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_vring_state state;

	state.index = queue_sel;
	state.num = 0;

	return dev->ops->send_request(dev, VHOST_USER_GET_VRING_BASE, &state);
}

static int
virtio_user_queue_setup(struct virtio_dev *vdev,
			int (*fn)(struct virtio_dev *, uint32_t))
{
	uint32_t i;
	int rc;

	for (i = 0; i < vdev->max_queues; ++i) {
		rc = fn(vdev, i);
		if (rc < 0) {
			SPDK_ERRLOG("setup of vq %"PRIu32" failed\n", i);
			return rc;
		}
	}

	return 0;
}

static int
virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
		       enum spdk_mem_map_notify_action action,
		       void *vaddr, size_t size)
{
	struct virtio_dev *vdev = cb_ctx;
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t features;
	int ret;

	/* We have to resend all mappings anyway, so don't bother with any
	 * page tracking.
	 */
	ret = dev->ops->send_request(dev, VHOST_USER_SET_MEM_TABLE, NULL);
	if (ret < 0) {
		return ret;
	}

	/* Since we might want to use that mapping straight away, we have to
	 * make sure the backend has already processed our SET_MEM_TABLE
	 * message. F_REPLY_ACK is just a feature and the host is not obliged
	 * to support it, so we send a simple message that always has a
	 * response and we wait for that response. Messages are always
	 * processed in order.
	 */
	return dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
}

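/* Allocate an spdk_mem_map purely for its notify callback: every memory
 * registration change triggers virtio_user_map_notify(), which resends
 * the whole memory table to the backend.
 */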
static int
virtio_user_register_mem(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	const struct spdk_mem_map_ops virtio_user_map_ops = {
		.notify_cb = virtio_user_map_notify,
		.are_contiguous = NULL
	};

	dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
	if (dev->mem_map == NULL) {
		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
		return -ENOMEM;
	}

	return 0;
}

static void
virtio_user_unregister_mem(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;

	spdk_mem_map_free(&dev->mem_map);
}

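/* Bring the device up: negotiate the queue count with the backend,
 * create the virtqueues, register memory, and finally kick each queue.
 */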
static int
virtio_user_start_device(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t host_max_queues;
	int ret;

	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
	    vdev->max_queues > 1 + vdev->fixed_queues_num) {
		SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
			     "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
			     "Only one request queue will be used.\n",
			     vdev->name, vdev->max_queues - vdev->fixed_queues_num);
		vdev->max_queues = 1 + vdev->fixed_queues_num;
	}

	/* negotiate the number of I/O queues. */
	ret = dev->ops->send_request(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
	if (ret < 0) {
		return ret;
	}

	if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
		SPDK_WARNLOG("%s: requested %"PRIu16" request queues, "
			     "but only %"PRIu64" available\n",
			     vdev->name, vdev->max_queues - vdev->fixed_queues_num,
			     host_max_queues);
		vdev->max_queues = host_max_queues;
	}

	/* tell vhost to create queues */
	ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
	if (ret < 0) {
		return ret;
	}

	ret = virtio_user_register_mem(vdev);
	if (ret < 0) {
		return ret;
	}

	return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
}

static int
virtio_user_stop_device(struct virtio_dev *vdev)
{
	int ret;

	ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
	/* a queue might fail to stop for various reasons, e.g. socket
	 * connection going down, but this mustn't prevent us from freeing
	 * the mem map.
	 */
	virtio_user_unregister_mem(vdev);
	return ret;
}

static int
virtio_user_dev_setup(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint16_t i;

	dev->vhostfd = -1;

	for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
		dev->callfds[i] = -1;
		dev->kickfds[i] = -1;
	}

	dev->ops = &ops_user;

	return dev->ops->setup(dev);
}

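/* Device config access requires VHOST_USER_PROTOCOL_F_CONFIG. A read
 * fetches the whole config region and copies out the requested slice.
 */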
static int
virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
			    void *dst, int length)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_user_config cfg = {0};
	int rc;

	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
		return -ENOTSUP;
	}

	cfg.offset = 0;
	cfg.size = VHOST_USER_MAX_CONFIG_SIZE;

	rc = dev->ops->send_request(dev, VHOST_USER_GET_CONFIG, &cfg);
	if (rc < 0) {
		SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
		return rc;
	}

	memcpy(dst, cfg.region + offset, length);
	return 0;
}

static int
virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
			     const void *src, int length)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_user_config cfg = {0};
	int rc;

	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
		return -ENOTSUP;
	}

	cfg.offset = offset;
	cfg.size = length;
	memcpy(cfg.region, src, length);

	rc = dev->ops->send_request(dev, VHOST_USER_SET_CONFIG, &cfg);
	if (rc < 0) {
		SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
		return rc;
	}

	return 0;
}

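/* Status writes drive the device state machine: DRIVER_OK starts the
 * device, a reset while DRIVER_OK stops it, and any failure latches
 * VIRTIO_CONFIG_S_NEEDS_RESET until the driver resets the device.
 */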
static void
virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
{
	struct virtio_user_dev *dev = vdev->ctx;
	int rc = 0;

	if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
	    status != VIRTIO_CONFIG_S_RESET) {
		rc = -1;
	} else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
		rc = virtio_user_start_device(vdev);
	} else if (status == VIRTIO_CONFIG_S_RESET &&
		   (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
		rc = virtio_user_stop_device(vdev);
	}

	if (rc != 0) {
		dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
	} else {
		dev->status = status;
	}
}

static uint8_t
virtio_user_get_status(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;

	return dev->status;
}

static uint64_t
virtio_user_get_features(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t features;
	int rc;

	rc = dev->ops->send_request(dev, VHOST_USER_GET_FEATURES, &features);
	if (rc < 0) {
		SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
		return 0;
	}

	return features;
}

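/* Acknowledge our device feature set and, if VHOST_USER_F_PROTOCOL_FEATURES
 * was negotiated, additionally negotiate the protocol-feature subset we
 * support (see VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES).
 */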
static int
virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t protocol_features;
	int ret;

	ret = dev->ops->send_request(dev, VHOST_USER_SET_FEATURES, &features);
	if (ret < 0) {
		return ret;
	}

	vdev->negotiated_features = features;
	vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);

	if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
		/* nothing else to do */
		return 0;
	}

	ret = dev->ops->send_request(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
	if (ret < 0) {
		return ret;
	}

	protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
	ret = dev->ops->send_request(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
	if (ret < 0) {
		return ret;
	}

	dev->protocol_features = protocol_features;
	return 0;
}

static uint16_t
virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
{
	struct virtio_user_dev *dev = vdev->ctx;

	/* Currently each queue has the same queue size */
	return dev->queue_size;
}

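/* Allocate the vring memory and the eventfd pair for one virtqueue and,
 * when protocol features are in use, explicitly enable the ring.
 */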
static int
virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_vring_state state;
	uint16_t queue_idx = vq->vq_queue_index;
	void *queue_mem;
	uint64_t desc_addr, avail_addr, used_addr;
	int callfd, kickfd, rc;

	if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
		SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
		return -EEXIST;
	}

	/* We could pass an invalid fd here, but some backends use the kickfd
	 * and callfd as criteria to judge whether the device is alive, so we
	 * use real eventfds.
	 */
	callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	if (callfd < 0) {
		SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno));
		return -errno;
	}

	kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	if (kickfd < 0) {
		SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno));
		close(callfd);
		return -errno;
	}

	queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
				 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (queue_mem == NULL) {
		close(kickfd);
		close(callfd);
		return -ENOMEM;
	}

	vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
	vq->vq_ring_virt_mem = queue_mem;

	state.index = vq->vq_queue_index;
	state.num = vq->vq_nentries;

	if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
		rc = dev->ops->send_request(dev, VHOST_USER_SET_VRING_ENABLE, &state);
		if (rc < 0) {
			SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
				    spdk_strerror(-rc));
			close(kickfd);
			close(callfd);
			spdk_free(queue_mem);
			return rc;
		}
	}

	dev->callfds[queue_idx] = callfd;
	dev->kickfds[queue_idx] = kickfd;

	desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
	used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
				    ring[vq->vq_nentries]),
				    VIRTIO_PCI_VRING_ALIGN);

	dev->vrings[queue_idx].num = vq->vq_nentries;
	dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
	dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
	dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;

	return 0;
}

static void
virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
{
	/* For legacy devices, the driver writes 0 to the VIRTIO_PCI_QUEUE_PFN
	 * port; QEMU correspondingly stops the ioeventfds and resets the
	 * status of the device. For modern devices, the driver sets the
	 * queue desc, avail, and used addresses in the PCI bar to 0, with
	 * no further visible behavior in QEMU.
	 *
	 * Here we just care about what information to deliver to vhost-user.
	 * So we just close ioeventfd for now.
	 */
	struct virtio_user_dev *dev = vdev->ctx;

	close(dev->callfds[vq->vq_queue_index]);
	close(dev->kickfds[vq->vq_queue_index]);
	dev->callfds[vq->vq_queue_index] = -1;
	dev->kickfds[vq->vq_queue_index] = -1;

	spdk_free(vq->vq_ring_virt_mem);
}

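/* Kick the backend: an eventfd write is the vhost-user equivalent of a
 * notify register write. eventfd requires a full 8-byte counter value.
 */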
static void
virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
{
	uint64_t buf = 1;
	struct virtio_user_dev *dev = vdev->ctx;

	if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
		SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
	}
}

static void
virtio_user_destroy(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;

	if (dev->vhostfd >= 0) {
		close(dev->vhostfd);
	}
	free(dev);
}

static void
virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
{
	struct virtio_user_dev *dev = vdev->ctx;

	spdk_json_write_named_string(w, "type", "user");
	spdk_json_write_named_string(w, "socket", dev->path);
}

static void
virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
{
	struct virtio_user_dev *dev = vdev->ctx;

	spdk_json_write_named_string(w, "trtype", "user");
	spdk_json_write_named_string(w, "traddr", dev->path);
	spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
	spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
}

static const struct virtio_dev_ops virtio_user_ops = {
	.read_dev_cfg	= virtio_user_read_dev_config,
	.write_dev_cfg	= virtio_user_write_dev_config,
	.get_status	= virtio_user_get_status,
	.set_status	= virtio_user_set_status,
	.get_features	= virtio_user_get_features,
	.set_features	= virtio_user_set_features,
	.destruct_dev	= virtio_user_destroy,
	.get_queue_size	= virtio_user_get_queue_size,
	.setup_queue	= virtio_user_setup_queue,
	.del_queue	= virtio_user_del_queue,
	.notify_queue	= virtio_user_notify_queue,
	.dump_json_info = virtio_user_dump_json_info,
	.write_json_config = virtio_user_write_json_config,
};

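/* Public constructor: allocate the vhost-user context, attach it to the
 * generic virtio_dev, connect to the backend socket at `path`, and claim
 * ownership of the vhost session with VHOST_USER_SET_OWNER.
 */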
int
virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
		     uint32_t queue_size)
{
	struct virtio_user_dev *dev;
	int rc;

	if (name == NULL) {
		SPDK_ERRLOG("No name given for controller: %s\n", path);
		return -EINVAL;
	}

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -ENOMEM;
	}

	rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to init device: %s\n", path);
		free(dev);
		return rc;
	}

	vdev->is_hw = 0;

	snprintf(dev->path, PATH_MAX, "%s", path);
	dev->queue_size = queue_size;

	rc = virtio_user_dev_setup(vdev);
	if (rc < 0) {
		SPDK_ERRLOG("backend setup failed\n");
		goto err;
	}

	rc = dev->ops->send_request(dev, VHOST_USER_SET_OWNER, NULL);
	if (rc < 0) {
		SPDK_ERRLOG("set_owner failed: %s\n", spdk_strerror(-rc));
		goto err;
	}

	return 0;

err:
	virtio_dev_destruct(vdev);
	return rc;
}
SPDK_LOG_REGISTER_COMPONENT(virtio_user)