xref: /spdk/lib/virtio/virtio_vhost_user.c (revision 84ac072e2c3f7df651edae9122b219e97445ac82)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright(c) 2010-2016 Intel Corporation. All rights reserved.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include <sys/eventfd.h>
9 
10 #include "spdk/string.h"
11 #include "spdk/config.h"
12 #include "spdk/util.h"
13 
14 #include "spdk_internal/virtio.h"
15 #include "spdk_internal/vhost_user.h"
16 
17 /* The version of the protocol we support */
18 #define VHOST_USER_VERSION    0x1
19 
20 #define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \
21 	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
22 	(1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
23 
/* Per-device context for a virtio device backed by a vhost-user unix socket. */
struct virtio_user_dev {
	int		vhostfd;	/* connected AF_UNIX socket to the backend; -1 until vhost_user_setup() succeeds */

	int		callfds[SPDK_VIRTIO_MAX_VIRTQUEUES];	/* eventfds the backend signals on used-ring updates; -1 if unused */
	int		kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES];	/* eventfds we write to kick the backend; -1 if unused */
	uint32_t	queue_size;	/* entries per virtqueue; the same for all queues */

	uint8_t		status;		/* cached VIRTIO_CONFIG_S_* device status byte */
	char		path[PATH_MAX];	/* filesystem path of the vhost-user socket */
	uint64_t	protocol_features;	/* negotiated VHOST_USER_PROTOCOL_F_* bits */
	struct vring	vrings[SPDK_VIRTIO_MAX_VIRTQUEUES];	/* local view of each virtqueue's rings */
	struct spdk_mem_map *mem_map;	/* map used to mirror SPDK memory registrations to the backend */
};
37 
/* Send len bytes from buf over fd; when fd_num > 0, attach the given file
 * descriptors as SCM_RIGHTS ancillary data. Returns 0 on success or a
 * negative errno on failure.
 */
static int
vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
{
	struct msghdr hdr;
	struct iovec payload;
	struct cmsghdr *chdr;
	size_t fds_bytes = fd_num * sizeof(int);
	char cmsg_buf[CMSG_SPACE(fds_bytes)];
	int rc;

	memset(&hdr, 0, sizeof(hdr));
	memset(cmsg_buf, 0, sizeof(cmsg_buf));

	payload.iov_base = buf;
	payload.iov_len = len;
	hdr.msg_iov = &payload;
	hdr.msg_iovlen = 1;

	if (fds != NULL && fd_num > 0) {
		/* pass the fds to the peer via SCM_RIGHTS */
		hdr.msg_control = cmsg_buf;
		hdr.msg_controllen = sizeof(cmsg_buf);
		chdr = CMSG_FIRSTHDR(&hdr);
		chdr->cmsg_len = CMSG_LEN(fds_bytes);
		chdr->cmsg_level = SOL_SOCKET;
		chdr->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(chdr), fds, fds_bytes);
	}

	/* restart if interrupted by a signal */
	do {
		rc = sendmsg(fd, &hdr, 0);
	} while (rc < 0 && errno == EINTR);

	return rc == -1 ? -errno : 0;
}
80 
81 static int
82 vhost_user_read(int fd, struct vhost_user_msg *msg)
83 {
84 	uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
85 	ssize_t ret;
86 	size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;
87 
88 	ret = recv(fd, (void *)msg, sz_hdr, 0);
89 	if ((size_t)ret != sz_hdr) {
90 		SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n",
91 			     ret, sz_hdr);
92 		if (ret == -1) {
93 			return -errno;
94 		} else {
95 			return -EBUSY;
96 		}
97 	}
98 
99 	/* validate msg flags */
100 	if (msg->flags != (valid_flags)) {
101 		SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n",
102 			     msg->flags, valid_flags);
103 		return -EIO;
104 	}
105 
106 	sz_payload = msg->size;
107 
108 	if (sz_payload > VHOST_USER_PAYLOAD_SIZE) {
109 		SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n",
110 			     sz_payload, VHOST_USER_PAYLOAD_SIZE);
111 		return -EIO;
112 	}
113 
114 	if (sz_payload) {
115 		ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
116 		if ((size_t)ret != sz_payload) {
117 			SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n",
118 				     ret, msg->size);
119 			if (ret == -1) {
120 				return -errno;
121 			} else {
122 				return -EBUSY;
123 			}
124 		}
125 	}
126 
127 	return 0;
128 }
129 
/* One contiguous hugepage-backed mapping parsed from /proc/self/maps. */
struct hugepage_file_info {
	uint64_t addr;            /**< virtual addr */
	size_t   size;            /**< the file size */
	char     path[PATH_MAX];  /**< path to backing file */
};
135 
136 /* Two possible options:
137  * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
138  * array. This is simple but cannot be used in secondary process because
139  * secondary process will close and munmap that file.
140  * 2. Match HUGEFILE_FMT to find hugepage files directly.
141  *
142  * We choose option 2.
143  */
144 static int
145 get_hugepage_file_info(struct hugepage_file_info hugepages[], int max)
146 {
147 	int idx, rc;
148 	FILE *f;
149 	char buf[BUFSIZ], *tmp, *tail;
150 	char *str_underline, *str_start;
151 	int huge_index;
152 	uint64_t v_start, v_end;
153 
154 	f = fopen("/proc/self/maps", "r");
155 	if (!f) {
156 		SPDK_ERRLOG("cannot open /proc/self/maps\n");
157 		rc = -errno;
158 		assert(rc < 0); /* scan-build hack */
159 		return rc;
160 	}
161 
162 	idx = 0;
163 	while (fgets(buf, sizeof(buf), f) != NULL) {
164 		if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
165 			SPDK_ERRLOG("Failed to parse address\n");
166 			rc = -EIO;
167 			goto out;
168 		}
169 
170 		tmp = strchr(buf, ' ') + 1; /** skip address */
171 		tmp = strchr(tmp, ' ') + 1; /** skip perm */
172 		tmp = strchr(tmp, ' ') + 1; /** skip offset */
173 		tmp = strchr(tmp, ' ') + 1; /** skip dev */
174 		tmp = strchr(tmp, ' ') + 1; /** skip inode */
175 		while (*tmp == ' ') {       /** skip spaces */
176 			tmp++;
177 		}
178 		tail = strrchr(tmp, '\n');  /** remove newline if exists */
179 		if (tail) {
180 			*tail = '\0';
181 		}
182 
183 		/* Match HUGEFILE_FMT, aka "%s/%smap_%d",
184 		 * which is defined in eal_filesystem.h
185 		 */
186 		str_underline = strrchr(tmp, '_');
187 		if (!str_underline) {
188 			continue;
189 		}
190 
191 		str_start = str_underline - strlen("map");
192 		if (str_start < tmp) {
193 			continue;
194 		}
195 
196 		if (sscanf(str_start, "map_%d", &huge_index) != 1) {
197 			continue;
198 		}
199 
200 		if (idx >= max) {
201 			SPDK_ERRLOG("Exceed maximum of %d\n", max);
202 			rc = -ENOSPC;
203 			goto out;
204 		}
205 
206 		if (idx > 0 &&
207 		    strncmp(tmp, hugepages[idx - 1].path, PATH_MAX) == 0 &&
208 		    v_start == hugepages[idx - 1].addr + hugepages[idx - 1].size) {
209 			hugepages[idx - 1].size += (v_end - v_start);
210 			continue;
211 		}
212 
213 		hugepages[idx].addr = v_start;
214 		hugepages[idx].size = v_end - v_start;
215 		snprintf(hugepages[idx].path, PATH_MAX, "%s", tmp);
216 		idx++;
217 	}
218 
219 	rc = idx;
220 out:
221 	fclose(f);
222 	return rc;
223 }
224 
225 static int
226 prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
227 {
228 	int i, num;
229 	struct hugepage_file_info hugepages[VHOST_USER_MEMORY_MAX_NREGIONS];
230 
231 	num = get_hugepage_file_info(hugepages, VHOST_USER_MEMORY_MAX_NREGIONS);
232 	if (num < 0) {
233 		SPDK_ERRLOG("Failed to prepare memory for vhost-user\n");
234 		return num;
235 	}
236 
237 	for (i = 0; i < num; ++i) {
238 		/* the memory regions are unaligned */
239 		msg->payload.memory.regions[i].guest_phys_addr = hugepages[i].addr; /* use vaddr! */
240 		msg->payload.memory.regions[i].userspace_addr = hugepages[i].addr;
241 		msg->payload.memory.regions[i].memory_size = hugepages[i].size;
242 		msg->payload.memory.regions[i].flags_padding = 0;
243 		fds[i] = open(hugepages[i].path, O_RDWR);
244 	}
245 
246 	msg->payload.memory.nregions = num;
247 	msg->payload.memory.padding = 0;
248 
249 	return 0;
250 }
251 
/* Human-readable names for vhost-user requests, indexed by
 * enum vhost_user_request; used only for debug/error logging.
 */
static const char *const vhost_msg_strings[VHOST_USER_MAX] = {
	[VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER",
	[VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES",
	[VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
	[VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
};
271 
272 static int
273 vhost_user_sock(struct virtio_user_dev *dev,
274 		enum vhost_user_request req,
275 		void *arg)
276 {
277 	struct vhost_user_msg msg;
278 	struct vhost_vring_file *file = 0;
279 	int need_reply = 0;
280 	int fds[VHOST_USER_MEMORY_MAX_NREGIONS];
281 	int fd_num = 0;
282 	int i, len, rc;
283 	int vhostfd = dev->vhostfd;
284 
285 	SPDK_DEBUGLOG(virtio_user, "sent message %d = %s\n", req, vhost_msg_strings[req]);
286 
287 	msg.request = req;
288 	msg.flags = VHOST_USER_VERSION;
289 	msg.size = 0;
290 
291 	switch (req) {
292 	case VHOST_USER_GET_FEATURES:
293 	case VHOST_USER_GET_PROTOCOL_FEATURES:
294 	case VHOST_USER_GET_QUEUE_NUM:
295 		need_reply = 1;
296 		break;
297 
298 	case VHOST_USER_SET_FEATURES:
299 	case VHOST_USER_SET_LOG_BASE:
300 	case VHOST_USER_SET_PROTOCOL_FEATURES:
301 		msg.payload.u64 = *((__u64 *)arg);
302 		msg.size = sizeof(msg.payload.u64);
303 		break;
304 
305 	case VHOST_USER_SET_OWNER:
306 	case VHOST_USER_RESET_OWNER:
307 		break;
308 
309 	case VHOST_USER_SET_MEM_TABLE:
310 		rc = prepare_vhost_memory_user(&msg, fds);
311 		if (rc < 0) {
312 			return rc;
313 		}
314 		fd_num = msg.payload.memory.nregions;
315 		msg.size = sizeof(msg.payload.memory.nregions);
316 		msg.size += sizeof(msg.payload.memory.padding);
317 		msg.size += fd_num * sizeof(struct vhost_memory_region);
318 		break;
319 
320 	case VHOST_USER_SET_LOG_FD:
321 		fds[fd_num++] = *((int *)arg);
322 		break;
323 
324 	case VHOST_USER_SET_VRING_NUM:
325 	case VHOST_USER_SET_VRING_BASE:
326 	case VHOST_USER_SET_VRING_ENABLE:
327 		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
328 		msg.size = sizeof(msg.payload.state);
329 		break;
330 
331 	case VHOST_USER_GET_VRING_BASE:
332 		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
333 		msg.size = sizeof(msg.payload.state);
334 		need_reply = 1;
335 		break;
336 
337 	case VHOST_USER_SET_VRING_ADDR:
338 		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
339 		msg.size = sizeof(msg.payload.addr);
340 		break;
341 
342 	case VHOST_USER_SET_VRING_KICK:
343 	case VHOST_USER_SET_VRING_CALL:
344 	case VHOST_USER_SET_VRING_ERR:
345 		file = arg;
346 		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
347 		msg.size = sizeof(msg.payload.u64);
348 		if (file->fd > 0) {
349 			fds[fd_num++] = file->fd;
350 		} else {
351 			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
352 		}
353 		break;
354 
355 	case VHOST_USER_GET_CONFIG:
356 		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
357 		msg.size = sizeof(msg.payload.cfg);
358 		need_reply = 1;
359 		break;
360 
361 	case VHOST_USER_SET_CONFIG:
362 		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
363 		msg.size = sizeof(msg.payload.cfg);
364 		break;
365 
366 	default:
367 		SPDK_ERRLOG("trying to send unknown msg\n");
368 		return -EINVAL;
369 	}
370 
371 	len = VHOST_USER_HDR_SIZE + msg.size;
372 	rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num);
373 	if (rc < 0) {
374 		SPDK_ERRLOG("%s failed: %s\n",
375 			    vhost_msg_strings[req], spdk_strerror(-rc));
376 		return rc;
377 	}
378 
379 	if (req == VHOST_USER_SET_MEM_TABLE)
380 		for (i = 0; i < fd_num; ++i) {
381 			close(fds[i]);
382 		}
383 
384 	if (need_reply) {
385 		rc = vhost_user_read(vhostfd, &msg);
386 		if (rc < 0) {
387 			SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc));
388 			return rc;
389 		}
390 
391 		if (req != msg.request) {
392 			SPDK_WARNLOG("Received unexpected msg type\n");
393 			return -EIO;
394 		}
395 
396 		switch (req) {
397 		case VHOST_USER_GET_FEATURES:
398 		case VHOST_USER_GET_PROTOCOL_FEATURES:
399 		case VHOST_USER_GET_QUEUE_NUM:
400 			if (msg.size != sizeof(msg.payload.u64)) {
401 				SPDK_WARNLOG("Received bad msg size\n");
402 				return -EIO;
403 			}
404 			*((__u64 *)arg) = msg.payload.u64;
405 			break;
406 		case VHOST_USER_GET_VRING_BASE:
407 			if (msg.size != sizeof(msg.payload.state)) {
408 				SPDK_WARNLOG("Received bad msg size\n");
409 				return -EIO;
410 			}
411 			memcpy(arg, &msg.payload.state,
412 			       sizeof(struct vhost_vring_state));
413 			break;
414 		case VHOST_USER_GET_CONFIG:
415 			if (msg.size != sizeof(msg.payload.cfg)) {
416 				SPDK_WARNLOG("Received bad msg size\n");
417 				return -EIO;
418 			}
419 			memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg));
420 			break;
421 		default:
422 			SPDK_WARNLOG("Received unexpected msg type\n");
423 			return -EBADMSG;
424 		}
425 	}
426 
427 	return 0;
428 }
429 
430 /**
431  * Set up environment to talk with a vhost user backend.
432  *
433  * @return
434  *   - (-1) if fail;
435  *   - (0) if succeed.
436  */
437 static int
438 vhost_user_setup(struct virtio_user_dev *dev)
439 {
440 	int fd;
441 	int flag;
442 	struct sockaddr_un un;
443 	ssize_t rc;
444 
445 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
446 	if (fd < 0) {
447 		SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno));
448 		return -errno;
449 	}
450 
451 	flag = fcntl(fd, F_GETFD);
452 	if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) {
453 		SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno));
454 	}
455 
456 	memset(&un, 0, sizeof(un));
457 	un.sun_family = AF_UNIX;
458 	rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path);
459 	if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) {
460 		SPDK_ERRLOG("socket path too long\n");
461 		close(fd);
462 		if (rc < 0) {
463 			return -errno;
464 		} else {
465 			return -EINVAL;
466 		}
467 	}
468 	if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
469 		SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno));
470 		close(fd);
471 		return -errno;
472 	}
473 
474 	dev->vhostfd = fd;
475 	return 0;
476 }
477 
478 static int
479 virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel)
480 {
481 	struct virtio_user_dev *dev = vdev->ctx;
482 
483 	/* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come
484 	 * firstly because vhost depends on this msg to allocate virtqueue
485 	 * pair.
486 	 */
487 	struct vhost_vring_file file;
488 
489 	file.index = queue_sel;
490 	file.fd = dev->callfds[queue_sel];
491 	return vhost_user_sock(dev, VHOST_USER_SET_VRING_CALL, &file);
492 }
493 
494 static int
495 virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel)
496 {
497 	struct virtio_user_dev *dev = vdev->ctx;
498 	struct vring *vring = &dev->vrings[queue_sel];
499 	struct vhost_vring_addr addr = {
500 		.index = queue_sel,
501 		.desc_user_addr = (uint64_t)(uintptr_t)vring->desc,
502 		.avail_user_addr = (uint64_t)(uintptr_t)vring->avail,
503 		.used_user_addr = (uint64_t)(uintptr_t)vring->used,
504 		.log_guest_addr = 0,
505 		.flags = 0, /* disable log */
506 	};
507 
508 	return vhost_user_sock(dev, VHOST_USER_SET_VRING_ADDR, &addr);
509 }
510 
511 static int
512 virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel)
513 {
514 	struct virtio_user_dev *dev = vdev->ctx;
515 	struct vhost_vring_file file;
516 	struct vhost_vring_state state;
517 	struct vring *vring = &dev->vrings[queue_sel];
518 	int rc;
519 
520 	state.index = queue_sel;
521 	state.num = vring->num;
522 	rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_NUM, &state);
523 	if (rc < 0) {
524 		return rc;
525 	}
526 
527 	state.index = queue_sel;
528 	state.num = 0; /* no reservation */
529 	rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_BASE, &state);
530 	if (rc < 0) {
531 		return rc;
532 	}
533 
534 	virtio_user_set_vring_addr(vdev, queue_sel);
535 
536 	/* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes
537 	 * lastly because vhost depends on this msg to judge if
538 	 * virtio is ready.
539 	 */
540 	file.index = queue_sel;
541 	file.fd = dev->kickfds[queue_sel];
542 	return vhost_user_sock(dev, VHOST_USER_SET_VRING_KICK, &file);
543 }
544 
545 static int
546 virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
547 {
548 	struct virtio_user_dev *dev = vdev->ctx;
549 	struct vhost_vring_state state;
550 
551 	state.index = queue_sel;
552 	state.num = 0;
553 
554 	return vhost_user_sock(dev, VHOST_USER_GET_VRING_BASE, &state);
555 }
556 
557 static int
558 virtio_user_queue_setup(struct virtio_dev *vdev,
559 			int (*fn)(struct virtio_dev *, uint32_t))
560 {
561 	uint32_t i;
562 	int rc;
563 
564 	for (i = 0; i < vdev->max_queues; ++i) {
565 		rc = fn(vdev, i);
566 		if (rc < 0) {
567 			SPDK_ERRLOG("setup tx vq fails: %"PRIu32".\n", i);
568 			return rc;
569 		}
570 	}
571 
572 	return 0;
573 }
574 
/* spdk_mem_map notify callback: mirror SPDK memory registrations to the
 * vhost-user backend by resending the whole memory table.
 *
 * action/vaddr/size are unused because the full table is resent on every
 * change; map is the dev's mem map and cb_ctx is the owning virtio_dev.
 * Returns 0 on success, negative errno on failure.
 */
static int
virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
		       enum spdk_mem_map_notify_action action,
		       void *vaddr, size_t size)
{
	struct virtio_dev *vdev = cb_ctx;
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t features;
	int ret;

	/* We have to resend all mappings anyway, so don't bother with any
	 * page tracking.
	 */
	ret = vhost_user_sock(dev, VHOST_USER_SET_MEM_TABLE, NULL);
	if (ret < 0) {
		return ret;
	}

	/* Since we might want to use that mapping straight away, we have to
	 * make sure the guest has already processed our SET_MEM_TABLE message.
	 * F_REPLY_ACK is just a feature and the host is not obliged to
	 * support it, so we send a simple message that always has a response
	 * and we wait for that response. Messages are always processed in order.
	 */
	return vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
}
601 
602 static int
603 virtio_user_register_mem(struct virtio_dev *vdev)
604 {
605 	struct virtio_user_dev *dev = vdev->ctx;
606 	const struct spdk_mem_map_ops virtio_user_map_ops = {
607 		.notify_cb = virtio_user_map_notify,
608 		.are_contiguous = NULL
609 	};
610 
611 	dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
612 	if (dev->mem_map == NULL) {
613 		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
614 		return -1;
615 	}
616 
617 	return 0;
618 }
619 
620 static void
621 virtio_user_unregister_mem(struct virtio_dev *vdev)
622 {
623 	struct virtio_user_dev *dev = vdev->ctx;
624 
625 	spdk_mem_map_free(&dev->mem_map);
626 }
627 
628 static int
629 virtio_user_start_device(struct virtio_dev *vdev)
630 {
631 	struct virtio_user_dev *dev = vdev->ctx;
632 	uint64_t host_max_queues;
633 	int ret;
634 
635 	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
636 	    vdev->max_queues > 1 + vdev->fixed_queues_num) {
637 		SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
638 			     "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
639 			     "Only one request queue will be used.\n",
640 			     vdev->name, vdev->max_queues - vdev->fixed_queues_num);
641 		vdev->max_queues = 1 + vdev->fixed_queues_num;
642 	}
643 
644 	/* negotiate the number of I/O queues. */
645 	ret = vhost_user_sock(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
646 	if (ret < 0) {
647 		return ret;
648 	}
649 
650 	if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
651 		SPDK_WARNLOG("%s: requested %"PRIu16" request queues"
652 			     "but only %"PRIu64" available\n",
653 			     vdev->name, vdev->max_queues - vdev->fixed_queues_num,
654 			     host_max_queues);
655 		vdev->max_queues = host_max_queues;
656 	}
657 
658 	/* tell vhost to create queues */
659 	ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
660 	if (ret < 0) {
661 		return ret;
662 	}
663 
664 	ret = virtio_user_register_mem(vdev);
665 	if (ret < 0) {
666 		return ret;
667 	}
668 
669 	return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
670 }
671 
672 static int
673 virtio_user_stop_device(struct virtio_dev *vdev)
674 {
675 	int ret;
676 
677 	ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
678 	/* a queue might fail to stop for various reasons, e.g. socket
679 	 * connection going down, but this mustn't prevent us from freeing
680 	 * the mem map.
681 	 */
682 	virtio_user_unregister_mem(vdev);
683 	return ret;
684 }
685 
686 static int
687 virtio_user_dev_setup(struct virtio_dev *vdev)
688 {
689 	struct virtio_user_dev *dev = vdev->ctx;
690 	uint16_t i;
691 
692 	dev->vhostfd = -1;
693 
694 	for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
695 		dev->callfds[i] = -1;
696 		dev->kickfds[i] = -1;
697 	}
698 
699 	return vhost_user_setup(dev);
700 }
701 
702 static int
703 virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
704 			    void *dst, int length)
705 {
706 	struct virtio_user_dev *dev = vdev->ctx;
707 	struct vhost_user_config cfg = {0};
708 	int rc;
709 
710 	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
711 		return -ENOTSUP;
712 	}
713 
714 	cfg.offset = 0;
715 	cfg.size = VHOST_USER_MAX_CONFIG_SIZE;
716 
717 	rc = vhost_user_sock(dev, VHOST_USER_GET_CONFIG, &cfg);
718 	if (rc < 0) {
719 		SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
720 		return rc;
721 	}
722 
723 	memcpy(dst, cfg.region + offset, length);
724 	return 0;
725 }
726 
727 static int
728 virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
729 			     const void *src, int length)
730 {
731 	struct virtio_user_dev *dev = vdev->ctx;
732 	struct vhost_user_config cfg = {0};
733 	int rc;
734 
735 	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
736 		return -ENOTSUP;
737 	}
738 
739 	cfg.offset = offset;
740 	cfg.size = length;
741 	memcpy(cfg.region, src, length);
742 
743 	rc = vhost_user_sock(dev, VHOST_USER_SET_CONFIG, &cfg);
744 	if (rc < 0) {
745 		SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
746 		return rc;
747 	}
748 
749 	return 0;
750 }
751 
/* Device status write-hook implementing the VIRTIO status byte protocol:
 *  - once NEEDS_RESET is latched, anything other than a reset is rejected;
 *  - setting DRIVER_OK starts the backend device;
 *  - writing 0 (RESET) while DRIVER_OK was set stops it.
 * Any failure latches VIRTIO_CONFIG_S_NEEDS_RESET instead of storing status.
 */
static void
virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
{
	struct virtio_user_dev *dev = vdev->ctx;
	int rc = 0;

	/* a device that needs reset ignores everything but the reset itself */
	if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
	    status != VIRTIO_CONFIG_S_RESET) {
		rc = -1;
	} else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
		rc = virtio_user_start_device(vdev);
	} else if (status == VIRTIO_CONFIG_S_RESET &&
		   (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
		rc = virtio_user_stop_device(vdev);
	}

	if (rc != 0) {
		dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
	} else {
		dev->status = status;
	}
}
774 
775 static uint8_t
776 virtio_user_get_status(struct virtio_dev *vdev)
777 {
778 	struct virtio_user_dev *dev = vdev->ctx;
779 
780 	return dev->status;
781 }
782 
783 static uint64_t
784 virtio_user_get_features(struct virtio_dev *vdev)
785 {
786 	struct virtio_user_dev *dev = vdev->ctx;
787 	uint64_t features;
788 	int rc;
789 
790 	rc = vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
791 	if (rc < 0) {
792 		SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
793 		return 0;
794 	}
795 
796 	return features;
797 }
798 
799 static int
800 virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
801 {
802 	struct virtio_user_dev *dev = vdev->ctx;
803 	uint64_t protocol_features;
804 	int ret;
805 
806 	ret = vhost_user_sock(dev, VHOST_USER_SET_FEATURES, &features);
807 	if (ret < 0) {
808 		return ret;
809 	}
810 
811 	vdev->negotiated_features = features;
812 	vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);
813 
814 	if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
815 		/* nothing else to do */
816 		return 0;
817 	}
818 
819 	ret = vhost_user_sock(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
820 	if (ret < 0) {
821 		return ret;
822 	}
823 
824 	protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
825 	ret = vhost_user_sock(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
826 	if (ret < 0) {
827 		return ret;
828 	}
829 
830 	dev->protocol_features = protocol_features;
831 	return 0;
832 }
833 
834 static uint16_t
835 virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
836 {
837 	struct virtio_user_dev *dev = vdev->ctx;
838 
839 	/* Currently each queue has same queue size */
840 	return dev->queue_size;
841 }
842 
843 static int
844 virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
845 {
846 	struct virtio_user_dev *dev = vdev->ctx;
847 	struct vhost_vring_state state;
848 	uint16_t queue_idx = vq->vq_queue_index;
849 	void *queue_mem;
850 	uint64_t desc_addr, avail_addr, used_addr;
851 	int callfd, kickfd, rc;
852 
853 	if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
854 		SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
855 		return -EEXIST;
856 	}
857 
858 	/* May use invalid flag, but some backend uses kickfd and
859 	 * callfd as criteria to judge if dev is alive. so finally we
860 	 * use real event_fd.
861 	 */
862 	callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
863 	if (callfd < 0) {
864 		SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno));
865 		return -errno;
866 	}
867 
868 	kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
869 	if (kickfd < 0) {
870 		SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno));
871 		close(callfd);
872 		return -errno;
873 	}
874 
875 	queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
876 				 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
877 	if (queue_mem == NULL) {
878 		close(kickfd);
879 		close(callfd);
880 		return -ENOMEM;
881 	}
882 
883 	vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
884 	vq->vq_ring_virt_mem = queue_mem;
885 
886 	state.index = vq->vq_queue_index;
887 	state.num = vq->vq_nentries;
888 
889 	if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
890 		rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_ENABLE, &state);
891 		if (rc < 0) {
892 			SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
893 				    spdk_strerror(-rc));
894 			close(kickfd);
895 			close(callfd);
896 			spdk_free(queue_mem);
897 			return -rc;
898 		}
899 	}
900 
901 	dev->callfds[queue_idx] = callfd;
902 	dev->kickfds[queue_idx] = kickfd;
903 
904 	desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
905 	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
906 	used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
907 				    ring[vq->vq_nentries]),
908 				    VIRTIO_PCI_VRING_ALIGN);
909 
910 	dev->vrings[queue_idx].num = vq->vq_nentries;
911 	dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
912 	dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
913 	dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;
914 
915 	return 0;
916 }
917 
918 static void
919 virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
920 {
921 	/* For legacy devices, write 0 to VIRTIO_PCI_QUEUE_PFN port, QEMU
922 	 * correspondingly stops the ioeventfds, and reset the status of
923 	 * the device.
924 	 * For modern devices, set queue desc, avail, used in PCI bar to 0,
925 	 * not see any more behavior in QEMU.
926 	 *
927 	 * Here we just care about what information to deliver to vhost-user.
928 	 * So we just close ioeventfd for now.
929 	 */
930 	struct virtio_user_dev *dev = vdev->ctx;
931 
932 	close(dev->callfds[vq->vq_queue_index]);
933 	close(dev->kickfds[vq->vq_queue_index]);
934 	dev->callfds[vq->vq_queue_index] = -1;
935 	dev->kickfds[vq->vq_queue_index] = -1;
936 
937 	spdk_free(vq->vq_ring_virt_mem);
938 }
939 
940 static void
941 virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
942 {
943 	uint64_t buf = 1;
944 	struct virtio_user_dev *dev = vdev->ctx;
945 
946 	if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
947 		SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
948 	}
949 }
950 
951 static void
952 virtio_user_destroy(struct virtio_dev *vdev)
953 {
954 	struct virtio_user_dev *dev = vdev->ctx;
955 
956 	close(dev->vhostfd);
957 	free(dev);
958 }
959 
960 static void
961 virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
962 {
963 	struct virtio_user_dev *dev = vdev->ctx;
964 
965 	spdk_json_write_named_string(w, "type", "user");
966 	spdk_json_write_named_string(w, "socket", dev->path);
967 }
968 
969 static void
970 virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
971 {
972 	struct virtio_user_dev *dev = vdev->ctx;
973 
974 	spdk_json_write_named_string(w, "trtype", "user");
975 	spdk_json_write_named_string(w, "traddr", dev->path);
976 	spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
977 	spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
978 }
979 
/* Backend ops vector wiring this vhost-user implementation into the generic
 * virtio transport layer (struct virtio_dev_ops).
 */
static const struct virtio_dev_ops virtio_user_ops = {
	.read_dev_cfg	= virtio_user_read_dev_config,
	.write_dev_cfg	= virtio_user_write_dev_config,
	.get_status	= virtio_user_get_status,
	.set_status	= virtio_user_set_status,
	.get_features	= virtio_user_get_features,
	.set_features	= virtio_user_set_features,
	.destruct_dev	= virtio_user_destroy,
	.get_queue_size	= virtio_user_get_queue_size,
	.setup_queue	= virtio_user_setup_queue,
	.del_queue	= virtio_user_del_queue,
	.notify_queue	= virtio_user_notify_queue,
	.dump_json_info = virtio_user_dump_json_info,
	.write_json_config = virtio_user_write_json_config,
};
995 
996 int
997 virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
998 		     uint32_t queue_size)
999 {
1000 	struct virtio_user_dev *dev;
1001 	int rc;
1002 
1003 	if (name == NULL) {
1004 		SPDK_ERRLOG("No name gived for controller: %s\n", path);
1005 		return -EINVAL;
1006 	}
1007 
1008 	dev = calloc(1, sizeof(*dev));
1009 	if (dev == NULL) {
1010 		return -ENOMEM;
1011 	}
1012 
1013 	rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
1014 	if (rc != 0) {
1015 		SPDK_ERRLOG("Failed to init device: %s\n", path);
1016 		free(dev);
1017 		return rc;
1018 	}
1019 
1020 	vdev->is_hw = 0;
1021 
1022 	snprintf(dev->path, PATH_MAX, "%s", path);
1023 	dev->queue_size = queue_size;
1024 
1025 	rc = virtio_user_dev_setup(vdev);
1026 	if (rc < 0) {
1027 		SPDK_ERRLOG("backend set up fails\n");
1028 		goto err;
1029 	}
1030 
1031 	rc = vhost_user_sock(dev, VHOST_USER_SET_OWNER, NULL);
1032 	if (rc < 0) {
1033 		SPDK_ERRLOG("set_owner fails: %s\n", spdk_strerror(-rc));
1034 		goto err;
1035 	}
1036 
1037 	return 0;
1038 
1039 err:
1040 	virtio_dev_destruct(vdev);
1041 	return rc;
1042 }
1043 SPDK_LOG_REGISTER_COMPONENT(virtio_user)
1044