xref: /spdk/lib/virtio/virtio_vhost_user.c (revision b02581a89058ebaebe03bd0e16e3b58adfe406c1)
1 /*   SPDX-License-Identifier: BSD-3-Clause
2  *   Copyright (C) 2010-2016 Intel Corporation. All rights reserved.
3  *   All rights reserved.
4  */
5 
6 #include "spdk/stdinc.h"
7 
8 #include <sys/eventfd.h>
9 
10 #include "spdk/string.h"
11 #include "spdk/config.h"
12 #include "spdk/util.h"
13 
14 #include "spdk_internal/virtio.h"
15 #include "spdk_internal/vhost_user.h"
16 
17 /* The version of the protocol we support */
18 #define VHOST_USER_VERSION    0x1
19 
20 #define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \
21 	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
22 	(1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
23 
/* Per-device context for a vhost-user backed virtio device. */
struct virtio_user_dev {
	/* Connected AF_UNIX socket to the vhost-user backend; -1 when not connected. */
	int		vhostfd;

	/* Per-virtqueue eventfds: callfds are passed via SET_VRING_CALL,
	 * kickfds via SET_VRING_KICK (and written to notify the backend).
	 * -1 marks an unused slot.
	 */
	int		callfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
	int		kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
	/* Queue depth reported for every virtqueue (same for all queues). */
	uint32_t	queue_size;

	/* Last virtio status byte written via virtio_user_set_status(). */
	uint8_t		status;
	/* Set during teardown so the final mem-map notification is permitted. */
	bool		is_stopping;
	/* Path of the backend's unix domain socket. */
	char		path[PATH_MAX];
	/* Negotiated VHOST_USER_PROTOCOL_F_* bits. */
	uint64_t	protocol_features;
	/* Ring layout handed to the backend via SET_VRING_ADDR/NUM. */
	struct vring	vrings[SPDK_VIRTIO_MAX_VIRTQUEUES];
	/* Mem map whose notify callback resends the full memory table. */
	struct spdk_mem_map *mem_map;
};
38 
/* Send one vhost-user message of @len bytes on @fd, optionally attaching
 * @fd_num file descriptors as SCM_RIGHTS ancillary data.
 * Returns 0 on success or a negative errno-style code.
 */
static int
vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
{
	size_t anc_len = fd_num * sizeof(int);
	char anc_buf[CMSG_SPACE(anc_len)];
	struct iovec payload;
	struct msghdr hdr;
	struct cmsghdr *chdr;
	int sent;

	memset(&hdr, 0, sizeof(hdr));
	memset(anc_buf, 0, sizeof(anc_buf));

	payload.iov_base = (uint8_t *)buf;
	payload.iov_len = len;
	hdr.msg_iov = &payload;
	hdr.msg_iovlen = 1;

	if (fds != NULL && fd_num > 0) {
		hdr.msg_control = anc_buf;
		hdr.msg_controllen = sizeof(anc_buf);
		chdr = CMSG_FIRSTHDR(&hdr);
		if (chdr == NULL) {
			SPDK_WARNLOG("First HDR is NULL\n");
			return -EIO;
		}
		chdr->cmsg_len = CMSG_LEN(anc_len);
		chdr->cmsg_level = SOL_SOCKET;
		chdr->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(chdr), fds, anc_len);
	}
	/* When no fds are attached, msg_control/msg_controllen stay zeroed
	 * from the memset above.
	 */

	/* Retry the send if interrupted by a signal. */
	do {
		sent = sendmsg(fd, &hdr, 0);
	} while (sent < 0 && errno == EINTR);

	return sent == -1 ? -errno : 0;
}
85 
/* Receive one vhost-user reply: a fixed-size header followed by an optional
 * payload of msg->size bytes.  Returns 0 on success or a negative
 * errno-style code.
 */
static int
vhost_user_read(int fd, struct vhost_user_msg *msg)
{
	uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
	ssize_t ret;
	size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;

	ret = recv(fd, (void *)msg, sz_hdr, 0);
	if ((size_t)ret != sz_hdr) {
		/* Short read (including EOF, ret == 0) is reported as -EBUSY
		 * to distinguish it from a socket error.
		 */
		SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n",
			     ret, sz_hdr);
		if (ret == -1) {
			return -errno;
		} else {
			return -EBUSY;
		}
	}

	/* validate msg flags: a reply must carry exactly the version and REPLY bit */
	if (msg->flags != (valid_flags)) {
		SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n",
			     msg->flags, valid_flags);
		return -EIO;
	}

	sz_payload = msg->size;

	/* msg->size comes from the peer - bound it before reading the body */
	if (sz_payload > VHOST_USER_PAYLOAD_SIZE) {
		SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n",
			     sz_payload, VHOST_USER_PAYLOAD_SIZE);
		return -EIO;
	}

	if (sz_payload) {
		ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
		if ((size_t)ret != sz_payload) {
			SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n",
				     ret, msg->size);
			if (ret == -1) {
				return -errno;
			} else {
				return -EBUSY;
			}
		}
	}

	return 0;
}
134 
/* One contiguous hugepage-backed mapping found in /proc/self/maps. */
struct hugepage_file_info {
	uint64_t addr;            /**< virtual addr */
	size_t   size;            /**< the file size */
	char     path[PATH_MAX];  /**< path to backing file */
};
140 
141 /* Two possible options:
142  * 1. Match HUGEPAGE_INFO_FMT to find the file storing struct hugepage_file
143  * array. This is simple but cannot be used in secondary process because
144  * secondary process will close and munmap that file.
145  * 2. Match HUGEFILE_FMT to find hugepage files directly.
146  *
147  * We choose option 2.
148  */
149 static int
150 get_hugepage_file_info(struct hugepage_file_info hugepages[], int max)
151 {
152 	int idx, rc;
153 	FILE *f;
154 	char buf[BUFSIZ], *tmp, *tail;
155 	char *str_underline, *str_start;
156 	int huge_index;
157 	uint64_t v_start, v_end;
158 
159 	f = fopen("/proc/self/maps", "r");
160 	if (!f) {
161 		SPDK_ERRLOG("cannot open /proc/self/maps\n");
162 		rc = -errno;
163 		assert(rc < 0); /* scan-build hack */
164 		return rc;
165 	}
166 
167 	idx = 0;
168 	while (fgets(buf, sizeof(buf), f) != NULL) {
169 		if (sscanf(buf, "%" PRIx64 "-%" PRIx64, &v_start, &v_end) < 2) {
170 			SPDK_ERRLOG("Failed to parse address\n");
171 			rc = -EIO;
172 			goto out;
173 		}
174 
175 		tmp = strchr(buf, ' ') + 1; /** skip address */
176 		tmp = strchr(tmp, ' ') + 1; /** skip perm */
177 		tmp = strchr(tmp, ' ') + 1; /** skip offset */
178 		tmp = strchr(tmp, ' ') + 1; /** skip dev */
179 		tmp = strchr(tmp, ' ') + 1; /** skip inode */
180 		while (*tmp == ' ') {       /** skip spaces */
181 			tmp++;
182 		}
183 		tail = strrchr(tmp, '\n');  /** remove newline if exists */
184 		if (tail) {
185 			*tail = '\0';
186 		}
187 
188 		/* Match HUGEFILE_FMT, aka "%s/%smap_%d",
189 		 * which is defined in eal_filesystem.h
190 		 */
191 		str_underline = strrchr(tmp, '_');
192 		if (!str_underline) {
193 			continue;
194 		}
195 
196 		str_start = str_underline - strlen("map");
197 		if (str_start < tmp) {
198 			continue;
199 		}
200 
201 		if (sscanf(str_start, "map_%d", &huge_index) != 1) {
202 			continue;
203 		}
204 
205 		if (idx >= max) {
206 			SPDK_ERRLOG("Exceed maximum of %d\n", max);
207 			rc = -ENOSPC;
208 			goto out;
209 		}
210 
211 		if (idx > 0 &&
212 		    strncmp(tmp, hugepages[idx - 1].path, PATH_MAX) == 0 &&
213 		    v_start == hugepages[idx - 1].addr + hugepages[idx - 1].size) {
214 			hugepages[idx - 1].size += (v_end - v_start);
215 			continue;
216 		}
217 
218 		hugepages[idx].addr = v_start;
219 		hugepages[idx].size = v_end - v_start;
220 		snprintf(hugepages[idx].path, PATH_MAX, "%s", tmp);
221 		idx++;
222 	}
223 
224 	rc = idx;
225 out:
226 	fclose(f);
227 	return rc;
228 }
229 
230 static int
231 prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
232 {
233 	int i, num;
234 	struct hugepage_file_info hugepages[VHOST_USER_MEMORY_MAX_NREGIONS];
235 
236 	num = get_hugepage_file_info(hugepages, VHOST_USER_MEMORY_MAX_NREGIONS);
237 	if (num < 0) {
238 		SPDK_ERRLOG("Failed to prepare memory for vhost-user\n");
239 		return num;
240 	}
241 
242 	for (i = 0; i < num; ++i) {
243 		/* the memory regions are unaligned */
244 		msg->payload.memory.regions[i].guest_phys_addr = hugepages[i].addr; /* use vaddr! */
245 		msg->payload.memory.regions[i].userspace_addr = hugepages[i].addr;
246 		msg->payload.memory.regions[i].memory_size = hugepages[i].size;
247 		msg->payload.memory.regions[i].flags_padding = 0;
248 		fds[i] = open(hugepages[i].path, O_RDWR);
249 	}
250 
251 	msg->payload.memory.nregions = num;
252 	msg->payload.memory.padding = 0;
253 
254 	return 0;
255 }
256 
/* Human-readable request names, indexed by enum vhost_user_request; used
 * only for debug and error logging in vhost_user_sock().
 */
static const char *const vhost_msg_strings[VHOST_USER_MAX] = {
	[VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER",
	[VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES",
	[VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
	[VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
};
276 
277 static int
278 vhost_user_sock(struct virtio_user_dev *dev,
279 		enum vhost_user_request req,
280 		void *arg)
281 {
282 	struct vhost_user_msg msg;
283 	struct vhost_vring_file *file = 0;
284 	int need_reply = 0;
285 	int fds[VHOST_USER_MEMORY_MAX_NREGIONS];
286 	int fd_num = 0;
287 	int i, len, rc;
288 	int vhostfd = dev->vhostfd;
289 
290 	SPDK_DEBUGLOG(virtio_user, "sent message %d = %s\n", req, vhost_msg_strings[req]);
291 
292 	msg.request = req;
293 	msg.flags = VHOST_USER_VERSION;
294 	msg.size = 0;
295 
296 	switch (req) {
297 	case VHOST_USER_GET_FEATURES:
298 	case VHOST_USER_GET_PROTOCOL_FEATURES:
299 	case VHOST_USER_GET_QUEUE_NUM:
300 		need_reply = 1;
301 		break;
302 
303 	case VHOST_USER_SET_FEATURES:
304 	case VHOST_USER_SET_LOG_BASE:
305 	case VHOST_USER_SET_PROTOCOL_FEATURES:
306 		msg.payload.u64 = *((__u64 *)arg);
307 		msg.size = sizeof(msg.payload.u64);
308 		break;
309 
310 	case VHOST_USER_SET_OWNER:
311 	case VHOST_USER_RESET_OWNER:
312 		break;
313 
314 	case VHOST_USER_SET_MEM_TABLE:
315 		rc = prepare_vhost_memory_user(&msg, fds);
316 		if (rc < 0) {
317 			return rc;
318 		}
319 		fd_num = msg.payload.memory.nregions;
320 		msg.size = sizeof(msg.payload.memory.nregions);
321 		msg.size += sizeof(msg.payload.memory.padding);
322 		msg.size += fd_num * sizeof(struct vhost_memory_region);
323 		break;
324 
325 	case VHOST_USER_SET_LOG_FD:
326 		fds[fd_num++] = *((int *)arg);
327 		break;
328 
329 	case VHOST_USER_SET_VRING_NUM:
330 	case VHOST_USER_SET_VRING_BASE:
331 	case VHOST_USER_SET_VRING_ENABLE:
332 		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
333 		msg.size = sizeof(msg.payload.state);
334 		break;
335 
336 	case VHOST_USER_GET_VRING_BASE:
337 		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
338 		msg.size = sizeof(msg.payload.state);
339 		need_reply = 1;
340 		break;
341 
342 	case VHOST_USER_SET_VRING_ADDR:
343 		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
344 		msg.size = sizeof(msg.payload.addr);
345 		break;
346 
347 	case VHOST_USER_SET_VRING_KICK:
348 	case VHOST_USER_SET_VRING_CALL:
349 	case VHOST_USER_SET_VRING_ERR:
350 		file = arg;
351 		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
352 		msg.size = sizeof(msg.payload.u64);
353 		if (file->fd > 0) {
354 			fds[fd_num++] = file->fd;
355 		} else {
356 			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
357 		}
358 		break;
359 
360 	case VHOST_USER_GET_CONFIG:
361 		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
362 		msg.size = sizeof(msg.payload.cfg);
363 		need_reply = 1;
364 		break;
365 
366 	case VHOST_USER_SET_CONFIG:
367 		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
368 		msg.size = sizeof(msg.payload.cfg);
369 		break;
370 
371 	default:
372 		SPDK_ERRLOG("trying to send unknown msg\n");
373 		return -EINVAL;
374 	}
375 
376 	len = VHOST_USER_HDR_SIZE + msg.size;
377 	rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num);
378 	if (rc < 0) {
379 		SPDK_ERRLOG("%s failed: %s\n",
380 			    vhost_msg_strings[req], spdk_strerror(-rc));
381 		return rc;
382 	}
383 
384 	if (req == VHOST_USER_SET_MEM_TABLE)
385 		for (i = 0; i < fd_num; ++i) {
386 			close(fds[i]);
387 		}
388 
389 	if (need_reply) {
390 		rc = vhost_user_read(vhostfd, &msg);
391 		if (rc < 0) {
392 			SPDK_WARNLOG("Received msg failed: %s\n", spdk_strerror(-rc));
393 			return rc;
394 		}
395 
396 		if (req != msg.request) {
397 			SPDK_WARNLOG("Received unexpected msg type\n");
398 			return -EIO;
399 		}
400 
401 		switch (req) {
402 		case VHOST_USER_GET_FEATURES:
403 		case VHOST_USER_GET_PROTOCOL_FEATURES:
404 		case VHOST_USER_GET_QUEUE_NUM:
405 			if (msg.size != sizeof(msg.payload.u64)) {
406 				SPDK_WARNLOG("Received bad msg size\n");
407 				return -EIO;
408 			}
409 			*((__u64 *)arg) = msg.payload.u64;
410 			break;
411 		case VHOST_USER_GET_VRING_BASE:
412 			if (msg.size != sizeof(msg.payload.state)) {
413 				SPDK_WARNLOG("Received bad msg size\n");
414 				return -EIO;
415 			}
416 			memcpy(arg, &msg.payload.state,
417 			       sizeof(struct vhost_vring_state));
418 			break;
419 		case VHOST_USER_GET_CONFIG:
420 			if (msg.size != sizeof(msg.payload.cfg)) {
421 				SPDK_WARNLOG("Received bad msg size\n");
422 				return -EIO;
423 			}
424 			memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg));
425 			break;
426 		default:
427 			SPDK_WARNLOG("Received unexpected msg type\n");
428 			return -EBADMSG;
429 		}
430 	}
431 
432 	return 0;
433 }
434 
435 /**
436  * Set up environment to talk with a vhost user backend.
437  *
438  * @return
439  *   - (-1) if fail;
440  *   - (0) if succeed.
441  */
442 static int
443 vhost_user_setup(struct virtio_user_dev *dev)
444 {
445 	int fd;
446 	int flag;
447 	struct sockaddr_un un;
448 	ssize_t rc;
449 
450 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
451 	if (fd < 0) {
452 		SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno));
453 		return -errno;
454 	}
455 
456 	flag = fcntl(fd, F_GETFD);
457 	if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) {
458 		SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno));
459 	}
460 
461 	memset(&un, 0, sizeof(un));
462 	un.sun_family = AF_UNIX;
463 	rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path);
464 	if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) {
465 		SPDK_ERRLOG("socket path too long\n");
466 		close(fd);
467 		if (rc < 0) {
468 			return -errno;
469 		} else {
470 			return -EINVAL;
471 		}
472 	}
473 	if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
474 		SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno));
475 		close(fd);
476 		return -errno;
477 	}
478 
479 	dev->vhostfd = fd;
480 	return 0;
481 }
482 
483 static int
484 virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel)
485 {
486 	struct virtio_user_dev *dev = vdev->ctx;
487 
488 	/* Of all per virtqueue MSGs, make sure VHOST_SET_VRING_CALL come
489 	 * firstly because vhost depends on this msg to allocate virtqueue
490 	 * pair.
491 	 */
492 	struct vhost_vring_file file;
493 
494 	file.index = queue_sel;
495 	file.fd = dev->callfds[queue_sel];
496 	return vhost_user_sock(dev, VHOST_USER_SET_VRING_CALL, &file);
497 }
498 
499 static int
500 virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel)
501 {
502 	struct virtio_user_dev *dev = vdev->ctx;
503 	struct vring *vring = &dev->vrings[queue_sel];
504 	struct vhost_vring_addr addr = {
505 		.index = queue_sel,
506 		.desc_user_addr = (uint64_t)(uintptr_t)vring->desc,
507 		.avail_user_addr = (uint64_t)(uintptr_t)vring->avail,
508 		.used_user_addr = (uint64_t)(uintptr_t)vring->used,
509 		.log_guest_addr = 0,
510 		.flags = 0, /* disable log */
511 	};
512 
513 	return vhost_user_sock(dev, VHOST_USER_SET_VRING_ADDR, &addr);
514 }
515 
516 static int
517 virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel)
518 {
519 	struct virtio_user_dev *dev = vdev->ctx;
520 	struct vhost_vring_file file;
521 	struct vhost_vring_state state;
522 	struct vring *vring = &dev->vrings[queue_sel];
523 	int rc;
524 
525 	state.index = queue_sel;
526 	state.num = vring->num;
527 	rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_NUM, &state);
528 	if (rc < 0) {
529 		return rc;
530 	}
531 
532 	state.index = queue_sel;
533 	state.num = 0; /* no reservation */
534 	rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_BASE, &state);
535 	if (rc < 0) {
536 		return rc;
537 	}
538 
539 	virtio_user_set_vring_addr(vdev, queue_sel);
540 
541 	/* Of all per virtqueue MSGs, make sure VHOST_USER_SET_VRING_KICK comes
542 	 * lastly because vhost depends on this msg to judge if
543 	 * virtio is ready.
544 	 */
545 	file.index = queue_sel;
546 	file.fd = dev->kickfds[queue_sel];
547 	return vhost_user_sock(dev, VHOST_USER_SET_VRING_KICK, &file);
548 }
549 
550 static int
551 virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
552 {
553 	struct virtio_user_dev *dev = vdev->ctx;
554 	struct vhost_vring_state state;
555 
556 	state.index = queue_sel;
557 	state.num = 0;
558 
559 	return vhost_user_sock(dev, VHOST_USER_GET_VRING_BASE, &state);
560 }
561 
562 static int
563 virtio_user_queue_setup(struct virtio_dev *vdev,
564 			int (*fn)(struct virtio_dev *, uint32_t))
565 {
566 	uint32_t i;
567 	int rc;
568 
569 	for (i = 0; i < vdev->max_queues; ++i) {
570 		rc = fn(vdev, i);
571 		if (rc < 0) {
572 			SPDK_ERRLOG("setup tx vq fails: %"PRIu32".\n", i);
573 			return rc;
574 		}
575 	}
576 
577 	return 0;
578 }
579 
/* spdk_mem_map notify callback: resends the complete memory table to the
 * vhost-user backend.  The action/vaddr/size arguments are intentionally
 * unused - the whole table is always resent.
 * Returns 0 on success or a negative errno-style code.
 */
static int
virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
		       enum spdk_mem_map_notify_action action,
		       void *vaddr, size_t size)
{
	struct virtio_dev *vdev = cb_ctx;
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t features;
	int ret;

	/* We do not support dynamic memory allocation with virtio-user.  If this is the
	 * initial notification when the device is started, dev->mem_map will be NULL.  If
	 * this is the final notification when the device is stopped, dev->is_stopping will
	 * be true.  All other cases are unsupported.
	 */
	if (dev->mem_map != NULL && !dev->is_stopping) {
		assert(false);
		SPDK_ERRLOG("Memory map change with active virtio_user_devs not allowed.\n");
		SPDK_ERRLOG("Pre-allocate memory for application using -s (mem_size) option.\n");
		return -1;
	}

	/* We have to resend all mappings anyway, so don't bother with any
	 * page tracking.
	 */
	ret = vhost_user_sock(dev, VHOST_USER_SET_MEM_TABLE, NULL);
	if (ret < 0) {
		return ret;
	}

	/* Since we might want to use that mapping straight away, we have to
	 * make sure the guest has already processed our SET_MEM_TABLE message.
	 * F_REPLY_ACK is just a feature and the host is not obliged to
	 * support it, so we send a simple message that always has a response
	 * and we wait for that response. Messages are always processed in order.
	 */
	return vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
}
618 
619 static int
620 virtio_user_register_mem(struct virtio_dev *vdev)
621 {
622 	struct virtio_user_dev *dev = vdev->ctx;
623 	const struct spdk_mem_map_ops virtio_user_map_ops = {
624 		.notify_cb = virtio_user_map_notify,
625 		.are_contiguous = NULL
626 	};
627 
628 	dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
629 	if (dev->mem_map == NULL) {
630 		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
631 		return -1;
632 	}
633 
634 	return 0;
635 }
636 
637 static void
638 virtio_user_unregister_mem(struct virtio_dev *vdev)
639 {
640 	struct virtio_user_dev *dev = vdev->ctx;
641 
642 	dev->is_stopping = true;
643 	spdk_mem_map_free(&dev->mem_map);
644 }
645 
646 static int
647 virtio_user_start_device(struct virtio_dev *vdev)
648 {
649 	struct virtio_user_dev *dev = vdev->ctx;
650 	uint64_t host_max_queues;
651 	int ret;
652 
653 	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
654 	    vdev->max_queues > 1 + vdev->fixed_queues_num) {
655 		SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
656 			     "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
657 			     "Only one request queue will be used.\n",
658 			     vdev->name, vdev->max_queues - vdev->fixed_queues_num);
659 		vdev->max_queues = 1 + vdev->fixed_queues_num;
660 	}
661 
662 	/* negotiate the number of I/O queues. */
663 	ret = vhost_user_sock(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
664 	if (ret < 0) {
665 		return ret;
666 	}
667 
668 	if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
669 		SPDK_WARNLOG("%s: requested %"PRIu16" request queues"
670 			     "but only %"PRIu64" available\n",
671 			     vdev->name, vdev->max_queues - vdev->fixed_queues_num,
672 			     host_max_queues);
673 		vdev->max_queues = host_max_queues;
674 	}
675 
676 	/* tell vhost to create queues */
677 	ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
678 	if (ret < 0) {
679 		return ret;
680 	}
681 
682 	ret = virtio_user_register_mem(vdev);
683 	if (ret < 0) {
684 		return ret;
685 	}
686 
687 	return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
688 }
689 
690 static int
691 virtio_user_stop_device(struct virtio_dev *vdev)
692 {
693 	int ret;
694 
695 	ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
696 	/* a queue might fail to stop for various reasons, e.g. socket
697 	 * connection going down, but this mustn't prevent us from freeing
698 	 * the mem map.
699 	 */
700 	virtio_user_unregister_mem(vdev);
701 	return ret;
702 }
703 
704 static int
705 virtio_user_dev_setup(struct virtio_dev *vdev)
706 {
707 	struct virtio_user_dev *dev = vdev->ctx;
708 	uint16_t i;
709 
710 	dev->vhostfd = -1;
711 
712 	for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
713 		dev->callfds[i] = -1;
714 		dev->kickfds[i] = -1;
715 	}
716 
717 	return vhost_user_setup(dev);
718 }
719 
720 static int
721 virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
722 			    void *dst, int length)
723 {
724 	struct virtio_user_dev *dev = vdev->ctx;
725 	struct vhost_user_config cfg = {0};
726 	int rc;
727 
728 	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
729 		return -ENOTSUP;
730 	}
731 
732 	cfg.offset = 0;
733 	cfg.size = VHOST_USER_MAX_CONFIG_SIZE;
734 
735 	rc = vhost_user_sock(dev, VHOST_USER_GET_CONFIG, &cfg);
736 	if (rc < 0) {
737 		SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
738 		return rc;
739 	}
740 
741 	memcpy(dst, cfg.region + offset, length);
742 	return 0;
743 }
744 
745 static int
746 virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
747 			     const void *src, int length)
748 {
749 	struct virtio_user_dev *dev = vdev->ctx;
750 	struct vhost_user_config cfg = {0};
751 	int rc;
752 
753 	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
754 		return -ENOTSUP;
755 	}
756 
757 	cfg.offset = offset;
758 	cfg.size = length;
759 	memcpy(cfg.region, src, length);
760 
761 	rc = vhost_user_sock(dev, VHOST_USER_SET_CONFIG, &cfg);
762 	if (rc < 0) {
763 		SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
764 		return rc;
765 	}
766 
767 	return 0;
768 }
769 
/* Handle a driver status-byte write.  DRIVER_OK starts the device; a RESET
 * write while DRIVER_OK is set stops it.  Once NEEDS_RESET has been
 * latched, every write except RESET is refused.
 */
static void
virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
{
	struct virtio_user_dev *dev = vdev->ctx;
	int rc = 0;

	if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
	    status != VIRTIO_CONFIG_S_RESET) {
		/* device must be reset first - reject any other transition */
		rc = -1;
	} else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
		rc = virtio_user_start_device(vdev);
	} else if (status == VIRTIO_CONFIG_S_RESET &&
		   (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
		rc = virtio_user_stop_device(vdev);
	}

	if (rc != 0) {
		/* latch the failure; cleared only when a later write succeeds */
		dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
	} else {
		dev->status = status;
	}
}
792 
793 static uint8_t
794 virtio_user_get_status(struct virtio_dev *vdev)
795 {
796 	struct virtio_user_dev *dev = vdev->ctx;
797 
798 	return dev->status;
799 }
800 
801 static uint64_t
802 virtio_user_get_features(struct virtio_dev *vdev)
803 {
804 	struct virtio_user_dev *dev = vdev->ctx;
805 	uint64_t features;
806 	int rc;
807 
808 	rc = vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
809 	if (rc < 0) {
810 		SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
811 		return 0;
812 	}
813 
814 	return features;
815 }
816 
817 static int
818 virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
819 {
820 	struct virtio_user_dev *dev = vdev->ctx;
821 	uint64_t protocol_features;
822 	int ret;
823 
824 	ret = vhost_user_sock(dev, VHOST_USER_SET_FEATURES, &features);
825 	if (ret < 0) {
826 		return ret;
827 	}
828 
829 	vdev->negotiated_features = features;
830 	vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);
831 
832 	if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
833 		/* nothing else to do */
834 		return 0;
835 	}
836 
837 	ret = vhost_user_sock(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
838 	if (ret < 0) {
839 		return ret;
840 	}
841 
842 	protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
843 	ret = vhost_user_sock(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
844 	if (ret < 0) {
845 		return ret;
846 	}
847 
848 	dev->protocol_features = protocol_features;
849 	return 0;
850 }
851 
852 static uint16_t
853 virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
854 {
855 	struct virtio_user_dev *dev = vdev->ctx;
856 
857 	/* Currently each queue has same queue size */
858 	return dev->queue_size;
859 }
860 
861 static int
862 virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
863 {
864 	struct virtio_user_dev *dev = vdev->ctx;
865 	struct vhost_vring_state state;
866 	uint16_t queue_idx = vq->vq_queue_index;
867 	void *queue_mem;
868 	uint64_t desc_addr, avail_addr, used_addr;
869 	int callfd, kickfd, rc;
870 
871 	if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
872 		SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
873 		return -EEXIST;
874 	}
875 
876 	/* May use invalid flag, but some backend uses kickfd and
877 	 * callfd as criteria to judge if dev is alive. so finally we
878 	 * use real event_fd.
879 	 */
880 	callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
881 	if (callfd < 0) {
882 		SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno));
883 		return -errno;
884 	}
885 
886 	kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
887 	if (kickfd < 0) {
888 		SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno));
889 		close(callfd);
890 		return -errno;
891 	}
892 
893 	queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
894 				 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
895 	if (queue_mem == NULL) {
896 		close(kickfd);
897 		close(callfd);
898 		return -ENOMEM;
899 	}
900 
901 	vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
902 	vq->vq_ring_virt_mem = queue_mem;
903 
904 	state.index = vq->vq_queue_index;
905 	state.num = 1;
906 
907 	if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
908 		rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_ENABLE, &state);
909 		if (rc < 0) {
910 			SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
911 				    spdk_strerror(-rc));
912 			close(kickfd);
913 			close(callfd);
914 			spdk_free(queue_mem);
915 			return -rc;
916 		}
917 	}
918 
919 	dev->callfds[queue_idx] = callfd;
920 	dev->kickfds[queue_idx] = kickfd;
921 
922 	desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
923 	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
924 	used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
925 				    ring[vq->vq_nentries]),
926 				    VIRTIO_PCI_VRING_ALIGN);
927 
928 	dev->vrings[queue_idx].num = vq->vq_nentries;
929 	dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
930 	dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
931 	dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;
932 
933 	return 0;
934 }
935 
936 static void
937 virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
938 {
939 	/* For legacy devices, write 0 to VIRTIO_PCI_QUEUE_PFN port, QEMU
940 	 * correspondingly stops the ioeventfds, and reset the status of
941 	 * the device.
942 	 * For modern devices, set queue desc, avail, used in PCI bar to 0,
943 	 * not see any more behavior in QEMU.
944 	 *
945 	 * Here we just care about what information to deliver to vhost-user.
946 	 * So we just close ioeventfd for now.
947 	 */
948 	struct virtio_user_dev *dev = vdev->ctx;
949 
950 	close(dev->callfds[vq->vq_queue_index]);
951 	close(dev->kickfds[vq->vq_queue_index]);
952 	dev->callfds[vq->vq_queue_index] = -1;
953 	dev->kickfds[vq->vq_queue_index] = -1;
954 
955 	spdk_free(vq->vq_ring_virt_mem);
956 }
957 
958 static void
959 virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
960 {
961 	uint64_t buf = 1;
962 	struct virtio_user_dev *dev = vdev->ctx;
963 
964 	if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
965 		SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
966 	}
967 }
968 
969 static void
970 virtio_user_destroy(struct virtio_dev *vdev)
971 {
972 	struct virtio_user_dev *dev = vdev->ctx;
973 
974 	if (dev) {
975 		close(dev->vhostfd);
976 		free(dev);
977 	}
978 }
979 
980 static void
981 virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
982 {
983 	struct virtio_user_dev *dev = vdev->ctx;
984 
985 	spdk_json_write_named_string(w, "type", "user");
986 	spdk_json_write_named_string(w, "socket", dev->path);
987 }
988 
989 static void
990 virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
991 {
992 	struct virtio_user_dev *dev = vdev->ctx;
993 
994 	spdk_json_write_named_string(w, "trtype", "user");
995 	spdk_json_write_named_string(w, "traddr", dev->path);
996 	spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
997 	spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
998 }
999 
/* vhost-user backend ops table, passed to virtio_dev_construct(). */
static const struct virtio_dev_ops virtio_user_ops = {
	.read_dev_cfg	= virtio_user_read_dev_config,
	.write_dev_cfg	= virtio_user_write_dev_config,
	.get_status	= virtio_user_get_status,
	.set_status	= virtio_user_set_status,
	.get_features	= virtio_user_get_features,
	.set_features	= virtio_user_set_features,
	.destruct_dev	= virtio_user_destroy,
	.get_queue_size	= virtio_user_get_queue_size,
	.setup_queue	= virtio_user_setup_queue,
	.del_queue	= virtio_user_del_queue,
	.notify_queue	= virtio_user_notify_queue,
	.dump_json_info = virtio_user_dump_json_info,
	.write_json_config = virtio_user_write_json_config,
};
1015 
1016 int
1017 virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
1018 		     uint32_t queue_size)
1019 {
1020 	struct virtio_user_dev *dev;
1021 	int rc;
1022 
1023 	if (name == NULL) {
1024 		SPDK_ERRLOG("No name gived for controller: %s\n", path);
1025 		return -EINVAL;
1026 	}
1027 
1028 	dev = calloc(1, sizeof(*dev));
1029 	if (dev == NULL) {
1030 		return -ENOMEM;
1031 	}
1032 
1033 	rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
1034 	if (rc != 0) {
1035 		SPDK_ERRLOG("Failed to init device: %s\n", path);
1036 		free(dev);
1037 		return rc;
1038 	}
1039 
1040 	vdev->is_hw = 0;
1041 
1042 	snprintf(dev->path, PATH_MAX, "%s", path);
1043 	dev->queue_size = queue_size;
1044 
1045 	rc = virtio_user_dev_setup(vdev);
1046 	if (rc < 0) {
1047 		SPDK_ERRLOG("backend set up fails\n");
1048 		goto err;
1049 	}
1050 
1051 	rc = vhost_user_sock(dev, VHOST_USER_SET_OWNER, NULL);
1052 	if (rc < 0) {
1053 		SPDK_ERRLOG("set_owner fails: %s\n", spdk_strerror(-rc));
1054 		goto err;
1055 	}
1056 
1057 	return 0;
1058 
1059 err:
1060 	virtio_dev_destruct(vdev);
1061 	return rc;
1062 }
1063 SPDK_LOG_REGISTER_COMPONENT(virtio_user)
1064