/*   SPDX-License-Identifier: BSD-3-Clause
 *   Copyright (C) 2010-2016 Intel Corporation. All rights reserved.
 *   All rights reserved.
 */

#include "spdk/stdinc.h"

#include <sys/eventfd.h>

#include "spdk/string.h"
#include "spdk/config.h"
#include "spdk/util.h"

#include "spdk_internal/virtio.h"
#include "spdk_internal/vhost_user.h"

/* The version of the protocol we support */
#define VHOST_USER_VERSION    0x1

#define VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES \
	((1ULL << VHOST_USER_PROTOCOL_F_MQ) | \
	(1ULL << VHOST_USER_PROTOCOL_F_CONFIG))
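
/* Of the two protocol features we advertise: F_MQ lets us query how many
 * queues the backend supports via VHOST_USER_GET_QUEUE_NUM, and F_CONFIG
 * enables the VHOST_USER_GET_CONFIG/SET_CONFIG messages used to access the
 * virtio device config space over the socket.
 */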

struct virtio_user_dev {
	int		vhostfd;

	int		callfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
	int		kickfds[SPDK_VIRTIO_MAX_VIRTQUEUES];
	uint32_t	queue_size;

	uint8_t		status;
	bool		is_stopping;
	char		path[PATH_MAX];
	uint64_t	protocol_features;
	struct vring	vrings[SPDK_VIRTIO_MAX_VIRTQUEUES];
	struct spdk_mem_map *mem_map;
};

static int
vhost_user_write(int fd, void *buf, int len, int *fds, int fd_num)
{
	int r;
	struct msghdr msgh;
	struct iovec iov;
	size_t fd_size = fd_num * sizeof(int);
	char control[CMSG_SPACE(fd_size)];
	struct cmsghdr *cmsg;

	memset(&msgh, 0, sizeof(msgh));
	memset(control, 0, sizeof(control));

	iov.iov_base = (uint8_t *)buf;
	iov.iov_len = len;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;

	if (fds && fd_num > 0) {
		msgh.msg_control = control;
		msgh.msg_controllen = sizeof(control);
		cmsg = CMSG_FIRSTHDR(&msgh);
		if (!cmsg) {
			SPDK_WARNLOG("CMSG_FIRSTHDR() returned NULL\n");
			return -EIO;
		}
		cmsg->cmsg_len = CMSG_LEN(fd_size);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), fds, fd_size);
	} else {
		msgh.msg_control = NULL;
		msgh.msg_controllen = 0;
	}

	do {
		r = sendmsg(fd, &msgh, 0);
	} while (r < 0 && errno == EINTR);

	if (r == -1) {
		return -errno;
	}

	return 0;
}
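
/* File descriptors cross the UNIX socket as SCM_RIGHTS ancillary data: the
 * payload travels in the iovec while the fds ride in the cmsg buffer and are
 * duplicated into the receiving process by the kernel. A minimal sketch of
 * how a caller might pass a single eventfd (vhostfd/kickfd are hypothetical):
 *
 *	struct vhost_user_msg msg = {};
 *	int kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
 *
 *	msg.request = VHOST_USER_SET_VRING_KICK;
 *	msg.flags = VHOST_USER_VERSION;
 *	msg.payload.u64 = 0;	// vring index 0; the fd goes via SCM_RIGHTS
 *	msg.size = sizeof(msg.payload.u64);
 *	vhost_user_write(vhostfd, &msg, VHOST_USER_HDR_SIZE + msg.size,
 *			 &kickfd, 1);
 */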

static int
vhost_user_read(int fd, struct vhost_user_msg *msg)
{
	uint32_t valid_flags = VHOST_USER_REPLY_MASK | VHOST_USER_VERSION;
	ssize_t ret;
	size_t sz_hdr = VHOST_USER_HDR_SIZE, sz_payload;

	ret = recv(fd, (void *)msg, sz_hdr, 0);
	if ((size_t)ret != sz_hdr) {
		SPDK_WARNLOG("Failed to recv msg hdr: %zd instead of %zu.\n",
			     ret, sz_hdr);
		if (ret == -1) {
			return -errno;
		} else {
			return -EBUSY;
		}
	}

	/* validate msg flags */
	if (msg->flags != valid_flags) {
		SPDK_WARNLOG("Failed to recv msg: flags %"PRIx32" instead of %"PRIx32".\n",
			     msg->flags, valid_flags);
		return -EIO;
	}

	sz_payload = msg->size;

	if (sz_payload > VHOST_USER_PAYLOAD_SIZE) {
		SPDK_WARNLOG("Received oversized msg: payload size %zu > available space %zu\n",
			     sz_payload, VHOST_USER_PAYLOAD_SIZE);
		return -EIO;
	}

	if (sz_payload) {
		ret = recv(fd, (void *)((char *)msg + sz_hdr), sz_payload, 0);
		if ((size_t)ret != sz_payload) {
			SPDK_WARNLOG("Failed to recv msg payload: %zd instead of %"PRIu32".\n",
				     ret, msg->size);
			if (ret == -1) {
				return -errno;
			} else {
				return -EBUSY;
			}
		}
	}

	return 0;
}
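
/* On the wire every vhost-user message starts with a fixed header (request,
 * flags, payload size -- VHOST_USER_HDR_SIZE bytes) followed by an optional
 * payload of msg->size bytes. The low bits of the flags field carry the
 * protocol version, and the backend sets VHOST_USER_REPLY_MASK in replies,
 * which is why a valid reply must carry exactly
 * VHOST_USER_REPLY_MASK | VHOST_USER_VERSION here.
 */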

struct hugepage_file_info {
	uint64_t addr;            /**< virtual addr */
	size_t   size;            /**< the file size */
	char     path[PATH_MAX];  /**< path to backing file */
};

/* Two possible options:
 * 1. Match HUGEPAGE_INFO_FMT to find the file storing the struct hugepage_file
 * array. This is simple but cannot be used in a secondary process, because
 * the secondary process will close and munmap that file.
 * 2. Match HUGEFILE_FMT to find the hugepage files directly.
 *
 * We choose option 2.
 */
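
/* For reference, a /proc/self/maps line this parser is meant to match looks
 * roughly like the following (hypothetical example; HUGEFILE_FMT is
 * "%s/%smap_%d"):
 *
 *	7f5c40000000-7f5c80000000 rw-s 00000000 00:2d 12345 /dev/hugepages/spdk0map_0
 */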
static int
get_hugepage_file_info(struct hugepage_file_info hugepages[], int max)
{
	int idx, rc;
	FILE *f;
	char buf[BUFSIZ], *tmp, *tail;
	char *str_underline, *str_start;
	int huge_index;
	uint64_t v_start, v_end;

	f = fopen("/proc/self/maps", "r");
	if (!f) {
		SPDK_ERRLOG("cannot open /proc/self/maps\n");
		rc = -errno;
		assert(rc < 0); /* scan-build hack */
		return rc;
	}

	idx = 0;
	while (fgets(buf, sizeof(buf), f) != NULL) {
		if (sscanf(buf, "%" SCNx64 "-%" SCNx64, &v_start, &v_end) < 2) {
			SPDK_ERRLOG("Failed to parse address\n");
			rc = -EIO;
			goto out;
		}

		tmp = strchr(buf, ' ') + 1; /* skip address */
		tmp = strchr(tmp, ' ') + 1; /* skip perm */
		tmp = strchr(tmp, ' ') + 1; /* skip offset */
		tmp = strchr(tmp, ' ') + 1; /* skip dev */
		tmp = strchr(tmp, ' ') + 1; /* skip inode */
		while (*tmp == ' ') {       /* skip spaces */
			tmp++;
		}
		tail = strrchr(tmp, '\n');  /* remove newline if it exists */
		if (tail) {
			*tail = '\0';
		}

		/* Match HUGEFILE_FMT, aka "%s/%smap_%d",
		 * which is defined in eal_filesystem.h
		 */
		str_underline = strrchr(tmp, '_');
		if (!str_underline) {
			continue;
		}

		str_start = str_underline - strlen("map");
		if (str_start < tmp) {
			continue;
		}

		if (sscanf(str_start, "map_%d", &huge_index) != 1) {
			continue;
		}

		if (idx >= max) {
			SPDK_ERRLOG("Exceeded maximum number of regions (%d)\n", max);
			rc = -ENOSPC;
			goto out;
		}

		if (idx > 0 &&
		    strncmp(tmp, hugepages[idx - 1].path, PATH_MAX) == 0 &&
		    v_start == hugepages[idx - 1].addr + hugepages[idx - 1].size) {
			hugepages[idx - 1].size += (v_end - v_start);
			continue;
		}

		hugepages[idx].addr = v_start;
		hugepages[idx].size = v_end - v_start;
		snprintf(hugepages[idx].path, PATH_MAX, "%s", tmp);
		idx++;
	}

	rc = idx;
out:
	fclose(f);
	return rc;
}

static int
prepare_vhost_memory_user(struct vhost_user_msg *msg, int fds[])
{
	int i, num, rc;
	struct hugepage_file_info hugepages[VHOST_USER_MEMORY_MAX_NREGIONS];

	num = get_hugepage_file_info(hugepages, VHOST_USER_MEMORY_MAX_NREGIONS);
	if (num < 0) {
		SPDK_ERRLOG("Failed to prepare memory for vhost-user\n");
		return num;
	}

	for (i = 0; i < num; ++i) {
		/* the memory regions are unaligned */
		msg->payload.memory.regions[i].guest_phys_addr = hugepages[i].addr; /* use vaddr! */
		msg->payload.memory.regions[i].userspace_addr = hugepages[i].addr;
		msg->payload.memory.regions[i].memory_size = hugepages[i].size;
		msg->payload.memory.regions[i].flags_padding = 0;
		fds[i] = open(hugepages[i].path, O_RDWR);
		if (fds[i] < 0) {
			rc = -errno;
			SPDK_ERRLOG("cannot open %s: %s\n", hugepages[i].path,
				    spdk_strerror(-rc));
			while (--i >= 0) {
				close(fds[i]);
			}
			return rc;
		}
	}

	msg->payload.memory.nregions = num;
	msg->payload.memory.padding = 0;

	return 0;
}
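
/* vhost-user has no separate guest physical address space here: we hand the
 * backend an identity mapping where guest_phys_addr == userspace_addr == our
 * vaddr, so any address we later place in a vring descriptor resolves
 * correctly once the backend mmap()s the same hugepage files.
 */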

static const char *const vhost_msg_strings[VHOST_USER_MAX] = {
	[VHOST_USER_SET_OWNER] = "VHOST_SET_OWNER",
	[VHOST_USER_RESET_OWNER] = "VHOST_RESET_OWNER",
	[VHOST_USER_SET_FEATURES] = "VHOST_SET_FEATURES",
	[VHOST_USER_GET_FEATURES] = "VHOST_GET_FEATURES",
	[VHOST_USER_SET_VRING_CALL] = "VHOST_SET_VRING_CALL",
	[VHOST_USER_GET_PROTOCOL_FEATURES] = "VHOST_USER_GET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_PROTOCOL_FEATURES] = "VHOST_USER_SET_PROTOCOL_FEATURES",
	[VHOST_USER_SET_VRING_NUM] = "VHOST_SET_VRING_NUM",
	[VHOST_USER_SET_VRING_BASE] = "VHOST_SET_VRING_BASE",
	[VHOST_USER_GET_VRING_BASE] = "VHOST_GET_VRING_BASE",
	[VHOST_USER_SET_VRING_ADDR] = "VHOST_SET_VRING_ADDR",
	[VHOST_USER_SET_VRING_KICK] = "VHOST_SET_VRING_KICK",
	[VHOST_USER_SET_MEM_TABLE] = "VHOST_SET_MEM_TABLE",
	[VHOST_USER_SET_VRING_ENABLE] = "VHOST_SET_VRING_ENABLE",
	[VHOST_USER_GET_QUEUE_NUM] = "VHOST_USER_GET_QUEUE_NUM",
	[VHOST_USER_GET_CONFIG] = "VHOST_USER_GET_CONFIG",
	[VHOST_USER_SET_CONFIG] = "VHOST_USER_SET_CONFIG",
};

static int
vhost_user_sock(struct virtio_user_dev *dev,
		enum vhost_user_request req,
		void *arg)
{
	struct vhost_user_msg msg;
	struct vhost_vring_file *file = NULL;
	int need_reply = 0;
	int fds[VHOST_USER_MEMORY_MAX_NREGIONS];
	int fd_num = 0;
	int i, len, rc;
	int vhostfd = dev->vhostfd;

	SPDK_DEBUGLOG(virtio_user, "sending message %d = %s\n", req, vhost_msg_strings[req]);

	msg.request = req;
	msg.flags = VHOST_USER_VERSION;
	msg.size = 0;

	switch (req) {
	case VHOST_USER_GET_FEATURES:
	case VHOST_USER_GET_PROTOCOL_FEATURES:
	case VHOST_USER_GET_QUEUE_NUM:
		need_reply = 1;
		break;

	case VHOST_USER_SET_FEATURES:
	case VHOST_USER_SET_LOG_BASE:
	case VHOST_USER_SET_PROTOCOL_FEATURES:
		msg.payload.u64 = *((__u64 *)arg);
		msg.size = sizeof(msg.payload.u64);
		break;

	case VHOST_USER_SET_OWNER:
	case VHOST_USER_RESET_OWNER:
		break;

	case VHOST_USER_SET_MEM_TABLE:
		rc = prepare_vhost_memory_user(&msg, fds);
		if (rc < 0) {
			return rc;
		}
		fd_num = msg.payload.memory.nregions;
		msg.size = sizeof(msg.payload.memory.nregions);
		msg.size += sizeof(msg.payload.memory.padding);
		msg.size += fd_num * sizeof(struct vhost_memory_region);
		break;

	case VHOST_USER_SET_LOG_FD:
		fds[fd_num++] = *((int *)arg);
		break;

	case VHOST_USER_SET_VRING_NUM:
	case VHOST_USER_SET_VRING_BASE:
	case VHOST_USER_SET_VRING_ENABLE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		break;

	case VHOST_USER_GET_VRING_BASE:
		memcpy(&msg.payload.state, arg, sizeof(msg.payload.state));
		msg.size = sizeof(msg.payload.state);
		need_reply = 1;
		break;

	case VHOST_USER_SET_VRING_ADDR:
		memcpy(&msg.payload.addr, arg, sizeof(msg.payload.addr));
		msg.size = sizeof(msg.payload.addr);
		break;

	case VHOST_USER_SET_VRING_KICK:
	case VHOST_USER_SET_VRING_CALL:
	case VHOST_USER_SET_VRING_ERR:
		file = arg;
		msg.payload.u64 = file->index & VHOST_USER_VRING_IDX_MASK;
		msg.size = sizeof(msg.payload.u64);
		if (file->fd > 0) {
			fds[fd_num++] = file->fd;
		} else {
			msg.payload.u64 |= VHOST_USER_VRING_NOFD_MASK;
		}
		break;

	case VHOST_USER_GET_CONFIG:
		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
		msg.size = sizeof(msg.payload.cfg);
		need_reply = 1;
		break;

	case VHOST_USER_SET_CONFIG:
		memcpy(&msg.payload.cfg, arg, sizeof(msg.payload.cfg));
		msg.size = sizeof(msg.payload.cfg);
		break;

	default:
		SPDK_ERRLOG("trying to send unknown msg\n");
		return -EINVAL;
	}

	len = VHOST_USER_HDR_SIZE + msg.size;
	rc = vhost_user_write(vhostfd, &msg, len, fds, fd_num);
	if (rc < 0) {
		SPDK_ERRLOG("%s failed: %s\n",
			    vhost_msg_strings[req], spdk_strerror(-rc));
		return rc;
	}

	if (req == VHOST_USER_SET_MEM_TABLE) {
		for (i = 0; i < fd_num; ++i) {
			close(fds[i]);
		}
	}

	if (need_reply) {
		rc = vhost_user_read(vhostfd, &msg);
		if (rc < 0) {
			SPDK_WARNLOG("Failed to recv reply: %s\n", spdk_strerror(-rc));
			return rc;
		}

		if (req != msg.request) {
			SPDK_WARNLOG("Received unexpected msg type\n");
			return -EIO;
		}

		switch (req) {
		case VHOST_USER_GET_FEATURES:
		case VHOST_USER_GET_PROTOCOL_FEATURES:
		case VHOST_USER_GET_QUEUE_NUM:
			if (msg.size != sizeof(msg.payload.u64)) {
				SPDK_WARNLOG("Received bad msg size\n");
				return -EIO;
			}
			*((__u64 *)arg) = msg.payload.u64;
			break;
		case VHOST_USER_GET_VRING_BASE:
			if (msg.size != sizeof(msg.payload.state)) {
				SPDK_WARNLOG("Received bad msg size\n");
				return -EIO;
			}
			memcpy(arg, &msg.payload.state,
			       sizeof(struct vhost_vring_state));
			break;
		case VHOST_USER_GET_CONFIG:
			if (msg.size != sizeof(msg.payload.cfg)) {
				SPDK_WARNLOG("Received bad msg size\n");
				return -EIO;
			}
			memcpy(arg, &msg.payload.cfg, sizeof(msg.payload.cfg));
			break;
		default:
			SPDK_WARNLOG("Received unexpected msg type\n");
			return -EBADMSG;
		}
	}

	return 0;
}
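
/* vhost_user_sock() is the single entry point for the request/reply protocol:
 * it marshals the payload for the given request type, sends it (plus any fds)
 * and, for GET-style requests, blocks for the reply. A minimal sketch of a
 * feature query through it:
 *
 *	uint64_t features;
 *	int rc = vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
 *	if (rc == 0 && (features & (1ULL << VHOST_USER_F_PROTOCOL_FEATURES))) {
 *		// protocol feature negotiation is possible
 *	}
 */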

/**
 * Set up environment to talk with a vhost user backend.
 *
 * @return
 *   - (< 0) negative errno on failure;
 *   - (0) on success.
 */
static int
vhost_user_setup(struct virtio_user_dev *dev)
{
	int fd;
	int flag;
	struct sockaddr_un un;
	ssize_t rc;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0) {
		SPDK_ERRLOG("socket() error, %s\n", spdk_strerror(errno));
		return -errno;
	}

	flag = fcntl(fd, F_GETFD);
	if (fcntl(fd, F_SETFD, flag | FD_CLOEXEC) < 0) {
		SPDK_ERRLOG("fcntl failed, %s\n", spdk_strerror(errno));
	}

	memset(&un, 0, sizeof(un));
	un.sun_family = AF_UNIX;
	rc = snprintf(un.sun_path, sizeof(un.sun_path), "%s", dev->path);
	if (rc < 0 || (size_t)rc >= sizeof(un.sun_path)) {
		SPDK_ERRLOG("socket path too long\n");
		close(fd);
		if (rc < 0) {
			return -errno;
		} else {
			return -EINVAL;
		}
	}
	if (connect(fd, (struct sockaddr *)&un, sizeof(un)) < 0) {
		SPDK_ERRLOG("connect error, %s\n", spdk_strerror(errno));
		close(fd);
		return -errno;
	}

	dev->vhostfd = fd;
	return 0;
}

static int
virtio_user_create_queue(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;

	/* Of all the per-virtqueue messages, make sure VHOST_USER_SET_VRING_CALL
	 * comes first, because vhost depends on this message to allocate the
	 * virtqueue pair.
	 */
	struct vhost_vring_file file;

	file.index = queue_sel;
	file.fd = dev->callfds[queue_sel];
	return vhost_user_sock(dev, VHOST_USER_SET_VRING_CALL, &file);
}

static int
virtio_user_set_vring_addr(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vring *vring = &dev->vrings[queue_sel];
	struct vhost_vring_addr addr = {
		.index = queue_sel,
		.desc_user_addr = (uint64_t)(uintptr_t)vring->desc,
		.avail_user_addr = (uint64_t)(uintptr_t)vring->avail,
		.used_user_addr = (uint64_t)(uintptr_t)vring->used,
		.log_guest_addr = 0,
		.flags = 0, /* disable log */
	};

	return vhost_user_sock(dev, VHOST_USER_SET_VRING_ADDR, &addr);
}

static int
virtio_user_kick_queue(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_vring_file file;
	struct vhost_vring_state state;
	struct vring *vring = &dev->vrings[queue_sel];
	int rc;

	state.index = queue_sel;
	state.num = vring->num;
	rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_NUM, &state);
	if (rc < 0) {
		return rc;
	}

	state.index = queue_sel;
	state.num = 0; /* no reservation */
	rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_BASE, &state);
	if (rc < 0) {
		return rc;
	}

	rc = virtio_user_set_vring_addr(vdev, queue_sel);
	if (rc < 0) {
		return rc;
	}

	/* Of all the per-virtqueue messages, make sure VHOST_USER_SET_VRING_KICK
	 * comes last, because vhost depends on this message to judge if
	 * virtio is ready.
	 */
	file.index = queue_sel;
	file.fd = dev->kickfds[queue_sel];
	return vhost_user_sock(dev, VHOST_USER_SET_VRING_KICK, &file);
}

static int
virtio_user_stop_queue(struct virtio_dev *vdev, uint32_t queue_sel)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_vring_state state;

	state.index = queue_sel;
	state.num = 0;

	return vhost_user_sock(dev, VHOST_USER_GET_VRING_BASE, &state);
}
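
/* VHOST_USER_GET_VRING_BASE doubles as the stop command in vhost-user: on
 * receiving it the backend must cease processing the ring and reply with the
 * ring's current base index, which is why "stopping" a queue here is just a
 * base query.
 */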

static int
virtio_user_queue_setup(struct virtio_dev *vdev,
			int (*fn)(struct virtio_dev *, uint32_t))
{
	uint32_t i;
	int rc;

	for (i = 0; i < vdev->max_queues; ++i) {
		rc = fn(vdev, i);
		if (rc < 0) {
			SPDK_ERRLOG("virtqueue %"PRIu32" setup failed.\n", i);
			return rc;
		}
	}

	return 0;
}

static int
virtio_user_map_notify(void *cb_ctx, struct spdk_mem_map *map,
		       enum spdk_mem_map_notify_action action,
		       void *vaddr, size_t size)
{
	struct virtio_dev *vdev = cb_ctx;
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t features;
	int ret;

	/* We do not support dynamic memory allocation with virtio-user.  If this is the
	 * initial notification when the device is started, dev->mem_map will be NULL.  If
	 * this is the final notification when the device is stopped, dev->is_stopping will
	 * be true.  All other cases are unsupported.
	 */
	if (dev->mem_map != NULL && !dev->is_stopping) {
		assert(false);
		SPDK_ERRLOG("Memory map change with active virtio_user_devs not allowed.\n");
		SPDK_ERRLOG("Pre-allocate memory for application using -s (mem_size) option.\n");
		return -1;
	}

	/* We have to resend all mappings anyway, so don't bother with any
	 * page tracking.
	 */
	ret = vhost_user_sock(dev, VHOST_USER_SET_MEM_TABLE, NULL);
	if (ret < 0) {
		return ret;
	}

	/* Since we might want to use that mapping straight away, we have to
	 * make sure the guest has already processed our SET_MEM_TABLE message.
	 * F_REPLY_ACK is just a feature and the host is not obliged to
	 * support it, so we send a simple message that always has a response
	 * and we wait for that response. Messages are always processed in order.
	 */
	return vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
}

static int
virtio_user_register_mem(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	const struct spdk_mem_map_ops virtio_user_map_ops = {
		.notify_cb = virtio_user_map_notify,
		.are_contiguous = NULL
	};

	dev->mem_map = spdk_mem_map_alloc(0, &virtio_user_map_ops, vdev);
	if (dev->mem_map == NULL) {
		SPDK_ERRLOG("spdk_mem_map_alloc() failed\n");
		return -1;
	}

	return 0;
}

static void
virtio_user_unregister_mem(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;

	dev->is_stopping = true;
	spdk_mem_map_free(&dev->mem_map);
}

static int
virtio_user_start_device(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t host_max_queues;
	int ret;

	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_MQ)) == 0 &&
	    vdev->max_queues > 1 + vdev->fixed_queues_num) {
		SPDK_WARNLOG("%s: requested %"PRIu16" request queues, but the "
			     "host doesn't support VHOST_USER_PROTOCOL_F_MQ. "
			     "Only one request queue will be used.\n",
			     vdev->name, vdev->max_queues - vdev->fixed_queues_num);
		vdev->max_queues = 1 + vdev->fixed_queues_num;
	}

	/* negotiate the number of I/O queues. */
	ret = vhost_user_sock(dev, VHOST_USER_GET_QUEUE_NUM, &host_max_queues);
	if (ret < 0) {
		return ret;
	}

	if (vdev->max_queues > host_max_queues + vdev->fixed_queues_num) {
		SPDK_WARNLOG("%s: requested %"PRIu16" request queues, "
			     "but only %"PRIu64" are available\n",
			     vdev->name, vdev->max_queues - vdev->fixed_queues_num,
			     host_max_queues);
		vdev->max_queues = host_max_queues;
	}

	/* tell vhost to create queues */
	ret = virtio_user_queue_setup(vdev, virtio_user_create_queue);
	if (ret < 0) {
		return ret;
	}

	ret = virtio_user_register_mem(vdev);
	if (ret < 0) {
		return ret;
	}

	return virtio_user_queue_setup(vdev, virtio_user_kick_queue);
}

static int
virtio_user_stop_device(struct virtio_dev *vdev)
{
	int ret;

	ret = virtio_user_queue_setup(vdev, virtio_user_stop_queue);
	/* a queue might fail to stop for various reasons, e.g. socket
	 * connection going down, but this mustn't prevent us from freeing
	 * the mem map.
	 */
	virtio_user_unregister_mem(vdev);
	return ret;
}

static int
virtio_user_dev_setup(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint16_t i;

	dev->vhostfd = -1;

	for (i = 0; i < SPDK_VIRTIO_MAX_VIRTQUEUES; ++i) {
		dev->callfds[i] = -1;
		dev->kickfds[i] = -1;
	}

	return vhost_user_setup(dev);
}

static int
virtio_user_read_dev_config(struct virtio_dev *vdev, size_t offset,
			    void *dst, int length)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_user_config cfg = {0};
	int rc;

	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
		return -ENOTSUP;
	}

	cfg.offset = 0;
	cfg.size = VHOST_USER_MAX_CONFIG_SIZE;

	rc = vhost_user_sock(dev, VHOST_USER_GET_CONFIG, &cfg);
	if (rc < 0) {
		SPDK_ERRLOG("get_config failed: %s\n", spdk_strerror(-rc));
		return rc;
	}

	memcpy(dst, cfg.region + offset, length);
	return 0;
}

static int
virtio_user_write_dev_config(struct virtio_dev *vdev, size_t offset,
			     const void *src, int length)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_user_config cfg = {0};
	int rc;

	if ((dev->protocol_features & (1ULL << VHOST_USER_PROTOCOL_F_CONFIG)) == 0) {
		return -ENOTSUP;
	}

	cfg.offset = offset;
	cfg.size = length;
	memcpy(cfg.region, src, length);

	rc = vhost_user_sock(dev, VHOST_USER_SET_CONFIG, &cfg);
	if (rc < 0) {
		SPDK_ERRLOG("set_config failed: %s\n", spdk_strerror(-rc));
		return rc;
	}

	return 0;
}

static void
virtio_user_set_status(struct virtio_dev *vdev, uint8_t status)
{
	struct virtio_user_dev *dev = vdev->ctx;
	int rc = 0;

	if ((dev->status & VIRTIO_CONFIG_S_NEEDS_RESET) &&
	    status != VIRTIO_CONFIG_S_RESET) {
		rc = -1;
	} else if (status & VIRTIO_CONFIG_S_DRIVER_OK) {
		rc = virtio_user_start_device(vdev);
	} else if (status == VIRTIO_CONFIG_S_RESET &&
		   (dev->status & VIRTIO_CONFIG_S_DRIVER_OK)) {
		rc = virtio_user_stop_device(vdev);
	}

	if (rc != 0) {
		dev->status |= VIRTIO_CONFIG_S_NEEDS_RESET;
	} else {
		dev->status = status;
	}
}
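
/* The status byte follows the virtio spec lifecycle: the driver sets
 * ACKNOWLEDGE and DRIVER while probing, FEATURES_OK after negotiation and
 * DRIVER_OK to start the device; writing 0 (VIRTIO_CONFIG_S_RESET) resets it.
 * Here only the DRIVER_OK and RESET transitions map to backend actions
 * (start/stop), and a failed transition latches NEEDS_RESET.
 */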

static uint8_t
virtio_user_get_status(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;

	return dev->status;
}

static uint64_t
virtio_user_get_features(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t features;
	int rc;

	rc = vhost_user_sock(dev, VHOST_USER_GET_FEATURES, &features);
	if (rc < 0) {
		SPDK_ERRLOG("get_features failed: %s\n", spdk_strerror(-rc));
		return 0;
	}

	return features;
}

static int
virtio_user_set_features(struct virtio_dev *vdev, uint64_t features)
{
	struct virtio_user_dev *dev = vdev->ctx;
	uint64_t protocol_features;
	int ret;

	ret = vhost_user_sock(dev, VHOST_USER_SET_FEATURES, &features);
	if (ret < 0) {
		return ret;
	}

	vdev->negotiated_features = features;
	vdev->modern = virtio_dev_has_feature(vdev, VIRTIO_F_VERSION_1);

	if (!virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
		/* nothing else to do */
		return 0;
	}

	ret = vhost_user_sock(dev, VHOST_USER_GET_PROTOCOL_FEATURES, &protocol_features);
	if (ret < 0) {
		return ret;
	}

	protocol_features &= VIRTIO_USER_SUPPORTED_PROTOCOL_FEATURES;
	ret = vhost_user_sock(dev, VHOST_USER_SET_PROTOCOL_FEATURES, &protocol_features);
	if (ret < 0) {
		return ret;
	}

	dev->protocol_features = protocol_features;
	return 0;
}

static uint16_t
virtio_user_get_queue_size(struct virtio_dev *vdev, uint16_t queue_id)
{
	struct virtio_user_dev *dev = vdev->ctx;

	/* Currently every queue has the same queue size */
	return dev->queue_size;
}

static int
virtio_user_setup_queue(struct virtio_dev *vdev, struct virtqueue *vq)
{
	struct virtio_user_dev *dev = vdev->ctx;
	struct vhost_vring_state state;
	uint16_t queue_idx = vq->vq_queue_index;
	void *queue_mem;
	uint64_t desc_addr, avail_addr, used_addr;
	int callfd, kickfd, rc;

	if (dev->callfds[queue_idx] != -1 || dev->kickfds[queue_idx] != -1) {
		SPDK_ERRLOG("queue %"PRIu16" already exists\n", queue_idx);
		return -EEXIST;
	}

	/* We could pass an invalid (-1) fd, but some backends use the kickfd
	 * and callfd as criteria to judge whether the device is alive, so we
	 * use real eventfds in the end.
	 */
	callfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	if (callfd < 0) {
		SPDK_ERRLOG("callfd error, %s\n", spdk_strerror(errno));
		return -errno;
	}

	kickfd = eventfd(0, EFD_CLOEXEC | EFD_NONBLOCK);
	if (kickfd < 0) {
		SPDK_ERRLOG("kickfd error, %s\n", spdk_strerror(errno));
		close(callfd);
		return -errno;
	}

	queue_mem = spdk_zmalloc(vq->vq_ring_size, VIRTIO_PCI_VRING_ALIGN, NULL,
				 SPDK_ENV_LCORE_ID_ANY, SPDK_MALLOC_DMA);
	if (queue_mem == NULL) {
		close(kickfd);
		close(callfd);
		return -ENOMEM;
	}

	vq->vq_ring_mem = SPDK_VTOPHYS_ERROR;
	vq->vq_ring_virt_mem = queue_mem;

	state.index = vq->vq_queue_index;
	state.num = vq->vq_nentries;

	if (virtio_dev_has_feature(vdev, VHOST_USER_F_PROTOCOL_FEATURES)) {
		rc = vhost_user_sock(dev, VHOST_USER_SET_VRING_ENABLE, &state);
		if (rc < 0) {
			SPDK_ERRLOG("failed to send VHOST_USER_SET_VRING_ENABLE: %s\n",
				    spdk_strerror(-rc));
			close(kickfd);
			close(callfd);
			spdk_free(queue_mem);
			return rc;
		}
	}

	dev->callfds[queue_idx] = callfd;
	dev->kickfds[queue_idx] = kickfd;

	desc_addr = (uintptr_t)vq->vq_ring_virt_mem;
	avail_addr = desc_addr + vq->vq_nentries * sizeof(struct vring_desc);
	used_addr = SPDK_ALIGN_CEIL(avail_addr + offsetof(struct vring_avail,
				    ring[vq->vq_nentries]),
				    VIRTIO_PCI_VRING_ALIGN);

	dev->vrings[queue_idx].num = vq->vq_nentries;
	dev->vrings[queue_idx].desc = (void *)(uintptr_t)desc_addr;
	dev->vrings[queue_idx].avail = (void *)(uintptr_t)avail_addr;
	dev->vrings[queue_idx].used = (void *)(uintptr_t)used_addr;

	return 0;
}
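
/* Worked example for the split-vring layout computed above, assuming 128
 * entries and VIRTIO_PCI_VRING_ALIGN == 4096: the descriptor table occupies
 * 128 * 16 = 2048 bytes, so avail_addr = desc_addr + 2048; the avail ring
 * header plus ring[128] is 4 + 2 * 128 = 260 bytes, so desc_addr + 2308 is
 * rounded up and used_addr lands at desc_addr + 4096.
 */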

static void
virtio_user_del_queue(struct virtio_dev *vdev, struct virtqueue *vq)
{
	/* For legacy devices, the driver writes 0 to the VIRTIO_PCI_QUEUE_PFN
	 * port, and QEMU correspondingly stops the ioeventfds and resets the
	 * device status.
	 * For modern devices, the queue desc, avail and used addresses in the
	 * PCI bar are set to 0, with no further action seen in QEMU.
	 *
	 * Here we just care about what information to deliver to vhost-user.
	 * So we just close the ioeventfds for now.
	 */
	struct virtio_user_dev *dev = vdev->ctx;

	close(dev->callfds[vq->vq_queue_index]);
	close(dev->kickfds[vq->vq_queue_index]);
	dev->callfds[vq->vq_queue_index] = -1;
	dev->kickfds[vq->vq_queue_index] = -1;

	spdk_free(vq->vq_ring_virt_mem);
}

static void
virtio_user_notify_queue(struct virtio_dev *vdev, struct virtqueue *vq)
{
	uint64_t buf = 1;
	struct virtio_user_dev *dev = vdev->ctx;

	if (write(dev->kickfds[vq->vq_queue_index], &buf, sizeof(buf)) < 0) {
		SPDK_ERRLOG("failed to kick backend: %s.\n", spdk_strerror(errno));
	}
}
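
/* An eventfd kick is just an 8-byte counter increment: the write(2) above
 * adds 1 to the kickfd's counter and wakes up the backend, which typically
 * has the fd registered in an epoll set and drains the counter with a read(2)
 * before polling the avail ring.
 */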

static void
virtio_user_destroy(struct virtio_dev *vdev)
{
	struct virtio_user_dev *dev = vdev->ctx;

	if (dev) {
		close(dev->vhostfd);
		free(dev);
	}
}

static void
virtio_user_dump_json_info(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
{
	struct virtio_user_dev *dev = vdev->ctx;

	spdk_json_write_named_string(w, "type", "user");
	spdk_json_write_named_string(w, "socket", dev->path);
}

static void
virtio_user_write_json_config(struct virtio_dev *vdev, struct spdk_json_write_ctx *w)
{
	struct virtio_user_dev *dev = vdev->ctx;

	spdk_json_write_named_string(w, "trtype", "user");
	spdk_json_write_named_string(w, "traddr", dev->path);
	spdk_json_write_named_uint32(w, "vq_count", vdev->max_queues - vdev->fixed_queues_num);
	spdk_json_write_named_uint32(w, "vq_size", virtio_dev_backend_ops(vdev)->get_queue_size(vdev, 0));
}

static const struct virtio_dev_ops virtio_user_ops = {
	.read_dev_cfg	= virtio_user_read_dev_config,
	.write_dev_cfg	= virtio_user_write_dev_config,
	.get_status	= virtio_user_get_status,
	.set_status	= virtio_user_set_status,
	.get_features	= virtio_user_get_features,
	.set_features	= virtio_user_set_features,
	.destruct_dev	= virtio_user_destroy,
	.get_queue_size	= virtio_user_get_queue_size,
	.setup_queue	= virtio_user_setup_queue,
	.del_queue	= virtio_user_del_queue,
	.notify_queue	= virtio_user_notify_queue,
	.dump_json_info = virtio_user_dump_json_info,
	.write_json_config = virtio_user_write_json_config,
};

int
virtio_user_dev_init(struct virtio_dev *vdev, const char *name, const char *path,
		     uint32_t queue_size)
{
	struct virtio_user_dev *dev;
	int rc;

	if (name == NULL) {
		SPDK_ERRLOG("No name given for controller: %s\n", path);
		return -EINVAL;
	}

	dev = calloc(1, sizeof(*dev));
	if (dev == NULL) {
		return -ENOMEM;
	}

	rc = virtio_dev_construct(vdev, name, &virtio_user_ops, dev);
	if (rc != 0) {
		SPDK_ERRLOG("Failed to init device: %s\n", path);
		free(dev);
		return rc;
	}

	vdev->is_hw = 0;

	snprintf(dev->path, PATH_MAX, "%s", path);
	dev->queue_size = queue_size;

	rc = virtio_user_dev_setup(vdev);
	if (rc < 0) {
		SPDK_ERRLOG("backend setup failed\n");
		goto err;
	}

	rc = vhost_user_sock(dev, VHOST_USER_SET_OWNER, NULL);
	if (rc < 0) {
		SPDK_ERRLOG("set_owner failed: %s\n", spdk_strerror(-rc));
		goto err;
	}

	return 0;

err:
	virtio_dev_destruct(vdev);
	return rc;
}
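
/* A minimal sketch of bringing up a device through this transport (the socket
 * path "/var/tmp/vhost.0" and the queue size 512 are hypothetical values):
 *
 *	struct virtio_dev vdev;
 *
 *	if (virtio_user_dev_init(&vdev, "VirtioScsi0", "/var/tmp/vhost.0", 512) == 0) {
 *		// feature negotiation, queue allocation etc. follow via the
 *		// generic virtio_dev_* APIs, which call back into virtio_user_ops.
 *	}
 */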
SPDK_LOG_REGISTER_COMPONENT(virtio_user)