/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2016 Intel Corporation
 */

#include <stdint.h>
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/queue.h>
#include <errno.h>
#include <fcntl.h>

#include <rte_thread.h>
#include <rte_log.h>

#include "fd_man.h"
#include "vduse.h"
#include "vhost.h"
#include "vhost_user.h"


TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);

/*
 * Every time rte_vhost_driver_register() is invoked, an associated
 * vhost_user_socket struct will be created.
 */
struct vhost_user_socket {
	struct vhost_user_connection_list conn_list;
	pthread_mutex_t conn_mutex;
	char *path;
	int socket_fd;
	struct sockaddr_un un;
	bool is_server;
	bool is_vduse;
	bool reconnect;
	bool iommu_support;
	bool use_builtin_virtio_net;
	bool extbuf;
	bool linearbuf;
	bool async_copy;
	bool net_compliant_ol_flags;
	bool stats_enabled;

	/*
	 * The "supported_features" field holds the feature bits the
	 * vhost driver supports. The "features" field holds the feature
	 * bits remaining after rte_vhost_driver_disable_features() /
	 * rte_vhost_driver_enable_features() have been applied; it is
	 * also the final set used for vhost-user feature negotiation.
	 */
	uint64_t supported_features;
	uint64_t features;

	uint64_t protocol_features;

	uint32_t max_queue_pairs;

	struct rte_vdpa_device *vdpa_dev;

	struct rte_vhost_device_ops const *notify_ops;
};

struct vhost_user_connection {
	struct vhost_user_socket *vsocket;
	int connfd;
	int vid;

	TAILQ_ENTRY(vhost_user_connection) next;
};

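/*
 * Global bookkeeping for all vhost-user sockets registered by the
 * application: the array of sockets, the shared fdset polled by the
 * "vhost-evt" control thread, and the mutex protecting both.
 */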
#define MAX_VHOST_SOCKET 1024
struct vhost_user {
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	struct fdset fdset;
	int vsocket_cnt;
	pthread_mutex_t mutex;
};

#define MAX_VIRTIO_BACKLOG 128

static void vhost_user_server_new_connection(int fd, void *data, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);

static struct vhost_user vhost_user = {
	.fdset = {
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
		.sync_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

/*
 * Return the number of bytes read on success, or a negative value on
 * failure. Update fd_num with the number of fds received.
 */
int
read_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int max_fds,
		int *fd_num)
{
	struct iovec iov;
	struct msghdr msgh;
	char control[CMSG_SPACE(max_fds * sizeof(int))];
	struct cmsghdr *cmsg;
	int got_fds = 0;
	int ret;

	*fd_num = 0;

	memset(&msgh, 0, sizeof(msgh));
	iov.iov_base = buf;
	iov.iov_len  = buflen;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;
	msgh.msg_control = control;
	msgh.msg_controllen = sizeof(control);

	ret = recvmsg(sockfd, &msgh, 0);
	if (ret <= 0) {
		if (ret)
			VHOST_CONFIG_LOG(ifname, ERR, "recvmsg failed on fd %d (%s)",
				sockfd, strerror(errno));
		return ret;
	}

	if (msgh.msg_flags & MSG_TRUNC)
		VHOST_CONFIG_LOG(ifname, ERR, "truncated msg (fd %d)", sockfd);

	/* MSG_CTRUNC may be caused by LSM misconfiguration */
	if (msgh.msg_flags & MSG_CTRUNC)
		VHOST_CONFIG_LOG(ifname, ERR, "truncated control data (fd %d)", sockfd);

	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
		if ((cmsg->cmsg_level == SOL_SOCKET) &&
			(cmsg->cmsg_type == SCM_RIGHTS)) {
			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
			*fd_num = got_fds;
			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
			break;
		}
	}

	/* Clear out unused file descriptors */
	while (got_fds < max_fds)
		fds[got_fds++] = -1;

	return ret;
}
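
/*
 * A minimal usage sketch (hypothetical caller, not part of this file):
 * receive a payload plus up to 8 descriptors over a connected socket.
 *
 *	int fds[8], fd_num;
 *	char buf[256];
 *	int n = read_fd_message(ifname, sockfd, buf, sizeof(buf),
 *				fds, 8, &fd_num);
 *	if (n > 0) {
 *		// n payload bytes are in buf, fd_num descriptors are in
 *		// fds[]; unused fds[] slots were cleared to -1 above.
 *	}
 */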

int
send_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int fd_num)
{
	struct iovec iov;
	struct msghdr msgh;
	size_t fdsize = fd_num * sizeof(int);
	char control[CMSG_SPACE(fdsize)];
	struct cmsghdr *cmsg;
	int ret;

	memset(&msgh, 0, sizeof(msgh));
	iov.iov_base = buf;
	iov.iov_len = buflen;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;

	if (fds && fd_num > 0) {
		msgh.msg_control = control;
		msgh.msg_controllen = sizeof(control);
		cmsg = CMSG_FIRSTHDR(&msgh);
		if (cmsg == NULL) {
			VHOST_CONFIG_LOG(ifname, ERR, "cmsg == NULL");
			errno = EINVAL;
			return -1;
		}
		cmsg->cmsg_len = CMSG_LEN(fdsize);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), fds, fdsize);
	} else {
		msgh.msg_control = NULL;
		msgh.msg_controllen = 0;
	}

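	/*
	 * Retry if interrupted by a signal; MSG_NOSIGNAL avoids SIGPIPE
	 * if the peer has already closed the connection.
	 */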
	do {
		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
	} while (ret < 0 && errno == EINTR);

	if (ret < 0)
		VHOST_CONFIG_LOG(ifname, ERR, "sendmsg error on fd %d (%s)",
			sockfd, strerror(errno));

	return ret;
}

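/*
 * Set up a new vhost device for an established connection: allocate the
 * connection object, create the device, apply the per-socket options and
 * hand the fd over to the fdset so vhost_user_read_cb() can process
 * messages on it.
 */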
static void
vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
{
	int vid;
	size_t size;
	struct vhost_user_connection *conn;
	int ret;
	struct virtio_net *dev;

	if (vsocket == NULL)
		return;

	conn = malloc(sizeof(*conn));
	if (conn == NULL) {
		close(fd);
		return;
	}

	vid = vhost_user_new_device();
	if (vid == -1)
		goto err;

	size = strnlen(vsocket->path, PATH_MAX);
	vhost_set_ifname(vid, vsocket->path, size);

	vhost_setup_virtio_net(vid, vsocket->use_builtin_virtio_net,
		vsocket->net_compliant_ol_flags, vsocket->stats_enabled,
		vsocket->iommu_support);

	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev);

	if (vsocket->extbuf)
		vhost_enable_extbuf(vid);

	if (vsocket->linearbuf)
		vhost_enable_linearbuf(vid);

	if (vsocket->async_copy) {
		dev = get_device(vid);

		if (dev)
			dev->async_copy = 1;
	}

	VHOST_CONFIG_LOG(vsocket->path, INFO, "new device, handle is %d", vid);

	if (vsocket->notify_ops->new_connection) {
		ret = vsocket->notify_ops->new_connection(vid);
		if (ret < 0) {
			VHOST_CONFIG_LOG(vsocket->path, ERR,
				"failed to add vhost user connection with fd %d",
				fd);
			goto err_cleanup;
		}
	}

	conn->connfd = fd;
	conn->vsocket = vsocket;
	conn->vid = vid;
	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
			NULL, conn);
	if (ret < 0) {
		VHOST_CONFIG_LOG(vsocket->path, ERR,
			"failed to add fd %d into vhost server fdset",
			fd);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		goto err_cleanup;
	}

	pthread_mutex_lock(&vsocket->conn_mutex);
	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
	pthread_mutex_unlock(&vsocket->conn_mutex);

	fdset_pipe_notify(&vhost_user.fdset);
	return;

err_cleanup:
	vhost_destroy_device(vid);
err:
	free(conn);
	close(fd);
}

/* Callback invoked when a new vhost-user connection arrives from a client. */
static void
vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
{
	struct vhost_user_socket *vsocket = dat;

	fd = accept(fd, NULL, NULL);
	if (fd < 0)
		return;

	VHOST_CONFIG_LOG(vsocket->path, INFO, "new vhost user connection is %d", fd);
	vhost_user_add_connection(fd, vsocket);
}

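/*
 * Callback invoked by the fdset thread when data arrives on an
 * established connection. On a fatal error from the message handler the
 * connection is torn down and, for reconnect-enabled client sockets, a
 * new connection attempt is queued.
 */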
static void
vhost_user_read_cb(int connfd, void *dat, int *remove)
{
	struct vhost_user_connection *conn = dat;
	struct vhost_user_socket *vsocket = conn->vsocket;
	int ret;

	ret = vhost_user_msg_handler(conn->vid, connfd);
	if (ret < 0) {
		struct virtio_net *dev = get_device(conn->vid);

		close(connfd);
		*remove = 1;

		if (dev)
			vhost_destroy_device_notify(dev);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		vhost_destroy_device(conn->vid);

		if (vsocket->reconnect) {
			create_unix_socket(vsocket);
			vhost_user_start_client(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
		pthread_mutex_unlock(&vsocket->conn_mutex);

		free(conn);
	}
}

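/*
 * Create the AF_UNIX stream socket and pre-fill the sockaddr; the actual
 * bind()/listen() or connect() happens later in vhost_user_start_server()
 * or vhost_user_start_client(). Client sockets are made non-blocking so
 * the reconnect thread cannot stall in connect().
 */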
static int
create_unix_socket(struct vhost_user_socket *vsocket)
{
	int fd;
	struct sockaddr_un *un = &vsocket->un;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	VHOST_CONFIG_LOG(vsocket->path, INFO, "vhost-user %s: socket created, fd: %d",
		vsocket->is_server ? "server" : "client", fd);

	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
		VHOST_CONFIG_LOG(vsocket->path, ERR,
			"vhost-user: can't set nonblocking mode for socket, fd: %d (%s)",
			fd, strerror(errno));
		close(fd);
		return -1;
	}

	memset(un, 0, sizeof(*un));
	un->sun_family = AF_UNIX;
	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
	un->sun_path[sizeof(un->sun_path) - 1] = '\0';

	vsocket->socket_fd = fd;
	return 0;
}

static int
vhost_user_start_server(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;

	/*
	 * bind() may fail if a socket file with the same name already
	 * exists. The library must not delete a file provided by the
	 * user, since we cannot be sure it is not being used by other
	 * applications. Moreover, many applications form socket names
	 * based on user input, which is prone to errors.
	 *
	 * The user must ensure that the socket does not exist before
	 * registering the vhost driver in server mode.
	 */
	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
	if (ret < 0) {
		VHOST_CONFIG_LOG(path, ERR, "failed to bind: %s; remove it and try again",
			strerror(errno));
		goto err;
	}
	VHOST_CONFIG_LOG(path, INFO, "binding succeeded");

	ret = listen(fd, MAX_VIRTIO_BACKLOG);
	if (ret < 0)
		goto err;

	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
		  NULL, vsocket);
	if (ret < 0) {
		VHOST_CONFIG_LOG(path, ERR, "failed to add listen fd %d to vhost server fdset",
			fd);
		goto err;
	}

	return 0;

err:
	close(fd);
	return -1;
}

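/*
 * Client sockets registered with reconnect enabled are tracked on a
 * global list; a dedicated control thread ("vhost-reco") retries each
 * pending connection once per second until it either succeeds or fails
 * fatally.
 */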
struct vhost_user_reconnect {
	struct sockaddr_un un;
	int fd;
	struct vhost_user_socket *vsocket;

	TAILQ_ENTRY(vhost_user_reconnect) next;
};

TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;
};

static struct vhost_user_reconnect_list reconn_list;
static rte_thread_t reconn_tid;

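/*
 * Try to connect an already created socket. Returns 0 on success, -1 if
 * the connection is not established yet (worth retrying), and -2 on an
 * unrecoverable error (the caller should close the fd).
 */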
static int
vhost_user_connect_nonblock(char *path, int fd, struct sockaddr *un, size_t sz)
{
	int ret, flags;

	ret = connect(fd, un, sz);
	if (ret < 0 && errno != EISCONN)
		return -1;

	flags = fcntl(fd, F_GETFL, 0);
	if (flags < 0) {
		VHOST_CONFIG_LOG(path, ERR, "can't get flags for connfd %d (%s)",
			fd, strerror(errno));
		return -2;
	}
	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
		VHOST_CONFIG_LOG(path, ERR, "can't disable nonblocking on fd %d", fd);
		return -2;
	}
	return 0;
}

static uint32_t
vhost_user_client_reconnect(void *arg __rte_unused)
{
	int ret;
	struct vhost_user_reconnect *reconn, *next;

	while (1) {
		pthread_mutex_lock(&reconn_list.mutex);

		/*
		 * An equivalent of TAILQ_FOREACH_SAFE, which does not
		 * exist on all platforms.
		 */
		for (reconn = TAILQ_FIRST(&reconn_list.head);
		     reconn != NULL; reconn = next) {
			next = TAILQ_NEXT(reconn, next);

			ret = vhost_user_connect_nonblock(reconn->vsocket->path, reconn->fd,
						(struct sockaddr *)&reconn->un,
						sizeof(reconn->un));
			if (ret == -2) {
				close(reconn->fd);
				VHOST_CONFIG_LOG(reconn->vsocket->path, ERR,
					"reconnection for fd %d failed",
					reconn->fd);
				goto remove_fd;
			}
			if (ret == -1)
				continue;

			VHOST_CONFIG_LOG(reconn->vsocket->path, INFO, "connected");
			vhost_user_add_connection(reconn->fd, reconn->vsocket);
remove_fd:
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			free(reconn);
		}

		pthread_mutex_unlock(&reconn_list.mutex);
		sleep(1);
	}

	return 0;
}

static int
vhost_user_reconnect_init(void)
{
	int ret;

	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
	if (ret < 0) {
		VHOST_CONFIG_LOG("thread", ERR, "%s: failed to initialize mutex", __func__);
		return ret;
	}
	TAILQ_INIT(&reconn_list.head);

	ret = rte_thread_create_internal_control(&reconn_tid, "vhost-reco",
			vhost_user_client_reconnect, NULL);
	if (ret != 0) {
		VHOST_CONFIG_LOG("thread", ERR, "failed to create reconnect thread");
		if (pthread_mutex_destroy(&reconn_list.mutex))
			VHOST_CONFIG_LOG("thread", ERR,
				"%s: failed to destroy reconnect mutex",
				__func__);
	}

	return ret;
}

static int
vhost_user_start_client(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;
	struct vhost_user_reconnect *reconn;

	ret = vhost_user_connect_nonblock(vsocket->path, fd, (struct sockaddr *)&vsocket->un,
					  sizeof(vsocket->un));
	if (ret == 0) {
		vhost_user_add_connection(fd, vsocket);
		return 0;
	}

	VHOST_CONFIG_LOG(path, WARNING, "failed to connect: %s", strerror(errno));

	if (ret == -2 || !vsocket->reconnect) {
		close(fd);
		return -1;
	}

	VHOST_CONFIG_LOG(path, INFO, "reconnecting...");
	reconn = malloc(sizeof(*reconn));
	if (reconn == NULL) {
		VHOST_CONFIG_LOG(path, ERR, "failed to allocate memory for reconnect");
		close(fd);
		return -1;
	}
	reconn->un = vsocket->un;
	reconn->fd = fd;
	reconn->vsocket = vsocket;
	pthread_mutex_lock(&reconn_list.mutex);
	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
	pthread_mutex_unlock(&reconn_list.mutex);

	return 0;
}

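/* The caller is expected to hold vhost_user.mutex. */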
static struct vhost_user_socket *
find_vhost_user_socket(const char *path)
{
	int i;

	if (path == NULL)
		return NULL;

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];

		if (!strcmp(vsocket->path, path))
			return vsocket;
	}

	return NULL;
}

int
rte_vhost_driver_attach_vdpa_device(const char *path,
		struct rte_vdpa_device *dev)
{
	struct vhost_user_socket *vsocket;

	if (dev == NULL || path == NULL)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->vdpa_dev = dev;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_detach_vdpa_device(const char *path)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->vdpa_dev = NULL;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

struct rte_vdpa_device *
rte_vhost_driver_get_vdpa_device(const char *path)
{
	struct vhost_user_socket *vsocket;
	struct rte_vdpa_device *dev = NULL;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		dev = vsocket->vdpa_dev;
	pthread_mutex_unlock(&vhost_user.mutex);

	return dev;
}

int
rte_vhost_driver_get_vdpa_dev_type(const char *path, uint32_t *type)
{
	struct vhost_user_socket *vsocket;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		ret = -1;
		goto unlock_exit;
	}

	*type = vdpa_dev->type;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_disable_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);

	/* Note that use_builtin_virtio_net is not affected by this function
	 * since callers may want to selectively disable features of the
	 * built-in vhost net device backend.
	 */

	if (vsocket)
		vsocket->features &= ~features;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_enable_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		if ((vsocket->supported_features & features) != features) {
			/*
			 * trying to enable features the driver doesn't
			 * support.
			 */
			pthread_mutex_unlock(&vhost_user.mutex);
			return -1;
		}
		vsocket->features |= features;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_set_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		vsocket->supported_features = features;
		vsocket->features = features;

		/* Anyone setting feature bits is implementing their own vhost
		 * device backend.
		 */
		vsocket->use_builtin_virtio_net = false;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_get_features(const char *path, uint64_t *features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*features = vsocket->features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_features(vdpa_dev, &vdpa_features) < 0) {
		VHOST_CONFIG_LOG(path, ERR, "failed to get vdpa features for socket file.");
		ret = -1;
		goto unlock_exit;
	}

	*features = vsocket->features & vdpa_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_set_protocol_features(const char *path,
		uint64_t protocol_features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->protocol_features = protocol_features;
	pthread_mutex_unlock(&vhost_user.mutex);
	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_get_protocol_features(const char *path,
		uint64_t *protocol_features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_protocol_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*protocol_features = vsocket->protocol_features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_protocol_features(vdpa_dev,
				&vdpa_protocol_features) < 0) {
		VHOST_CONFIG_LOG(path, ERR, "failed to get vdpa protocol features.");
		ret = -1;
		goto unlock_exit;
	}

	*protocol_features = vsocket->protocol_features
		& vdpa_protocol_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
{
	struct vhost_user_socket *vsocket;
	uint32_t vdpa_queue_num;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*queue_num = vsocket->max_queue_pairs;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_queue_num(vdpa_dev, &vdpa_queue_num) < 0) {
		VHOST_CONFIG_LOG(path, ERR, "failed to get vdpa queue number.");
		ret = -1;
		goto unlock_exit;
	}

	*queue_num = RTE_MIN(vsocket->max_queue_pairs, vdpa_queue_num);

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_set_max_queue_num(const char *path, uint32_t max_queue_pairs)
{
	struct vhost_user_socket *vsocket;
	int ret = 0;

	VHOST_CONFIG_LOG(path, INFO, "Setting max queue pairs to %u", max_queue_pairs);

	if (max_queue_pairs > VHOST_MAX_QUEUE_PAIRS) {
		VHOST_CONFIG_LOG(path, ERR, "Library only supports up to %u queue pairs",
				VHOST_MAX_QUEUE_PAIRS);
		return -1;
	}

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
		ret = -1;
		goto unlock_exit;
	}

	vsocket->max_queue_pairs = max_queue_pairs;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

static void
vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
{
	if (vsocket == NULL)
		return;

	free(vsocket->path);
	free(vsocket);
}

/*
 * Register a new vhost-user socket; here we can act as a server
 * (the default case) or as a client (when the RTE_VHOST_USER_CLIENT
 * flag is set).
 */
int
rte_vhost_driver_register(const char *path, uint64_t flags)
{
	int ret = -1;
	struct vhost_user_socket *vsocket;

	if (!path)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);

	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
		VHOST_CONFIG_LOG(path, ERR, "the number of vhost sockets reaches maximum");
		goto out;
	}

	vsocket = malloc(sizeof(struct vhost_user_socket));
	if (!vsocket)
		goto out;
	memset(vsocket, 0, sizeof(struct vhost_user_socket));
	vsocket->path = strdup(path);
	if (vsocket->path == NULL) {
		VHOST_CONFIG_LOG(path, ERR, "failed to copy socket path string");
		vhost_user_socket_mem_free(vsocket);
		goto out;
	}
	TAILQ_INIT(&vsocket->conn_list);
	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
	if (ret) {
		VHOST_CONFIG_LOG(path, ERR, "failed to init connection mutex");
		goto out_free;
	}

	if (!strncmp("/dev/vduse/", path, strlen("/dev/vduse/")))
		vsocket->is_vduse = true;

	vsocket->vdpa_dev = NULL;
	vsocket->max_queue_pairs = VHOST_MAX_QUEUE_PAIRS;
	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
	vsocket->stats_enabled = flags & RTE_VHOST_USER_NET_STATS_ENABLE;
	if (vsocket->is_vduse)
		vsocket->iommu_support = true;
	else
		vsocket->iommu_support = flags & RTE_VHOST_USER_IOMMU_SUPPORT;

	if (vsocket->async_copy && (vsocket->iommu_support ||
				(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
		VHOST_CONFIG_LOG(path, ERR, "async copy with IOMMU or post-copy not supported");
		goto out_mutex;
	}

	/*
	 * Set the supported features correctly for the builtin vhost-user
	 * net driver.
	 *
	 * Applications know nothing about the features the builtin virtio net
	 * driver (virtio_net.c) supports, thus it's not possible for them
	 * to invoke rte_vhost_driver_set_features(). To work around this, we
	 * set it unconditionally here. If the application wants to implement
	 * another vhost-user driver (say SCSI), it should call
	 * rte_vhost_driver_set_features(), which will overwrite the following
	 * two values.
	 */
	vsocket->use_builtin_virtio_net = true;
	if (vsocket->is_vduse) {
		vsocket->supported_features = VDUSE_NET_SUPPORTED_FEATURES;
		vsocket->features           = VDUSE_NET_SUPPORTED_FEATURES;
	} else {
		vsocket->supported_features = VHOST_USER_NET_SUPPORTED_FEATURES;
		vsocket->features           = VHOST_USER_NET_SUPPORTED_FEATURES;
		vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
	}

	if (vsocket->async_copy) {
		vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
		vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
		VHOST_CONFIG_LOG(path, INFO, "logging feature is disabled in async copy mode");
	}

	/*
	 * In linear mode without external buffers, we cannot receive a
	 * buffer from the guest if it does not fit in a single mbuf, which
	 * is likely when segmentation offloading is enabled.
	 */
	if (vsocket->linearbuf && !vsocket->extbuf) {
		uint64_t seg_offload_features =
				(1ULL << VIRTIO_NET_F_HOST_TSO4) |
				(1ULL << VIRTIO_NET_F_HOST_TSO6) |
				(1ULL << VIRTIO_NET_F_HOST_UFO);

		VHOST_CONFIG_LOG(path, INFO, "Linear buffers requested without external buffers,");
		VHOST_CONFIG_LOG(path, INFO, "disabling host segmentation offloading support");
		vsocket->supported_features &= ~seg_offload_features;
		vsocket->features &= ~seg_offload_features;
	}

	if (!vsocket->iommu_support) {
		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	}

	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
		vsocket->protocol_features &=
			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
	} else {
#ifndef RTE_LIBRTE_VHOST_POSTCOPY
		VHOST_CONFIG_LOG(path, ERR, "Postcopy requested but not compiled");
		ret = -1;
		goto out_mutex;
#endif
	}

	if (!vsocket->is_vduse) {
		if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
			vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
			if (vsocket->reconnect && reconn_tid.opaque_id == 0) {
				if (vhost_user_reconnect_init() != 0)
					goto out_mutex;
			}
		} else {
			vsocket->is_server = true;
		}
		ret = create_unix_socket(vsocket);
		if (ret < 0)
			goto out_mutex;
	}

	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;

	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;

out_mutex:
	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
		VHOST_CONFIG_LOG(path, ERR, "failed to destroy connection mutex");
	}
out_free:
	vhost_user_socket_mem_free(vsocket);
out:
	pthread_mutex_unlock(&vhost_user.mutex);

	return ret;
}

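/*
 * A minimal bring-up sketch (illustrative only; the socket path and ops
 * structure are application-provided):
 *
 *	rte_vhost_driver_register("/tmp/vhost.sock", 0);
 *	rte_vhost_driver_callback_register("/tmp/vhost.sock", &ops);
 *	rte_vhost_driver_start("/tmp/vhost.sock");
 */
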
static bool
vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
{
	bool found = false;
	struct vhost_user_reconnect *reconn, *next;

	pthread_mutex_lock(&reconn_list.mutex);

	for (reconn = TAILQ_FIRST(&reconn_list.head);
	     reconn != NULL; reconn = next) {
		next = TAILQ_NEXT(reconn, next);

		if (reconn->vsocket == vsocket) {
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			close(reconn->fd);
			free(reconn);
			found = true;
			break;
		}
	}
	pthread_mutex_unlock(&reconn_list.mutex);
	return found;
}

/**
 * Unregister the specified vhost socket
 */
int
rte_vhost_driver_unregister(const char *path)
{
	int i;
	int count;
	struct vhost_user_connection *conn, *next;

	if (path == NULL)
		return -1;

again:
	pthread_mutex_lock(&vhost_user.mutex);

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
		if (strcmp(vsocket->path, path))
			continue;

		if (vsocket->is_vduse) {
			vduse_device_destroy(path);
		} else if (vsocket->is_server) {
			/*
			 * If the read/write callback is executing, release
			 * vhost_user's mutex and try again, since the
			 * callback may itself take that mutex.
			 */
			if (fdset_try_del(&vhost_user.fdset, vsocket->socket_fd) == -1) {
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}
		} else if (vsocket->reconnect) {
			vhost_user_remove_reconnect(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		for (conn = TAILQ_FIRST(&vsocket->conn_list);
			 conn != NULL;
			 conn = next) {
			next = TAILQ_NEXT(conn, next);

			/*
			 * If the read/write callback is executing, release
			 * vsocket's conn_mutex and vhost_user's mutex and
			 * try again, since the callback may itself take
			 * either lock.
			 */
			if (fdset_try_del(&vhost_user.fdset,
					  conn->connfd) == -1) {
				pthread_mutex_unlock(&vsocket->conn_mutex);
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}

			VHOST_CONFIG_LOG(path, INFO, "free connfd %d", conn->connfd);
			close(conn->connfd);
			vhost_destroy_device(conn->vid);
			TAILQ_REMOVE(&vsocket->conn_list, conn, next);
			free(conn);
		}
		pthread_mutex_unlock(&vsocket->conn_mutex);

		if (vsocket->is_server) {
			close(vsocket->socket_fd);
			unlink(path);
		}

		pthread_mutex_destroy(&vsocket->conn_mutex);
		vhost_user_socket_mem_free(vsocket);

		count = --vhost_user.vsocket_cnt;
		vhost_user.vsockets[i] = vhost_user.vsockets[count];
		vhost_user.vsockets[count] = NULL;
		pthread_mutex_unlock(&vhost_user.mutex);
		return 0;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return -1;
}

/*
 * Register ops so that we can add/remove a device to/from the data core.
 */
int
rte_vhost_driver_callback_register(const char *path,
	struct rte_vhost_device_ops const * const ops)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->notify_ops = ops;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

struct rte_vhost_device_ops const *
vhost_driver_callback_get(const char *path)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? vsocket->notify_ops : NULL;
}

int
rte_vhost_driver_start(const char *path)
{
	struct vhost_user_socket *vsocket;
	static rte_thread_t fdset_tid;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	if (!vsocket)
		return -1;

	if (vsocket->is_vduse)
		return vduse_device_create(path, vsocket->net_compliant_ol_flags);

	if (fdset_tid.opaque_id == 0) {
		/*
		 * Create a pipe that the poll loop waits on; writing to it
		 * wakes poll() so that the wait list can be rebuilt.
		 */
		if (fdset_pipe_init(&vhost_user.fdset) < 0) {
			VHOST_CONFIG_LOG(path, ERR, "failed to create pipe for vhost fdset");
			return -1;
		}

		int ret = rte_thread_create_internal_control(&fdset_tid,
				"vhost-evt", fdset_event_dispatch, &vhost_user.fdset);
		if (ret != 0) {
			VHOST_CONFIG_LOG(path, ERR, "failed to create fdset handling thread");
			fdset_pipe_uninit(&vhost_user.fdset);
			return -1;
		}
	}

	if (vsocket->is_server)
		return vhost_user_start_server(vsocket);
	else
		return vhost_user_start_client(vsocket);
}