xref: /dpdk/lib/vhost/socket.c (revision a39f5e14560df310e7af3c135a11bb6f51eb92e6)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <limits.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <sys/socket.h>
12 #include <sys/un.h>
13 #include <sys/queue.h>
14 #include <errno.h>
15 #include <fcntl.h>
16 #include <pthread.h>
17 
18 #include <rte_log.h>
19 
20 #include "fd_man.h"
21 #include "vhost.h"
22 #include "vhost_user.h"
23 
24 
/* List head type for the per-socket list of struct vhost_user_connection. */
TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);
26 
27 /*
28  * Every time rte_vhost_driver_register() is invoked, an associated
29  * vhost_user_socket struct will be created.
30  */
struct vhost_user_socket {
	/* Connections currently attached to this socket. */
	struct vhost_user_connection_list conn_list;
	/* Taken around insertions into and removals from conn_list. */
	pthread_mutex_t conn_mutex;
	/* Heap copy of the socket path, made with strdup() at registration. */
	char *path;
	/* Descriptor created by create_unix_socket(). */
	int socket_fd;
	/* AF_UNIX address filled in from path by create_unix_socket(). */
	struct sockaddr_un un;
	/* Set when RTE_VHOST_USER_CLIENT was not requested at registration. */
	bool is_server;
	/* Client mode only: set unless RTE_VHOST_USER_NO_RECONNECT was given. */
	bool reconnect;
	/* Mirrors RTE_VHOST_USER_IOMMU_SUPPORT. */
	bool iommu_support;
	/* Set at registration; cleared by rte_vhost_driver_set_features(). */
	bool use_builtin_virtio_net;
	/* Mirrors RTE_VHOST_USER_EXTBUF_SUPPORT. */
	bool extbuf;
	/* Mirrors RTE_VHOST_USER_LINEARBUF_SUPPORT. */
	bool linearbuf;
	/* Mirrors RTE_VHOST_USER_ASYNC_COPY. */
	bool async_copy;
	/* Mirrors RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS. */
	bool net_compliant_ol_flags;
	/* Mirrors RTE_VHOST_USER_NET_STATS_ENABLE. */
	bool stats_enabled;

	/*
	 * The "supported_features" indicates the feature bits the
	 * vhost driver supports. The "features" indicates the feature
	 * bits after the rte_vhost_driver_features_disable/enable().
	 * It is also the final feature bits used for vhost-user
	 * features negotiation.
	 */
	uint64_t supported_features;
	uint64_t features;

	/* vhost-user protocol feature bits advertised for this socket. */
	uint64_t protocol_features;

	/* vDPA device attached to this socket, or NULL when none is attached. */
	struct rte_vdpa_device *vdpa_dev;

	/*
	 * Application callbacks; stays NULL until
	 * rte_vhost_driver_callback_register() is called for this path.
	 */
	struct rte_vhost_device_ops const *notify_ops;
};
63 
/* One established vhost-user connection on a registered socket. */
struct vhost_user_connection {
	/* Socket this connection belongs to. */
	struct vhost_user_socket *vsocket;
	/* Descriptor of the connection, registered in the global fdset. */
	int connfd;
	/* Device id returned by vhost_new_device() for this connection. */
	int vid;

	/* Linkage in vsocket->conn_list. */
	TAILQ_ENTRY(vhost_user_connection) next;
};
71 
/* Upper bound on the number of sockets that can be registered at once. */
#define MAX_VHOST_SOCKET 1024
/* File-wide bookkeeping shared by all registered vhost-user sockets. */
struct vhost_user {
	/* Registered sockets, densely packed in slots [0, vsocket_cnt). */
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	/* Descriptor set handed to the fdset_event_dispatch thread. */
	struct fdset fdset;
	/* Number of valid entries in vsockets. */
	int vsocket_cnt;
	/* Taken around lookups in and updates of the vsockets table. */
	pthread_mutex_t mutex;
};
79 
/* Backlog value passed to listen() on server-mode sockets. */
#define MAX_VIRTIO_BACKLOG 128
81 
/* Forward declarations for helpers that are used before they are defined. */
static void vhost_user_server_new_connection(int fd, void *data, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);
86 
/*
 * The single file-wide instance: table of registered sockets, the fdset
 * served by the event dispatch thread, and the mutex protecting the table.
 */
static struct vhost_user vhost_user = {
	.fdset = {
		/*
		 * GNU range designator: every slot gets the same initial value,
		 * starting with -1 (presumably the "no fd" marker -- confirm
		 * against fd_man.h).
		 */
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};
97 
98 /*
99  * return bytes# of read on success or negative val on failure. Update fdnum
100  * with number of fds read.
101  */
102 int
103 read_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int max_fds,
104 		int *fd_num)
105 {
106 	struct iovec iov;
107 	struct msghdr msgh;
108 	char control[CMSG_SPACE(max_fds * sizeof(int))];
109 	struct cmsghdr *cmsg;
110 	int got_fds = 0;
111 	int ret;
112 
113 	*fd_num = 0;
114 
115 	memset(&msgh, 0, sizeof(msgh));
116 	iov.iov_base = buf;
117 	iov.iov_len  = buflen;
118 
119 	msgh.msg_iov = &iov;
120 	msgh.msg_iovlen = 1;
121 	msgh.msg_control = control;
122 	msgh.msg_controllen = sizeof(control);
123 
124 	ret = recvmsg(sockfd, &msgh, 0);
125 	if (ret <= 0) {
126 		if (ret)
127 			VHOST_LOG_CONFIG(ifname, ERR, "recvmsg failed on fd %d (%s)\n",
128 				sockfd, strerror(errno));
129 		return ret;
130 	}
131 
132 	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
133 		VHOST_LOG_CONFIG(ifname, ERR, "truncated msg (fd %d)\n", sockfd);
134 		return -1;
135 	}
136 
137 	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
138 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
139 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
140 			(cmsg->cmsg_type == SCM_RIGHTS)) {
141 			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
142 			*fd_num = got_fds;
143 			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
144 			break;
145 		}
146 	}
147 
148 	/* Clear out unused file descriptors */
149 	while (got_fds < max_fds)
150 		fds[got_fds++] = -1;
151 
152 	return ret;
153 }
154 
155 int
156 send_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int fd_num)
157 {
158 
159 	struct iovec iov;
160 	struct msghdr msgh;
161 	size_t fdsize = fd_num * sizeof(int);
162 	char control[CMSG_SPACE(fdsize)];
163 	struct cmsghdr *cmsg;
164 	int ret;
165 
166 	memset(&msgh, 0, sizeof(msgh));
167 	iov.iov_base = buf;
168 	iov.iov_len = buflen;
169 
170 	msgh.msg_iov = &iov;
171 	msgh.msg_iovlen = 1;
172 
173 	if (fds && fd_num > 0) {
174 		msgh.msg_control = control;
175 		msgh.msg_controllen = sizeof(control);
176 		cmsg = CMSG_FIRSTHDR(&msgh);
177 		if (cmsg == NULL) {
178 			VHOST_LOG_CONFIG(ifname, ERR, "cmsg == NULL\n");
179 			errno = EINVAL;
180 			return -1;
181 		}
182 		cmsg->cmsg_len = CMSG_LEN(fdsize);
183 		cmsg->cmsg_level = SOL_SOCKET;
184 		cmsg->cmsg_type = SCM_RIGHTS;
185 		memcpy(CMSG_DATA(cmsg), fds, fdsize);
186 	} else {
187 		msgh.msg_control = NULL;
188 		msgh.msg_controllen = 0;
189 	}
190 
191 	do {
192 		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
193 	} while (ret < 0 && errno == EINTR);
194 
195 	if (ret < 0) {
196 		VHOST_LOG_CONFIG(ifname, ERR, "sendmsg error on fd %d (%s)\n",
197 			sockfd, strerror(errno));
198 		return ret;
199 	}
200 
201 	return ret;
202 }
203 
204 static void
205 vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
206 {
207 	int vid;
208 	size_t size;
209 	struct vhost_user_connection *conn;
210 	int ret;
211 	struct virtio_net *dev;
212 
213 	if (vsocket == NULL)
214 		return;
215 
216 	conn = malloc(sizeof(*conn));
217 	if (conn == NULL) {
218 		close(fd);
219 		return;
220 	}
221 
222 	vid = vhost_new_device();
223 	if (vid == -1) {
224 		goto err;
225 	}
226 
227 	size = strnlen(vsocket->path, PATH_MAX);
228 	vhost_set_ifname(vid, vsocket->path, size);
229 
230 	vhost_setup_virtio_net(vid, vsocket->use_builtin_virtio_net,
231 		vsocket->net_compliant_ol_flags, vsocket->stats_enabled,
232 		vsocket->iommu_support);
233 
234 	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev);
235 
236 	if (vsocket->extbuf)
237 		vhost_enable_extbuf(vid);
238 
239 	if (vsocket->linearbuf)
240 		vhost_enable_linearbuf(vid);
241 
242 	if (vsocket->async_copy) {
243 		dev = get_device(vid);
244 
245 		if (dev)
246 			dev->async_copy = 1;
247 	}
248 
249 	VHOST_LOG_CONFIG(vsocket->path, INFO, "new device, handle is %d\n", vid);
250 
251 	if (vsocket->notify_ops->new_connection) {
252 		ret = vsocket->notify_ops->new_connection(vid);
253 		if (ret < 0) {
254 			VHOST_LOG_CONFIG(vsocket->path, ERR,
255 				"failed to add vhost user connection with fd %d\n",
256 				fd);
257 			goto err_cleanup;
258 		}
259 	}
260 
261 	conn->connfd = fd;
262 	conn->vsocket = vsocket;
263 	conn->vid = vid;
264 	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
265 			NULL, conn);
266 	if (ret < 0) {
267 		VHOST_LOG_CONFIG(vsocket->path, ERR,
268 			"failed to add fd %d into vhost server fdset\n",
269 			fd);
270 
271 		if (vsocket->notify_ops->destroy_connection)
272 			vsocket->notify_ops->destroy_connection(conn->vid);
273 
274 		goto err_cleanup;
275 	}
276 
277 	pthread_mutex_lock(&vsocket->conn_mutex);
278 	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
279 	pthread_mutex_unlock(&vsocket->conn_mutex);
280 
281 	fdset_pipe_notify(&vhost_user.fdset);
282 	return;
283 
284 err_cleanup:
285 	vhost_destroy_device(vid);
286 err:
287 	free(conn);
288 	close(fd);
289 }
290 
291 /* call back when there is new vhost-user connection from client  */
292 static void
293 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
294 {
295 	struct vhost_user_socket *vsocket = dat;
296 
297 	fd = accept(fd, NULL, NULL);
298 	if (fd < 0)
299 		return;
300 
301 	VHOST_LOG_CONFIG(vsocket->path, INFO, "new vhost user connection is %d\n", fd);
302 	vhost_user_add_connection(fd, vsocket);
303 }
304 
305 static void
306 vhost_user_read_cb(int connfd, void *dat, int *remove)
307 {
308 	struct vhost_user_connection *conn = dat;
309 	struct vhost_user_socket *vsocket = conn->vsocket;
310 	int ret;
311 
312 	ret = vhost_user_msg_handler(conn->vid, connfd);
313 	if (ret < 0) {
314 		struct virtio_net *dev = get_device(conn->vid);
315 
316 		close(connfd);
317 		*remove = 1;
318 
319 		if (dev)
320 			vhost_destroy_device_notify(dev);
321 
322 		if (vsocket->notify_ops->destroy_connection)
323 			vsocket->notify_ops->destroy_connection(conn->vid);
324 
325 		vhost_destroy_device(conn->vid);
326 
327 		if (vsocket->reconnect) {
328 			create_unix_socket(vsocket);
329 			vhost_user_start_client(vsocket);
330 		}
331 
332 		pthread_mutex_lock(&vsocket->conn_mutex);
333 		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
334 		pthread_mutex_unlock(&vsocket->conn_mutex);
335 
336 		free(conn);
337 	}
338 }
339 
340 static int
341 create_unix_socket(struct vhost_user_socket *vsocket)
342 {
343 	int fd;
344 	struct sockaddr_un *un = &vsocket->un;
345 
346 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
347 	if (fd < 0)
348 		return -1;
349 	VHOST_LOG_CONFIG(vsocket->path, INFO, "vhost-user %s: socket created, fd: %d\n",
350 		vsocket->is_server ? "server" : "client", fd);
351 
352 	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
353 		VHOST_LOG_CONFIG(vsocket->path, ERR,
354 			"vhost-user: can't set nonblocking mode for socket, fd: %d (%s)\n",
355 			fd, strerror(errno));
356 		close(fd);
357 		return -1;
358 	}
359 
360 	memset(un, 0, sizeof(*un));
361 	un->sun_family = AF_UNIX;
362 	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
363 	un->sun_path[sizeof(un->sun_path) - 1] = '\0';
364 
365 	vsocket->socket_fd = fd;
366 	return 0;
367 }
368 
369 static int
370 vhost_user_start_server(struct vhost_user_socket *vsocket)
371 {
372 	int ret;
373 	int fd = vsocket->socket_fd;
374 	const char *path = vsocket->path;
375 
376 	/*
377 	 * bind () may fail if the socket file with the same name already
378 	 * exists. But the library obviously should not delete the file
379 	 * provided by the user, since we can not be sure that it is not
380 	 * being used by other applications. Moreover, many applications form
381 	 * socket names based on user input, which is prone to errors.
382 	 *
383 	 * The user must ensure that the socket does not exist before
384 	 * registering the vhost driver in server mode.
385 	 */
386 	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
387 	if (ret < 0) {
388 		VHOST_LOG_CONFIG(path, ERR, "failed to bind: %s; remove it and try again\n",
389 			strerror(errno));
390 		goto err;
391 	}
392 	VHOST_LOG_CONFIG(path, INFO, "binding succeeded\n");
393 
394 	ret = listen(fd, MAX_VIRTIO_BACKLOG);
395 	if (ret < 0)
396 		goto err;
397 
398 	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
399 		  NULL, vsocket);
400 	if (ret < 0) {
401 		VHOST_LOG_CONFIG(path, ERR, "failed to add listen fd %d to vhost server fdset\n",
402 			fd);
403 		goto err;
404 	}
405 
406 	return 0;
407 
408 err:
409 	close(fd);
410 	return -1;
411 }
412 
/* A client socket whose connection attempt is still pending a retry. */
struct vhost_user_reconnect {
	/* Copy of the address to connect to. */
	struct sockaddr_un un;
	/* Descriptor on which connect() is retried. */
	int fd;
	/* Socket the connection will be attached to once established. */
	struct vhost_user_socket *vsocket;

	/* Linkage in reconn_list.head. */
	TAILQ_ENTRY(vhost_user_reconnect) next;
};
420 
TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
/* List of pending reconnections together with the mutex that guards it. */
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;
};
426 
/* Pending reconnections; initialized by vhost_user_reconnect_init(). */
static struct vhost_user_reconnect_list reconn_list;
/* Reconnect thread id; compared against 0 to tell whether it was started. */
static pthread_t reconn_tid;
429 
430 static int
431 vhost_user_connect_nonblock(char *path, int fd, struct sockaddr *un, size_t sz)
432 {
433 	int ret, flags;
434 
435 	ret = connect(fd, un, sz);
436 	if (ret < 0 && errno != EISCONN)
437 		return -1;
438 
439 	flags = fcntl(fd, F_GETFL, 0);
440 	if (flags < 0) {
441 		VHOST_LOG_CONFIG(path, ERR, "can't get flags for connfd %d (%s)\n",
442 			fd, strerror(errno));
443 		return -2;
444 	}
445 	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
446 		VHOST_LOG_CONFIG(path, ERR, "can't disable nonblocking on fd %d\n", fd);
447 		return -2;
448 	}
449 	return 0;
450 }
451 
452 static void *
453 vhost_user_client_reconnect(void *arg __rte_unused)
454 {
455 	int ret;
456 	struct vhost_user_reconnect *reconn, *next;
457 
458 	while (1) {
459 		pthread_mutex_lock(&reconn_list.mutex);
460 
461 		/*
462 		 * An equal implementation of TAILQ_FOREACH_SAFE,
463 		 * which does not exist on all platforms.
464 		 */
465 		for (reconn = TAILQ_FIRST(&reconn_list.head);
466 		     reconn != NULL; reconn = next) {
467 			next = TAILQ_NEXT(reconn, next);
468 
469 			ret = vhost_user_connect_nonblock(reconn->vsocket->path, reconn->fd,
470 						(struct sockaddr *)&reconn->un,
471 						sizeof(reconn->un));
472 			if (ret == -2) {
473 				close(reconn->fd);
474 				VHOST_LOG_CONFIG(reconn->vsocket->path, ERR,
475 					"reconnection for fd %d failed\n",
476 					reconn->fd);
477 				goto remove_fd;
478 			}
479 			if (ret == -1)
480 				continue;
481 
482 			VHOST_LOG_CONFIG(reconn->vsocket->path, INFO, "connected\n");
483 			vhost_user_add_connection(reconn->fd, reconn->vsocket);
484 remove_fd:
485 			TAILQ_REMOVE(&reconn_list.head, reconn, next);
486 			free(reconn);
487 		}
488 
489 		pthread_mutex_unlock(&reconn_list.mutex);
490 		sleep(1);
491 	}
492 
493 	return NULL;
494 }
495 
496 static int
497 vhost_user_reconnect_init(void)
498 {
499 	int ret;
500 
501 	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
502 	if (ret < 0) {
503 		VHOST_LOG_CONFIG("thread", ERR, "%s: failed to initialize mutex\n", __func__);
504 		return ret;
505 	}
506 	TAILQ_INIT(&reconn_list.head);
507 
508 	ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
509 			     vhost_user_client_reconnect, NULL);
510 	if (ret != 0) {
511 		VHOST_LOG_CONFIG("thread", ERR, "failed to create reconnect thread\n");
512 		if (pthread_mutex_destroy(&reconn_list.mutex))
513 			VHOST_LOG_CONFIG("thread", ERR,
514 				"%s: failed to destroy reconnect mutex\n",
515 				__func__);
516 	}
517 
518 	return ret;
519 }
520 
521 static int
522 vhost_user_start_client(struct vhost_user_socket *vsocket)
523 {
524 	int ret;
525 	int fd = vsocket->socket_fd;
526 	const char *path = vsocket->path;
527 	struct vhost_user_reconnect *reconn;
528 
529 	ret = vhost_user_connect_nonblock(vsocket->path, fd, (struct sockaddr *)&vsocket->un,
530 					  sizeof(vsocket->un));
531 	if (ret == 0) {
532 		vhost_user_add_connection(fd, vsocket);
533 		return 0;
534 	}
535 
536 	VHOST_LOG_CONFIG(path, WARNING, "failed to connect: %s\n", strerror(errno));
537 
538 	if (ret == -2 || !vsocket->reconnect) {
539 		close(fd);
540 		return -1;
541 	}
542 
543 	VHOST_LOG_CONFIG(path, INFO, "reconnecting...\n");
544 	reconn = malloc(sizeof(*reconn));
545 	if (reconn == NULL) {
546 		VHOST_LOG_CONFIG(path, ERR, "failed to allocate memory for reconnect\n");
547 		close(fd);
548 		return -1;
549 	}
550 	reconn->un = vsocket->un;
551 	reconn->fd = fd;
552 	reconn->vsocket = vsocket;
553 	pthread_mutex_lock(&reconn_list.mutex);
554 	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
555 	pthread_mutex_unlock(&reconn_list.mutex);
556 
557 	return 0;
558 }
559 
560 static struct vhost_user_socket *
561 find_vhost_user_socket(const char *path)
562 {
563 	int i;
564 
565 	if (path == NULL)
566 		return NULL;
567 
568 	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
569 		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
570 
571 		if (!strcmp(vsocket->path, path))
572 			return vsocket;
573 	}
574 
575 	return NULL;
576 }
577 
578 int
579 rte_vhost_driver_attach_vdpa_device(const char *path,
580 		struct rte_vdpa_device *dev)
581 {
582 	struct vhost_user_socket *vsocket;
583 
584 	if (dev == NULL || path == NULL)
585 		return -1;
586 
587 	pthread_mutex_lock(&vhost_user.mutex);
588 	vsocket = find_vhost_user_socket(path);
589 	if (vsocket)
590 		vsocket->vdpa_dev = dev;
591 	pthread_mutex_unlock(&vhost_user.mutex);
592 
593 	return vsocket ? 0 : -1;
594 }
595 
596 int
597 rte_vhost_driver_detach_vdpa_device(const char *path)
598 {
599 	struct vhost_user_socket *vsocket;
600 
601 	pthread_mutex_lock(&vhost_user.mutex);
602 	vsocket = find_vhost_user_socket(path);
603 	if (vsocket)
604 		vsocket->vdpa_dev = NULL;
605 	pthread_mutex_unlock(&vhost_user.mutex);
606 
607 	return vsocket ? 0 : -1;
608 }
609 
610 struct rte_vdpa_device *
611 rte_vhost_driver_get_vdpa_device(const char *path)
612 {
613 	struct vhost_user_socket *vsocket;
614 	struct rte_vdpa_device *dev = NULL;
615 
616 	pthread_mutex_lock(&vhost_user.mutex);
617 	vsocket = find_vhost_user_socket(path);
618 	if (vsocket)
619 		dev = vsocket->vdpa_dev;
620 	pthread_mutex_unlock(&vhost_user.mutex);
621 
622 	return dev;
623 }
624 
625 int
626 rte_vhost_driver_get_vdpa_dev_type(const char *path, uint32_t *type)
627 {
628 	struct vhost_user_socket *vsocket;
629 	struct rte_vdpa_device *vdpa_dev;
630 	int ret = 0;
631 
632 	pthread_mutex_lock(&vhost_user.mutex);
633 	vsocket = find_vhost_user_socket(path);
634 	if (!vsocket) {
635 		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
636 		ret = -1;
637 		goto unlock_exit;
638 	}
639 
640 	vdpa_dev = vsocket->vdpa_dev;
641 	if (!vdpa_dev) {
642 		ret = -1;
643 		goto unlock_exit;
644 	}
645 
646 	*type = vdpa_dev->type;
647 
648 unlock_exit:
649 	pthread_mutex_unlock(&vhost_user.mutex);
650 	return ret;
651 }
652 
653 int
654 rte_vhost_driver_disable_features(const char *path, uint64_t features)
655 {
656 	struct vhost_user_socket *vsocket;
657 
658 	pthread_mutex_lock(&vhost_user.mutex);
659 	vsocket = find_vhost_user_socket(path);
660 
661 	/* Note that use_builtin_virtio_net is not affected by this function
662 	 * since callers may want to selectively disable features of the
663 	 * built-in vhost net device backend.
664 	 */
665 
666 	if (vsocket)
667 		vsocket->features &= ~features;
668 	pthread_mutex_unlock(&vhost_user.mutex);
669 
670 	return vsocket ? 0 : -1;
671 }
672 
673 int
674 rte_vhost_driver_enable_features(const char *path, uint64_t features)
675 {
676 	struct vhost_user_socket *vsocket;
677 
678 	pthread_mutex_lock(&vhost_user.mutex);
679 	vsocket = find_vhost_user_socket(path);
680 	if (vsocket) {
681 		if ((vsocket->supported_features & features) != features) {
682 			/*
683 			 * trying to enable features the driver doesn't
684 			 * support.
685 			 */
686 			pthread_mutex_unlock(&vhost_user.mutex);
687 			return -1;
688 		}
689 		vsocket->features |= features;
690 	}
691 	pthread_mutex_unlock(&vhost_user.mutex);
692 
693 	return vsocket ? 0 : -1;
694 }
695 
696 int
697 rte_vhost_driver_set_features(const char *path, uint64_t features)
698 {
699 	struct vhost_user_socket *vsocket;
700 
701 	pthread_mutex_lock(&vhost_user.mutex);
702 	vsocket = find_vhost_user_socket(path);
703 	if (vsocket) {
704 		vsocket->supported_features = features;
705 		vsocket->features = features;
706 
707 		/* Anyone setting feature bits is implementing their own vhost
708 		 * device backend.
709 		 */
710 		vsocket->use_builtin_virtio_net = false;
711 	}
712 	pthread_mutex_unlock(&vhost_user.mutex);
713 
714 	return vsocket ? 0 : -1;
715 }
716 
717 int
718 rte_vhost_driver_get_features(const char *path, uint64_t *features)
719 {
720 	struct vhost_user_socket *vsocket;
721 	uint64_t vdpa_features;
722 	struct rte_vdpa_device *vdpa_dev;
723 	int ret = 0;
724 
725 	pthread_mutex_lock(&vhost_user.mutex);
726 	vsocket = find_vhost_user_socket(path);
727 	if (!vsocket) {
728 		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
729 		ret = -1;
730 		goto unlock_exit;
731 	}
732 
733 	vdpa_dev = vsocket->vdpa_dev;
734 	if (!vdpa_dev) {
735 		*features = vsocket->features;
736 		goto unlock_exit;
737 	}
738 
739 	if (vdpa_dev->ops->get_features(vdpa_dev, &vdpa_features) < 0) {
740 		VHOST_LOG_CONFIG(path, ERR, "failed to get vdpa features for socket file.\n");
741 		ret = -1;
742 		goto unlock_exit;
743 	}
744 
745 	*features = vsocket->features & vdpa_features;
746 
747 unlock_exit:
748 	pthread_mutex_unlock(&vhost_user.mutex);
749 	return ret;
750 }
751 
752 int
753 rte_vhost_driver_set_protocol_features(const char *path,
754 		uint64_t protocol_features)
755 {
756 	struct vhost_user_socket *vsocket;
757 
758 	pthread_mutex_lock(&vhost_user.mutex);
759 	vsocket = find_vhost_user_socket(path);
760 	if (vsocket)
761 		vsocket->protocol_features = protocol_features;
762 	pthread_mutex_unlock(&vhost_user.mutex);
763 	return vsocket ? 0 : -1;
764 }
765 
766 int
767 rte_vhost_driver_get_protocol_features(const char *path,
768 		uint64_t *protocol_features)
769 {
770 	struct vhost_user_socket *vsocket;
771 	uint64_t vdpa_protocol_features;
772 	struct rte_vdpa_device *vdpa_dev;
773 	int ret = 0;
774 
775 	pthread_mutex_lock(&vhost_user.mutex);
776 	vsocket = find_vhost_user_socket(path);
777 	if (!vsocket) {
778 		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
779 		ret = -1;
780 		goto unlock_exit;
781 	}
782 
783 	vdpa_dev = vsocket->vdpa_dev;
784 	if (!vdpa_dev) {
785 		*protocol_features = vsocket->protocol_features;
786 		goto unlock_exit;
787 	}
788 
789 	if (vdpa_dev->ops->get_protocol_features(vdpa_dev,
790 				&vdpa_protocol_features) < 0) {
791 		VHOST_LOG_CONFIG(path, ERR, "failed to get vdpa protocol features.\n");
792 		ret = -1;
793 		goto unlock_exit;
794 	}
795 
796 	*protocol_features = vsocket->protocol_features
797 		& vdpa_protocol_features;
798 
799 unlock_exit:
800 	pthread_mutex_unlock(&vhost_user.mutex);
801 	return ret;
802 }
803 
804 int
805 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
806 {
807 	struct vhost_user_socket *vsocket;
808 	uint32_t vdpa_queue_num;
809 	struct rte_vdpa_device *vdpa_dev;
810 	int ret = 0;
811 
812 	pthread_mutex_lock(&vhost_user.mutex);
813 	vsocket = find_vhost_user_socket(path);
814 	if (!vsocket) {
815 		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
816 		ret = -1;
817 		goto unlock_exit;
818 	}
819 
820 	vdpa_dev = vsocket->vdpa_dev;
821 	if (!vdpa_dev) {
822 		*queue_num = VHOST_MAX_QUEUE_PAIRS;
823 		goto unlock_exit;
824 	}
825 
826 	if (vdpa_dev->ops->get_queue_num(vdpa_dev, &vdpa_queue_num) < 0) {
827 		VHOST_LOG_CONFIG(path, ERR, "failed to get vdpa queue number.\n");
828 		ret = -1;
829 		goto unlock_exit;
830 	}
831 
832 	*queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);
833 
834 unlock_exit:
835 	pthread_mutex_unlock(&vhost_user.mutex);
836 	return ret;
837 }
838 
839 static void
840 vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
841 {
842 	if (vsocket && vsocket->path) {
843 		free(vsocket->path);
844 		vsocket->path = NULL;
845 	}
846 
847 	if (vsocket) {
848 		free(vsocket);
849 		vsocket = NULL;
850 	}
851 }
852 
853 /*
854  * Register a new vhost-user socket; here we could act as server
855  * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
856  * is set.
857  */
858 int
859 rte_vhost_driver_register(const char *path, uint64_t flags)
860 {
861 	int ret = -1;
862 	struct vhost_user_socket *vsocket;
863 
864 	if (!path)
865 		return -1;
866 
867 	pthread_mutex_lock(&vhost_user.mutex);
868 
869 	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
870 		VHOST_LOG_CONFIG(path, ERR, "the number of vhost sockets reaches maximum\n");
871 		goto out;
872 	}
873 
874 	vsocket = malloc(sizeof(struct vhost_user_socket));
875 	if (!vsocket)
876 		goto out;
877 	memset(vsocket, 0, sizeof(struct vhost_user_socket));
878 	vsocket->path = strdup(path);
879 	if (vsocket->path == NULL) {
880 		VHOST_LOG_CONFIG(path, ERR, "failed to copy socket path string\n");
881 		vhost_user_socket_mem_free(vsocket);
882 		goto out;
883 	}
884 	TAILQ_INIT(&vsocket->conn_list);
885 	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
886 	if (ret) {
887 		VHOST_LOG_CONFIG(path, ERR, "failed to init connection mutex\n");
888 		goto out_free;
889 	}
890 	vsocket->vdpa_dev = NULL;
891 	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
892 	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
893 	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
894 	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
895 	vsocket->stats_enabled = flags & RTE_VHOST_USER_NET_STATS_ENABLE;
896 	vsocket->iommu_support = flags & RTE_VHOST_USER_IOMMU_SUPPORT;
897 
898 	if (vsocket->async_copy &&
899 		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
900 		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
901 		VHOST_LOG_CONFIG(path, ERR, "async copy with IOMMU or post-copy not supported\n");
902 		goto out_mutex;
903 	}
904 
905 	/*
906 	 * Set the supported features correctly for the builtin vhost-user
907 	 * net driver.
908 	 *
909 	 * Applications know nothing about features the builtin virtio net
910 	 * driver (virtio_net.c) supports, thus it's not possible for them
911 	 * to invoke rte_vhost_driver_set_features(). To workaround it, here
912 	 * we set it unconditionally. If the application want to implement
913 	 * another vhost-user driver (say SCSI), it should call the
914 	 * rte_vhost_driver_set_features(), which will overwrite following
915 	 * two values.
916 	 */
917 	vsocket->use_builtin_virtio_net = true;
918 	vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
919 	vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
920 	vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
921 
922 	if (vsocket->async_copy) {
923 		vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
924 		vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
925 		VHOST_LOG_CONFIG(path, INFO, "logging feature is disabled in async copy mode\n");
926 	}
927 
928 	/*
929 	 * We'll not be able to receive a buffer from guest in linear mode
930 	 * without external buffer if it will not fit in a single mbuf, which is
931 	 * likely if segmentation offloading enabled.
932 	 */
933 	if (vsocket->linearbuf && !vsocket->extbuf) {
934 		uint64_t seg_offload_features =
935 				(1ULL << VIRTIO_NET_F_HOST_TSO4) |
936 				(1ULL << VIRTIO_NET_F_HOST_TSO6) |
937 				(1ULL << VIRTIO_NET_F_HOST_UFO);
938 
939 		VHOST_LOG_CONFIG(path, INFO, "Linear buffers requested without external buffers,\n");
940 		VHOST_LOG_CONFIG(path, INFO, "disabling host segmentation offloading support\n");
941 		vsocket->supported_features &= ~seg_offload_features;
942 		vsocket->features &= ~seg_offload_features;
943 	}
944 
945 	if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
946 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
947 		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
948 	}
949 
950 	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
951 		vsocket->protocol_features &=
952 			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
953 	} else {
954 #ifndef RTE_LIBRTE_VHOST_POSTCOPY
955 		VHOST_LOG_CONFIG(path, ERR, "Postcopy requested but not compiled\n");
956 		ret = -1;
957 		goto out_mutex;
958 #endif
959 	}
960 
961 	if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
962 		vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
963 		if (vsocket->reconnect && reconn_tid == 0) {
964 			if (vhost_user_reconnect_init() != 0)
965 				goto out_mutex;
966 		}
967 	} else {
968 		vsocket->is_server = true;
969 	}
970 	ret = create_unix_socket(vsocket);
971 	if (ret < 0) {
972 		goto out_mutex;
973 	}
974 
975 	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
976 
977 	pthread_mutex_unlock(&vhost_user.mutex);
978 	return ret;
979 
980 out_mutex:
981 	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
982 		VHOST_LOG_CONFIG(path, ERR, "failed to destroy connection mutex\n");
983 	}
984 out_free:
985 	vhost_user_socket_mem_free(vsocket);
986 out:
987 	pthread_mutex_unlock(&vhost_user.mutex);
988 
989 	return ret;
990 }
991 
992 static bool
993 vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
994 {
995 	int found = false;
996 	struct vhost_user_reconnect *reconn, *next;
997 
998 	pthread_mutex_lock(&reconn_list.mutex);
999 
1000 	for (reconn = TAILQ_FIRST(&reconn_list.head);
1001 	     reconn != NULL; reconn = next) {
1002 		next = TAILQ_NEXT(reconn, next);
1003 
1004 		if (reconn->vsocket == vsocket) {
1005 			TAILQ_REMOVE(&reconn_list.head, reconn, next);
1006 			close(reconn->fd);
1007 			free(reconn);
1008 			found = true;
1009 			break;
1010 		}
1011 	}
1012 	pthread_mutex_unlock(&reconn_list.mutex);
1013 	return found;
1014 }
1015 
1016 /**
1017  * Unregister the specified vhost socket
1018  */
1019 int
1020 rte_vhost_driver_unregister(const char *path)
1021 {
1022 	int i;
1023 	int count;
1024 	struct vhost_user_connection *conn, *next;
1025 
1026 	if (path == NULL)
1027 		return -1;
1028 
1029 again:
1030 	pthread_mutex_lock(&vhost_user.mutex);
1031 
1032 	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
1033 		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
1034 		if (strcmp(vsocket->path, path))
1035 			continue;
1036 
1037 		if (vsocket->is_server) {
1038 			/*
1039 			 * If r/wcb is executing, release vhost_user's
1040 			 * mutex lock, and try again since the r/wcb
1041 			 * may use the mutex lock.
1042 			 */
1043 			if (fdset_try_del(&vhost_user.fdset, vsocket->socket_fd) == -1) {
1044 				pthread_mutex_unlock(&vhost_user.mutex);
1045 				goto again;
1046 			}
1047 		} else if (vsocket->reconnect) {
1048 			vhost_user_remove_reconnect(vsocket);
1049 		}
1050 
1051 		pthread_mutex_lock(&vsocket->conn_mutex);
1052 		for (conn = TAILQ_FIRST(&vsocket->conn_list);
1053 			 conn != NULL;
1054 			 conn = next) {
1055 			next = TAILQ_NEXT(conn, next);
1056 
1057 			/*
1058 			 * If r/wcb is executing, release vsocket's
1059 			 * conn_mutex and vhost_user's mutex locks, and
1060 			 * try again since the r/wcb may use the
1061 			 * conn_mutex and mutex locks.
1062 			 */
1063 			if (fdset_try_del(&vhost_user.fdset,
1064 					  conn->connfd) == -1) {
1065 				pthread_mutex_unlock(&vsocket->conn_mutex);
1066 				pthread_mutex_unlock(&vhost_user.mutex);
1067 				goto again;
1068 			}
1069 
1070 			VHOST_LOG_CONFIG(path, INFO, "free connfd %d\n", conn->connfd);
1071 			close(conn->connfd);
1072 			vhost_destroy_device(conn->vid);
1073 			TAILQ_REMOVE(&vsocket->conn_list, conn, next);
1074 			free(conn);
1075 		}
1076 		pthread_mutex_unlock(&vsocket->conn_mutex);
1077 
1078 		if (vsocket->is_server) {
1079 			close(vsocket->socket_fd);
1080 			unlink(path);
1081 		}
1082 
1083 		pthread_mutex_destroy(&vsocket->conn_mutex);
1084 		vhost_user_socket_mem_free(vsocket);
1085 
1086 		count = --vhost_user.vsocket_cnt;
1087 		vhost_user.vsockets[i] = vhost_user.vsockets[count];
1088 		vhost_user.vsockets[count] = NULL;
1089 		pthread_mutex_unlock(&vhost_user.mutex);
1090 		return 0;
1091 	}
1092 	pthread_mutex_unlock(&vhost_user.mutex);
1093 
1094 	return -1;
1095 }
1096 
1097 /*
1098  * Register ops so that we can add/remove device to data core.
1099  */
1100 int
1101 rte_vhost_driver_callback_register(const char *path,
1102 	struct rte_vhost_device_ops const * const ops)
1103 {
1104 	struct vhost_user_socket *vsocket;
1105 
1106 	pthread_mutex_lock(&vhost_user.mutex);
1107 	vsocket = find_vhost_user_socket(path);
1108 	if (vsocket)
1109 		vsocket->notify_ops = ops;
1110 	pthread_mutex_unlock(&vhost_user.mutex);
1111 
1112 	return vsocket ? 0 : -1;
1113 }
1114 
1115 struct rte_vhost_device_ops const *
1116 vhost_driver_callback_get(const char *path)
1117 {
1118 	struct vhost_user_socket *vsocket;
1119 
1120 	pthread_mutex_lock(&vhost_user.mutex);
1121 	vsocket = find_vhost_user_socket(path);
1122 	pthread_mutex_unlock(&vhost_user.mutex);
1123 
1124 	return vsocket ? vsocket->notify_ops : NULL;
1125 }
1126 
1127 int
1128 rte_vhost_driver_start(const char *path)
1129 {
1130 	struct vhost_user_socket *vsocket;
1131 	static pthread_t fdset_tid;
1132 
1133 	pthread_mutex_lock(&vhost_user.mutex);
1134 	vsocket = find_vhost_user_socket(path);
1135 	pthread_mutex_unlock(&vhost_user.mutex);
1136 
1137 	if (!vsocket)
1138 		return -1;
1139 
1140 	if (fdset_tid == 0) {
1141 		/**
1142 		 * create a pipe which will be waited by poll and notified to
1143 		 * rebuild the wait list of poll.
1144 		 */
1145 		if (fdset_pipe_init(&vhost_user.fdset) < 0) {
1146 			VHOST_LOG_CONFIG(path, ERR, "failed to create pipe for vhost fdset\n");
1147 			return -1;
1148 		}
1149 
1150 		int ret = rte_ctrl_thread_create(&fdset_tid,
1151 			"vhost-events", NULL, fdset_event_dispatch,
1152 			&vhost_user.fdset);
1153 		if (ret != 0) {
1154 			VHOST_LOG_CONFIG(path, ERR, "failed to create fdset handling thread\n");
1155 			fdset_pipe_uninit(&vhost_user.fdset);
1156 			return -1;
1157 		}
1158 	}
1159 
1160 	if (vsocket->is_server)
1161 		return vhost_user_start_server(vsocket);
1162 	else
1163 		return vhost_user_start_client(vsocket);
1164 }
1165