xref: /dpdk/lib/vhost/socket.c (revision 33e71acf3d446ced520f07e4d75769323e0ec22c)
/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2016 Intel Corporation
 */

#include <stdint.h>
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/queue.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>

#include <rte_log.h>

#include "fd_man.h"
#include "vhost.h"
#include "vhost_user.h"

TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);

/*
 * Every time rte_vhost_driver_register() is invoked, an associated
 * vhost_user_socket struct will be created.
 */
struct vhost_user_socket {
	struct vhost_user_connection_list conn_list;
	pthread_mutex_t conn_mutex;
	char *path;
	int socket_fd;
	struct sockaddr_un un;
	bool is_server;
	bool reconnect;
	bool iommu_support;
	bool use_builtin_virtio_net;
	bool extbuf;
	bool linearbuf;
	bool async_copy;
	bool net_compliant_ol_flags;

	/*
	 * "supported_features" indicates the feature bits the vhost
	 * driver supports. "features" indicates the feature bits after
	 * rte_vhost_driver_disable_features()/rte_vhost_driver_enable_features()
	 * have been applied. It is also the final feature set used for
	 * vhost-user feature negotiation (see the sketch after this struct).
	 */
	uint64_t supported_features;
	uint64_t features;

	uint64_t protocol_features;

	struct rte_vdpa_device *vdpa_dev;

	struct rte_vhost_device_ops const *notify_ops;
};

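/*
 * Minimal sketch of the feature-bit flow described above (the socket
 * path and the chosen feature bit are hypothetical):
 *
 *	rte_vhost_driver_register("/tmp/vhost.sock", 0);
 *
 *	// "supported_features" stays intact; only "features" (the set
 *	// offered during negotiation) loses the bit.
 *	rte_vhost_driver_disable_features("/tmp/vhost.sock",
 *			1ULL << VIRTIO_NET_F_HOST_TSO4);
 */
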
struct vhost_user_connection {
	struct vhost_user_socket *vsocket;
	int connfd;
	int vid;

	TAILQ_ENTRY(vhost_user_connection) next;
};

#define MAX_VHOST_SOCKET 1024
struct vhost_user {
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	struct fdset fdset;
	int vsocket_cnt;
	pthread_mutex_t mutex;
};

#define MAX_VIRTIO_BACKLOG 128

static void vhost_user_server_new_connection(int fd, void *dat, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);

static struct vhost_user vhost_user = {
	.fdset = {
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

/*
 * Return the number of bytes read on success, or a negative value on
 * failure. *fd_num is updated with the number of fds read.
 */
int
read_fd_message(int sockfd, char *buf, int buflen, int *fds, int max_fds,
		int *fd_num)
{
	struct iovec iov;
	struct msghdr msgh;
	char control[CMSG_SPACE(max_fds * sizeof(int))];
	struct cmsghdr *cmsg;
	int got_fds = 0;
	int ret;

	*fd_num = 0;

	memset(&msgh, 0, sizeof(msgh));
	iov.iov_base = buf;
	iov.iov_len  = buflen;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;
	msgh.msg_control = control;
	msgh.msg_controllen = sizeof(control);

	ret = recvmsg(sockfd, &msgh, 0);
	if (ret <= 0) {
		if (ret)
			VHOST_LOG_CONFIG(ERR, "recvmsg failed\n");
		return ret;
	}

	if (msgh.msg_flags & (MSG_TRUNC | MSG_CTRUNC)) {
		VHOST_LOG_CONFIG(ERR, "truncated msg\n");
		return -1;
	}

	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
		if ((cmsg->cmsg_level == SOL_SOCKET) &&
			(cmsg->cmsg_type == SCM_RIGHTS)) {
			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
			*fd_num = got_fds;
			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
			break;
		}
	}

	/* Clear out unused file descriptors */
	while (got_fds < max_fds)
		fds[got_fds++] = -1;

	return ret;
}

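/*
 * Usage sketch, assuming the message-header size and fd limit used by
 * the vhost-user message handler (both defined in vhost_user.h):
 *
 *	char hdr[VHOST_USER_HDR_SIZE];
 *	int fds[VHOST_MEMORY_MAX_NREGIONS];
 *	int fd_num;
 *
 *	int n = read_fd_message(connfd, hdr, sizeof(hdr),
 *			fds, VHOST_MEMORY_MAX_NREGIONS, &fd_num);
 *	if (n <= 0)
 *		;	// peer closed the connection or recvmsg() failed
 */
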
int
send_fd_message(int sockfd, char *buf, int buflen, int *fds, int fd_num)
{
	struct iovec iov;
	struct msghdr msgh;
	size_t fdsize = fd_num * sizeof(int);
	char control[CMSG_SPACE(fdsize)];
	struct cmsghdr *cmsg;
	int ret;

	memset(&msgh, 0, sizeof(msgh));
	iov.iov_base = buf;
	iov.iov_len = buflen;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;

	if (fds && fd_num > 0) {
		msgh.msg_control = control;
		msgh.msg_controllen = sizeof(control);
		cmsg = CMSG_FIRSTHDR(&msgh);
		if (cmsg == NULL) {
			VHOST_LOG_CONFIG(ERR, "cmsg == NULL\n");
			errno = EINVAL;
			return -1;
		}
		cmsg->cmsg_len = CMSG_LEN(fdsize);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), fds, fdsize);
	} else {
		msgh.msg_control = NULL;
		msgh.msg_controllen = 0;
	}

	do {
		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
	} while (ret < 0 && errno == EINTR);

	if (ret < 0) {
		VHOST_LOG_CONFIG(ERR, "sendmsg error\n");
		return ret;
	}

	return ret;
}

static void
vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
{
	int vid;
	size_t size;
	struct vhost_user_connection *conn;
	int ret;
	struct virtio_net *dev;

	if (vsocket == NULL)
		return;

	conn = malloc(sizeof(*conn));
	if (conn == NULL) {
		close(fd);
		return;
	}

	vid = vhost_new_device();
	if (vid == -1)
		goto err;

	size = strnlen(vsocket->path, PATH_MAX);
	vhost_set_ifname(vid, vsocket->path, size);

	vhost_setup_virtio_net(vid, vsocket->use_builtin_virtio_net,
		vsocket->net_compliant_ol_flags);

	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev);

	if (vsocket->extbuf)
		vhost_enable_extbuf(vid);

	if (vsocket->linearbuf)
		vhost_enable_linearbuf(vid);

	if (vsocket->async_copy) {
		dev = get_device(vid);

		if (dev)
			dev->async_copy = 1;
	}

	VHOST_LOG_CONFIG(INFO, "new device, handle is %d, path is %s\n", vid, vsocket->path);

	if (vsocket->notify_ops->new_connection) {
		ret = vsocket->notify_ops->new_connection(vid);
		if (ret < 0) {
			VHOST_LOG_CONFIG(ERR,
				"failed to add vhost user connection with fd %d\n",
				fd);
			goto err_cleanup;
		}
	}

	conn->connfd = fd;
	conn->vsocket = vsocket;
	conn->vid = vid;
	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
			NULL, conn);
	if (ret < 0) {
		VHOST_LOG_CONFIG(ERR,
			"failed to add fd %d into vhost server fdset\n",
			fd);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		goto err_cleanup;
	}

	pthread_mutex_lock(&vsocket->conn_mutex);
	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
	pthread_mutex_unlock(&vsocket->conn_mutex);

	fdset_pipe_notify(&vhost_user.fdset);
	return;

err_cleanup:
	vhost_destroy_device(vid);
err:
	free(conn);
	close(fd);
}

/* Callback invoked when there is a new vhost-user connection from a client. */
static void
vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
{
	struct vhost_user_socket *vsocket = dat;

	fd = accept(fd, NULL, NULL);
	if (fd < 0)
		return;

	VHOST_LOG_CONFIG(INFO, "new vhost user connection is %d\n", fd);
	vhost_user_add_connection(fd, vsocket);
}

static void
vhost_user_read_cb(int connfd, void *dat, int *remove)
{
	struct vhost_user_connection *conn = dat;
	struct vhost_user_socket *vsocket = conn->vsocket;
	int ret;

	ret = vhost_user_msg_handler(conn->vid, connfd);
	if (ret < 0) {
		struct virtio_net *dev = get_device(conn->vid);

		close(connfd);
		*remove = 1;

		if (dev)
			vhost_destroy_device_notify(dev);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		vhost_destroy_device(conn->vid);

		if (vsocket->reconnect) {
			create_unix_socket(vsocket);
			vhost_user_start_client(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
		pthread_mutex_unlock(&vsocket->conn_mutex);

		free(conn);
	}
}

static int
create_unix_socket(struct vhost_user_socket *vsocket)
{
	int fd;
	struct sockaddr_un *un = &vsocket->un;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	VHOST_LOG_CONFIG(INFO, "vhost-user %s: socket created, fd: %d\n",
		vsocket->is_server ? "server" : "client", fd);

	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
		VHOST_LOG_CONFIG(ERR,
			"vhost-user: can't set nonblocking mode for socket, fd: "
			"%d (%s)\n", fd, strerror(errno));
		close(fd);
		return -1;
	}

	memset(un, 0, sizeof(*un));
	un->sun_family = AF_UNIX;
	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
	un->sun_path[sizeof(un->sun_path) - 1] = '\0';

	vsocket->socket_fd = fd;
	return 0;
}

static int
vhost_user_start_server(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;

	/*
	 * bind() may fail if a socket file with the same name already
	 * exists. But the library obviously should not delete a file
	 * provided by the user, since we cannot be sure that it is not
	 * being used by another application. Moreover, many applications
	 * form socket names based on user input, which is prone to errors.
	 *
	 * The user must ensure that the socket does not exist before
	 * registering the vhost driver in server mode (see the sketch
	 * after this function).
	 */
	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
	if (ret < 0) {
		VHOST_LOG_CONFIG(ERR,
			"failed to bind to %s: %s; remove it and try again\n",
			path, strerror(errno));
		goto err;
	}
	VHOST_LOG_CONFIG(INFO, "bind to %s\n", path);

	ret = listen(fd, MAX_VIRTIO_BACKLOG);
	if (ret < 0)
		goto err;

	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
		  NULL, vsocket);
	if (ret < 0) {
		VHOST_LOG_CONFIG(ERR,
			"failed to add listen fd %d to vhost server fdset\n",
			fd);
		goto err;
	}

	return 0;

err:
	close(fd);
	return -1;
}

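/*
 * Typical application-side pattern for server mode (a sketch; the
 * socket path is hypothetical). The application removes any stale
 * socket file before registering, since the library will not do it:
 *
 *	unlink("/tmp/vhost.sock");
 *	if (rte_vhost_driver_register("/tmp/vhost.sock", 0) < 0)
 *		rte_exit(EXIT_FAILURE, "vhost driver register failed\n");
 */
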
struct vhost_user_reconnect {
	struct sockaddr_un un;
	int fd;
	struct vhost_user_socket *vsocket;

	TAILQ_ENTRY(vhost_user_reconnect) next;
};

TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;
};

static struct vhost_user_reconnect_list reconn_list;
static pthread_t reconn_tid;

static int
vhost_user_connect_nonblock(int fd, struct sockaddr *un, size_t sz)
{
	int ret, flags;

	ret = connect(fd, un, sz);
	if (ret < 0 && errno != EISCONN)
		return -1;

	flags = fcntl(fd, F_GETFL, 0);
	if (flags < 0) {
		VHOST_LOG_CONFIG(ERR,
			"can't get flags for connfd %d\n", fd);
		return -2;
	}
	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
		VHOST_LOG_CONFIG(ERR,
				"can't disable nonblocking on fd %d\n", fd);
		return -2;
	}
	return 0;
}

static void *
vhost_user_client_reconnect(void *arg __rte_unused)
{
	int ret;
	struct vhost_user_reconnect *reconn, *next;

	while (1) {
		pthread_mutex_lock(&reconn_list.mutex);

		/*
		 * An equivalent of TAILQ_FOREACH_SAFE, which does not
		 * exist on all platforms.
		 */
		for (reconn = TAILQ_FIRST(&reconn_list.head);
		     reconn != NULL; reconn = next) {
			next = TAILQ_NEXT(reconn, next);

			ret = vhost_user_connect_nonblock(reconn->fd,
						(struct sockaddr *)&reconn->un,
						sizeof(reconn->un));
			if (ret == -2) {
				close(reconn->fd);
				VHOST_LOG_CONFIG(ERR,
					"reconnection for fd %d failed\n",
					reconn->fd);
				goto remove_fd;
			}
			if (ret == -1)
				continue;

			VHOST_LOG_CONFIG(INFO,
				"%s: connected\n", reconn->vsocket->path);
			vhost_user_add_connection(reconn->fd, reconn->vsocket);
remove_fd:
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			free(reconn);
		}

		pthread_mutex_unlock(&reconn_list.mutex);
		sleep(1);
	}

	return NULL;
}

static int
vhost_user_reconnect_init(void)
{
	int ret;

	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
	if (ret < 0) {
		VHOST_LOG_CONFIG(ERR, "failed to initialize mutex\n");
		return ret;
	}
	TAILQ_INIT(&reconn_list.head);

	ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
			     vhost_user_client_reconnect, NULL);
	if (ret != 0) {
		VHOST_LOG_CONFIG(ERR, "failed to create reconnect thread\n");
		if (pthread_mutex_destroy(&reconn_list.mutex)) {
			VHOST_LOG_CONFIG(ERR,
				"failed to destroy reconnect mutex\n");
		}
	}

	return ret;
}

static int
vhost_user_start_client(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;
	struct vhost_user_reconnect *reconn;

	ret = vhost_user_connect_nonblock(fd, (struct sockaddr *)&vsocket->un,
					  sizeof(vsocket->un));
	if (ret == 0) {
		vhost_user_add_connection(fd, vsocket);
		return 0;
	}

	VHOST_LOG_CONFIG(WARNING,
		"failed to connect to %s: %s\n",
		path, strerror(errno));

	if (ret == -2 || !vsocket->reconnect) {
		close(fd);
		return -1;
	}

	VHOST_LOG_CONFIG(INFO, "%s: reconnecting...\n", path);
	reconn = malloc(sizeof(*reconn));
	if (reconn == NULL) {
		VHOST_LOG_CONFIG(ERR,
			"failed to allocate memory for reconnect\n");
		close(fd);
		return -1;
	}
	reconn->un = vsocket->un;
	reconn->fd = fd;
	reconn->vsocket = vsocket;
	pthread_mutex_lock(&reconn_list.mutex);
	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
	pthread_mutex_unlock(&reconn_list.mutex);

	return 0;
}

static struct vhost_user_socket *
find_vhost_user_socket(const char *path)
{
	int i;

	if (path == NULL)
		return NULL;

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];

		if (!strcmp(vsocket->path, path))
			return vsocket;
	}

	return NULL;
}

int
rte_vhost_driver_attach_vdpa_device(const char *path,
		struct rte_vdpa_device *dev)
{
	struct vhost_user_socket *vsocket;

	if (dev == NULL || path == NULL)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->vdpa_dev = dev;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_detach_vdpa_device(const char *path)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->vdpa_dev = NULL;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

struct rte_vdpa_device *
rte_vhost_driver_get_vdpa_device(const char *path)
{
	struct vhost_user_socket *vsocket;
	struct rte_vdpa_device *dev = NULL;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		dev = vsocket->vdpa_dev;
	pthread_mutex_unlock(&vhost_user.mutex);

	return dev;
}

int
rte_vhost_driver_disable_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);

	/* Note that use_builtin_virtio_net is not affected by this function
	 * since callers may want to selectively disable features of the
	 * built-in vhost net device backend.
	 */

	if (vsocket)
		vsocket->features &= ~features;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_enable_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		if ((vsocket->supported_features & features) != features) {
			/*
			 * trying to enable features the driver doesn't
			 * support.
			 */
			pthread_mutex_unlock(&vhost_user.mutex);
			return -1;
		}
		vsocket->features |= features;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_set_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		vsocket->supported_features = features;
		vsocket->features = features;

		/* Anyone setting feature bits is implementing their own vhost
		 * device backend.
		 */
		vsocket->use_builtin_virtio_net = false;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_get_features(const char *path, uint64_t *features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(ERR,
			"socket file %s is not registered yet.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*features = vsocket->features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_features(vdpa_dev, &vdpa_features) < 0) {
		VHOST_LOG_CONFIG(ERR,
				"failed to get vdpa features "
				"for socket file %s.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	*features = vsocket->features & vdpa_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_set_protocol_features(const char *path,
		uint64_t protocol_features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->protocol_features = protocol_features;
	pthread_mutex_unlock(&vhost_user.mutex);
	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_get_protocol_features(const char *path,
		uint64_t *protocol_features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_protocol_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(ERR,
			"socket file %s is not registered yet.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*protocol_features = vsocket->protocol_features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_protocol_features(vdpa_dev,
				&vdpa_protocol_features) < 0) {
		VHOST_LOG_CONFIG(ERR,
				"failed to get vdpa protocol features "
				"for socket file %s.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	*protocol_features = vsocket->protocol_features
		& vdpa_protocol_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
{
	struct vhost_user_socket *vsocket;
	uint32_t vdpa_queue_num;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(ERR,
			"socket file %s is not registered yet.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*queue_num = VHOST_MAX_QUEUE_PAIRS;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_queue_num(vdpa_dev, &vdpa_queue_num) < 0) {
		VHOST_LOG_CONFIG(ERR,
				"failed to get vdpa queue number "
				"for socket file %s.\n", path);
		ret = -1;
		goto unlock_exit;
	}

	*queue_num = RTE_MIN((uint32_t)VHOST_MAX_QUEUE_PAIRS, vdpa_queue_num);

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

static void
vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
{
	if (vsocket == NULL)
		return;

	free(vsocket->path);
	free(vsocket);
}

/*
 * Register a new vhost-user socket; here we could act as server
 * (the default case), or client (when the RTE_VHOST_USER_CLIENT
 * flag is set).
 */
int
rte_vhost_driver_register(const char *path, uint64_t flags)
{
	int ret = -1;
	struct vhost_user_socket *vsocket;

	if (!path)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);

	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
		VHOST_LOG_CONFIG(ERR,
			"error: the number of vhost sockets has reached the maximum\n");
		goto out;
	}

	vsocket = malloc(sizeof(struct vhost_user_socket));
	if (!vsocket)
		goto out;
	memset(vsocket, 0, sizeof(struct vhost_user_socket));
	vsocket->path = strdup(path);
	if (vsocket->path == NULL) {
		VHOST_LOG_CONFIG(ERR,
			"error: failed to copy socket path string\n");
		vhost_user_socket_mem_free(vsocket);
		goto out;
	}
	TAILQ_INIT(&vsocket->conn_list);
	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
	if (ret) {
		VHOST_LOG_CONFIG(ERR,
			"error: failed to init connection mutex\n");
		goto out_free;
	}
	vsocket->vdpa_dev = NULL;
	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;

	if (vsocket->async_copy &&
		(flags & (RTE_VHOST_USER_IOMMU_SUPPORT |
		RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
		VHOST_LOG_CONFIG(ERR, "error: enabling async copy and IOMMU "
			"or post-copy feature simultaneously is not "
			"supported\n");
		goto out_mutex;
	}

	/*
	 * Set the supported features correctly for the builtin vhost-user
	 * net driver.
	 *
	 * Applications know nothing about the features the builtin virtio
	 * net driver (virtio_net.c) supports, thus it's not possible for
	 * them to invoke rte_vhost_driver_set_features(). To work around
	 * this, we set the features unconditionally here. If an application
	 * wants to implement another vhost-user driver (say SCSI), it
	 * should call rte_vhost_driver_set_features(), which will overwrite
	 * the following two values (see the sketch after this function).
	 */
	vsocket->use_builtin_virtio_net = true;
	vsocket->supported_features = VIRTIO_NET_SUPPORTED_FEATURES;
	vsocket->features           = VIRTIO_NET_SUPPORTED_FEATURES;
	vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;

	if (vsocket->async_copy) {
		vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
		vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
		VHOST_LOG_CONFIG(INFO,
			"Logging feature is disabled in async copy mode\n");
	}

	/*
	 * In linear-buffer mode without external buffers, we cannot
	 * receive a buffer from the guest if it does not fit in a single
	 * mbuf, which is likely when segmentation offloading is enabled.
	 */
	if (vsocket->linearbuf && !vsocket->extbuf) {
		uint64_t seg_offload_features =
				(1ULL << VIRTIO_NET_F_HOST_TSO4) |
				(1ULL << VIRTIO_NET_F_HOST_TSO6) |
				(1ULL << VIRTIO_NET_F_HOST_UFO);

		VHOST_LOG_CONFIG(INFO,
			"Linear buffers requested without external buffers, "
			"disabling host segmentation offloading support\n");
		vsocket->supported_features &= ~seg_offload_features;
		vsocket->features &= ~seg_offload_features;
	}

	if (!(flags & RTE_VHOST_USER_IOMMU_SUPPORT)) {
		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	}

	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
		vsocket->protocol_features &=
			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
	} else {
#ifndef RTE_LIBRTE_VHOST_POSTCOPY
		VHOST_LOG_CONFIG(ERR,
			"Postcopy requested but not compiled\n");
		ret = -1;
		goto out_mutex;
#endif
	}

	if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
		vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
		if (vsocket->reconnect && reconn_tid == 0) {
			if (vhost_user_reconnect_init() != 0)
				goto out_mutex;
		}
	} else {
		vsocket->is_server = true;
	}
	ret = create_unix_socket(vsocket);
	if (ret < 0)
		goto out_mutex;

	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;

	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;

out_mutex:
	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
		VHOST_LOG_CONFIG(ERR,
			"error: failed to destroy connection mutex\n");
	}
out_free:
	vhost_user_socket_mem_free(vsocket);
out:
	pthread_mutex_unlock(&vhost_user.mutex);

	return ret;
}

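/*
 * Sketch of a non-net backend overriding the builtin defaults set
 * above (the socket path and feature mask are hypothetical):
 *
 *	uint64_t my_scsi_features = ...;	// backend-specific bits
 *
 *	rte_vhost_driver_register("/tmp/vhost-scsi.sock", 0);
 *
 *	// Replaces both supported_features and features, and clears
 *	// use_builtin_virtio_net.
 *	rte_vhost_driver_set_features("/tmp/vhost-scsi.sock",
 *			my_scsi_features);
 */
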
static bool
vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
{
	bool found = false;
	struct vhost_user_reconnect *reconn, *next;

	pthread_mutex_lock(&reconn_list.mutex);

	for (reconn = TAILQ_FIRST(&reconn_list.head);
	     reconn != NULL; reconn = next) {
		next = TAILQ_NEXT(reconn, next);

		if (reconn->vsocket == vsocket) {
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			close(reconn->fd);
			free(reconn);
			found = true;
			break;
		}
	}
	pthread_mutex_unlock(&reconn_list.mutex);
	return found;
}

/**
 * Unregister the specified vhost socket
 */
int
rte_vhost_driver_unregister(const char *path)
{
	int i;
	int count;
	struct vhost_user_connection *conn, *next;

	if (path == NULL)
		return -1;

again:
	pthread_mutex_lock(&vhost_user.mutex);

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
		if (strcmp(vsocket->path, path))
			continue;

		if (vsocket->is_server) {
			/*
			 * If the read/write callback is executing, release
			 * vhost_user's mutex and try again, since the
			 * callback may need to take this same mutex.
			 */
			if (fdset_try_del(&vhost_user.fdset, vsocket->socket_fd) == -1) {
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}
		} else if (vsocket->reconnect) {
			vhost_user_remove_reconnect(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		for (conn = TAILQ_FIRST(&vsocket->conn_list);
			 conn != NULL;
			 conn = next) {
			next = TAILQ_NEXT(conn, next);

			/*
			 * If the read/write callback is executing, release
			 * vsocket's conn_mutex and vhost_user's mutex and
			 * try again, since the callback may need to take
			 * these same locks.
			 */
			if (fdset_try_del(&vhost_user.fdset,
					  conn->connfd) == -1) {
				pthread_mutex_unlock(&vsocket->conn_mutex);
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}

			VHOST_LOG_CONFIG(INFO,
				"free connfd = %d for device '%s'\n",
				conn->connfd, path);
			close(conn->connfd);
			vhost_destroy_device(conn->vid);
			TAILQ_REMOVE(&vsocket->conn_list, conn, next);
			free(conn);
		}
		pthread_mutex_unlock(&vsocket->conn_mutex);

		if (vsocket->is_server) {
			close(vsocket->socket_fd);
			unlink(path);
		}

		pthread_mutex_destroy(&vsocket->conn_mutex);
		vhost_user_socket_mem_free(vsocket);

		count = --vhost_user.vsocket_cnt;
		vhost_user.vsockets[i] = vhost_user.vsockets[count];
		vhost_user.vsockets[count] = NULL;
		pthread_mutex_unlock(&vhost_user.mutex);
		return 0;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return -1;
}

/*
 * Register callback ops so that we can add/remove the device to/from
 * a data core (see the example after this function).
 */
int
rte_vhost_driver_callback_register(const char *path,
	struct rte_vhost_device_ops const * const ops)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->notify_ops = ops;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

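/*
 * Example callback registration (a sketch; the handlers and socket
 * path are hypothetical, and only two of the available hooks are
 * shown):
 *
 *	static int app_new_device(int vid)
 *	{
 *		// start enqueue/dequeue processing for "vid" on a data core
 *		return 0;
 *	}
 *
 *	static void app_destroy_device(int vid)
 *	{
 *		// stop the datapath for "vid"
 *	}
 *
 *	static const struct rte_vhost_device_ops app_ops = {
 *		.new_device = app_new_device,
 *		.destroy_device = app_destroy_device,
 *	};
 *
 *	rte_vhost_driver_callback_register("/tmp/vhost.sock", &app_ops);
 */
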
struct rte_vhost_device_ops const *
vhost_driver_callback_get(const char *path)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? vsocket->notify_ops : NULL;
}

int
rte_vhost_driver_start(const char *path)
{
	struct vhost_user_socket *vsocket;
	static pthread_t fdset_tid;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	if (!vsocket)
		return -1;

	if (fdset_tid == 0) {
		/*
		 * Create a pipe that the poll loop waits on; writing to
		 * it wakes the dispatch thread so that it can rebuild
		 * its poll wait list.
		 */
		if (fdset_pipe_init(&vhost_user.fdset) < 0) {
			VHOST_LOG_CONFIG(ERR,
				"failed to create pipe for vhost fdset\n");
			return -1;
		}

		int ret = rte_ctrl_thread_create(&fdset_tid,
			"vhost-events", NULL, fdset_event_dispatch,
			&vhost_user.fdset);
		if (ret != 0) {
			VHOST_LOG_CONFIG(ERR,
				"failed to create fdset handling thread\n");

			fdset_pipe_uninit(&vhost_user.fdset);
			return -1;
		}
	}

	if (vsocket->is_server)
		return vhost_user_start_server(vsocket);
	else
		return vhost_user_start_client(vsocket);
}
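
/*
 * Putting it together: the usual call order for an application using
 * the builtin net backend (a sketch; the socket path and flags are
 * hypothetical, and app_ops is the example ops struct shown earlier):
 *
 *	const char *sock = "/tmp/vhost.sock";
 *
 *	unlink(sock);					// server mode: no stale file
 *	rte_vhost_driver_register(sock, 0);		// allocate vhost_user_socket
 *	rte_vhost_driver_callback_register(sock, &app_ops);
 *	rte_vhost_driver_start(sock);			// spawn fdset thread, bind/listen
 */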