/* SPDX-License-Identifier: BSD-3-Clause
 * Copyright(c) 2010-2016 Intel Corporation
 */

#include <stdint.h>
#include <stdio.h>
#include <limits.h>
#include <stdlib.h>
#include <unistd.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <sys/queue.h>
#include <errno.h>
#include <fcntl.h>
#include <pthread.h>

#include <rte_function_versioning.h>
#include <rte_log.h>

#include "fd_man.h"
#include "vduse.h"
#include "vhost.h"
#include "vhost_user.h"


TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);

/*
 * Every time rte_vhost_driver_register() is invoked, an associated
 * vhost_user_socket struct will be created.
 */
struct vhost_user_socket {
	struct vhost_user_connection_list conn_list;
	pthread_mutex_t conn_mutex;
	char *path;
	int socket_fd;
	struct sockaddr_un un;
	bool is_server;
	bool is_vduse;
	bool reconnect;
	bool iommu_support;
	bool use_builtin_virtio_net;
	bool extbuf;
	bool linearbuf;
	bool async_copy;
	bool net_compliant_ol_flags;
	bool stats_enabled;

	/*
	 * The "supported_features" field holds the feature bits the
	 * vhost driver supports. The "features" field holds the feature
	 * bits left after rte_vhost_driver_disable_features()/
	 * rte_vhost_driver_enable_features() have been applied; it is
	 * also the final set used for vhost-user feature negotiation.
	 */
	uint64_t supported_features;
	uint64_t features;

	uint64_t protocol_features;

	uint32_t max_queue_pairs;

	struct rte_vdpa_device *vdpa_dev;

	struct rte_vhost_device_ops const *notify_ops;
	struct rte_vhost_device_ops *malloc_notify_ops;
};
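
/*
 * An illustrative registration flow (a sketch only; "my_ops" and the
 * socket path are hypothetical, the rte_vhost_* calls are the public
 * API implemented further down in this file):
 *
 *	rte_vhost_driver_register("/tmp/vhost-net.sock", 0);
 *	rte_vhost_driver_callback_register("/tmp/vhost-net.sock", &my_ops);
 *	rte_vhost_driver_start("/tmp/vhost-net.sock");
 */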

struct vhost_user_connection {
	struct vhost_user_socket *vsocket;
	int connfd;
	int vid;

	TAILQ_ENTRY(vhost_user_connection) next;
};

#define MAX_VHOST_SOCKET 1024
struct vhost_user {
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	struct fdset fdset;
	int vsocket_cnt;
	pthread_mutex_t mutex;
};

#define MAX_VIRTIO_BACKLOG 128

static void vhost_user_server_new_connection(int fd, void *data, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);

static struct vhost_user vhost_user = {
	.fdset = {
		.fd = { [0 ... MAX_FDS - 1] = {-1, NULL, NULL, NULL, 0} },
		.fd_mutex = PTHREAD_MUTEX_INITIALIZER,
		.fd_pooling_mutex = PTHREAD_MUTEX_INITIALIZER,
		.num = 0
	},
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};

/*
 * Return the number of bytes read on success, or a negative value on
 * failure. Update fd_num with the number of fds received.
 */
int
read_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int max_fds,
		int *fd_num)
{
	struct iovec iov;
	struct msghdr msgh;
	char control[CMSG_SPACE(max_fds * sizeof(int))];
	struct cmsghdr *cmsg;
	int got_fds = 0;
	int ret;

	*fd_num = 0;

	memset(&msgh, 0, sizeof(msgh));
	iov.iov_base = buf;
	iov.iov_len  = buflen;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;
	msgh.msg_control = control;
	msgh.msg_controllen = sizeof(control);

	ret = recvmsg(sockfd, &msgh, 0);
	if (ret <= 0) {
		if (ret)
			VHOST_LOG_CONFIG(ifname, ERR, "recvmsg failed on fd %d (%s)\n",
				sockfd, strerror(errno));
		return ret;
	}

	if (msgh.msg_flags & MSG_TRUNC)
		VHOST_LOG_CONFIG(ifname, ERR, "truncated msg (fd %d)\n", sockfd);

	/* MSG_CTRUNC may be caused by LSM misconfiguration */
	if (msgh.msg_flags & MSG_CTRUNC)
		VHOST_LOG_CONFIG(ifname, ERR, "truncated control data (fd %d)\n", sockfd);

	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
		if ((cmsg->cmsg_level == SOL_SOCKET) &&
			(cmsg->cmsg_type == SCM_RIGHTS)) {
			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
			*fd_num = got_fds;
			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
			break;
		}
	}

	/* Clear out unused file descriptors */
	while (got_fds < max_fds)
		fds[got_fds++] = -1;

	return ret;
}

int
send_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int fd_num)
{
	struct iovec iov;
	struct msghdr msgh;
	size_t fdsize = fd_num * sizeof(int);
	char control[CMSG_SPACE(fdsize)];
	struct cmsghdr *cmsg;
	int ret;

	memset(&msgh, 0, sizeof(msgh));
	iov.iov_base = buf;
	iov.iov_len = buflen;

	msgh.msg_iov = &iov;
	msgh.msg_iovlen = 1;

	if (fds && fd_num > 0) {
		msgh.msg_control = control;
		msgh.msg_controllen = sizeof(control);
		cmsg = CMSG_FIRSTHDR(&msgh);
		if (cmsg == NULL) {
			VHOST_LOG_CONFIG(ifname, ERR, "cmsg == NULL\n");
			errno = EINVAL;
			return -1;
		}
		cmsg->cmsg_len = CMSG_LEN(fdsize);
		cmsg->cmsg_level = SOL_SOCKET;
		cmsg->cmsg_type = SCM_RIGHTS;
		memcpy(CMSG_DATA(cmsg), fds, fdsize);
	} else {
		msgh.msg_control = NULL;
		msgh.msg_controllen = 0;
	}

	do {
		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
	} while (ret < 0 && errno == EINTR);

	if (ret < 0) {
		VHOST_LOG_CONFIG(ifname, ERR, "sendmsg error on fd %d (%s)\n",
			sockfd, strerror(errno));
		return ret;
	}

	return ret;
}
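
/*
 * Typical use of the two helpers above (a sketch; it assumes the
 * vhu_msg_context message struct and the VHOST_MEMORY_MAX_NREGIONS fd
 * array size from vhost_user.h):
 *
 *	int fds[VHOST_MEMORY_MAX_NREGIONS];
 *	int nr_fds;
 *
 *	read_fd_message(ifname, sockfd, (char *)&msg, sizeof(msg),
 *			fds, VHOST_MEMORY_MAX_NREGIONS, &nr_fds);
 *	...
 *	send_fd_message(ifname, sockfd, (char *)&msg, sizeof(msg),
 *			fds, nr_fds);
 */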

static void
vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
{
	int vid;
	size_t size;
	struct vhost_user_connection *conn;
	int ret;
	struct virtio_net *dev;

	if (vsocket == NULL)
		return;

	conn = malloc(sizeof(*conn));
	if (conn == NULL) {
		close(fd);
		return;
	}

	vid = vhost_user_new_device();
	if (vid == -1)
		goto err;

	size = strnlen(vsocket->path, PATH_MAX);
	vhost_set_ifname(vid, vsocket->path, size);

	vhost_setup_virtio_net(vid, vsocket->use_builtin_virtio_net,
		vsocket->net_compliant_ol_flags, vsocket->stats_enabled,
		vsocket->iommu_support);

	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev);

	if (vsocket->extbuf)
		vhost_enable_extbuf(vid);

	if (vsocket->linearbuf)
		vhost_enable_linearbuf(vid);

	if (vsocket->async_copy) {
		dev = get_device(vid);

		if (dev)
			dev->async_copy = 1;
	}

	VHOST_LOG_CONFIG(vsocket->path, INFO, "new device, handle is %d\n", vid);

	if (vsocket->notify_ops->new_connection) {
		ret = vsocket->notify_ops->new_connection(vid);
		if (ret < 0) {
			VHOST_LOG_CONFIG(vsocket->path, ERR,
				"failed to add vhost user connection with fd %d\n",
				fd);
			goto err_cleanup;
		}
	}

	conn->connfd = fd;
	conn->vsocket = vsocket;
	conn->vid = vid;
	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_read_cb,
			NULL, conn);
	if (ret < 0) {
		VHOST_LOG_CONFIG(vsocket->path, ERR,
			"failed to add fd %d into vhost server fdset\n",
			fd);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		goto err_cleanup;
	}

	pthread_mutex_lock(&vsocket->conn_mutex);
	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
	pthread_mutex_unlock(&vsocket->conn_mutex);

	fdset_pipe_notify(&vhost_user.fdset);
	return;

err_cleanup:
	vhost_destroy_device(vid);
err:
	free(conn);
	close(fd);
}

/* Callback invoked when there is a new vhost-user connection from a client. */
static void
vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
{
	struct vhost_user_socket *vsocket = dat;

	fd = accept(fd, NULL, NULL);
	if (fd < 0)
		return;

	VHOST_LOG_CONFIG(vsocket->path, INFO, "new vhost user connection is %d\n", fd);
	vhost_user_add_connection(fd, vsocket);
}

static void
vhost_user_read_cb(int connfd, void *dat, int *remove)
{
	struct vhost_user_connection *conn = dat;
	struct vhost_user_socket *vsocket = conn->vsocket;
	int ret;

	ret = vhost_user_msg_handler(conn->vid, connfd);
	if (ret < 0) {
		struct virtio_net *dev = get_device(conn->vid);

		close(connfd);
		*remove = 1;

		if (dev)
			vhost_destroy_device_notify(dev);

		if (vsocket->notify_ops->destroy_connection)
			vsocket->notify_ops->destroy_connection(conn->vid);

		vhost_destroy_device(conn->vid);

		if (vsocket->reconnect) {
			create_unix_socket(vsocket);
			vhost_user_start_client(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
		pthread_mutex_unlock(&vsocket->conn_mutex);

		free(conn);
	}
}

static int
create_unix_socket(struct vhost_user_socket *vsocket)
{
	int fd;
	struct sockaddr_un *un = &vsocket->un;

	fd = socket(AF_UNIX, SOCK_STREAM, 0);
	if (fd < 0)
		return -1;
	VHOST_LOG_CONFIG(vsocket->path, INFO, "vhost-user %s: socket created, fd: %d\n",
		vsocket->is_server ? "server" : "client", fd);

	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
		VHOST_LOG_CONFIG(vsocket->path, ERR,
			"vhost-user: can't set nonblocking mode for socket, fd: %d (%s)\n",
			fd, strerror(errno));
		close(fd);
		return -1;
	}

	memset(un, 0, sizeof(*un));
	un->sun_family = AF_UNIX;
	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
	un->sun_path[sizeof(un->sun_path) - 1] = '\0';

	vsocket->socket_fd = fd;
	return 0;
}

static int
vhost_user_start_server(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;

	/*
	 * bind() may fail if a socket file with the same name already
	 * exists. But the library obviously should not delete a file
	 * provided by the user, since we cannot be sure it is not
	 * being used by other applications. Moreover, many applications form
	 * socket names based on user input, which is prone to errors.
	 *
	 * The user must ensure that the socket does not exist before
	 * registering the vhost driver in server mode.
	 */
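	/*
	 * In other words, an application that owns the path is expected
	 * to clean up before registering (a sketch, assuming the stale
	 * file really belongs to it):
	 *
	 *	unlink(path);
	 *	rte_vhost_driver_register(path, 0);
	 */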
	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
	if (ret < 0) {
		VHOST_LOG_CONFIG(path, ERR, "failed to bind: %s; remove it and try again\n",
			strerror(errno));
		goto err;
	}
	VHOST_LOG_CONFIG(path, INFO, "binding succeeded\n");

	ret = listen(fd, MAX_VIRTIO_BACKLOG);
	if (ret < 0)
		goto err;

	ret = fdset_add(&vhost_user.fdset, fd, vhost_user_server_new_connection,
		  NULL, vsocket);
	if (ret < 0) {
		VHOST_LOG_CONFIG(path, ERR, "failed to add listen fd %d to vhost server fdset\n",
			fd);
		goto err;
	}

	return 0;

err:
	close(fd);
	return -1;
}

struct vhost_user_reconnect {
	struct sockaddr_un un;
	int fd;
	struct vhost_user_socket *vsocket;

	TAILQ_ENTRY(vhost_user_reconnect) next;
};

TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;
};

static struct vhost_user_reconnect_list reconn_list;
static pthread_t reconn_tid;

static int
vhost_user_connect_nonblock(char *path, int fd, struct sockaddr *un, size_t sz)
{
	int ret, flags;

	ret = connect(fd, un, sz);
	if (ret < 0 && errno != EISCONN)
		return -1;

	flags = fcntl(fd, F_GETFL, 0);
	if (flags < 0) {
		VHOST_LOG_CONFIG(path, ERR, "can't get flags for connfd %d (%s)\n",
			fd, strerror(errno));
		return -2;
	}
	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
		VHOST_LOG_CONFIG(path, ERR, "can't disable nonblocking on fd %d\n", fd);
		return -2;
	}
	return 0;
}

static void *
vhost_user_client_reconnect(void *arg __rte_unused)
{
	int ret;
	struct vhost_user_reconnect *reconn, *next;

	while (1) {
		pthread_mutex_lock(&reconn_list.mutex);

		/*
		 * An equivalent implementation of TAILQ_FOREACH_SAFE,
		 * which does not exist on all platforms.
		 */
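		/*
		 * Where the BSD macro is available, this loop would read
		 * (a sketch):
		 *
		 *	TAILQ_FOREACH_SAFE(reconn, &reconn_list.head, next, next)
		 */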
		for (reconn = TAILQ_FIRST(&reconn_list.head);
		     reconn != NULL; reconn = next) {
			next = TAILQ_NEXT(reconn, next);

			ret = vhost_user_connect_nonblock(reconn->vsocket->path, reconn->fd,
						(struct sockaddr *)&reconn->un,
						sizeof(reconn->un));
			if (ret == -2) {
				close(reconn->fd);
				VHOST_LOG_CONFIG(reconn->vsocket->path, ERR,
					"reconnection for fd %d failed\n",
					reconn->fd);
				goto remove_fd;
			}
			if (ret == -1)
				continue;

			VHOST_LOG_CONFIG(reconn->vsocket->path, INFO, "connected\n");
			vhost_user_add_connection(reconn->fd, reconn->vsocket);
remove_fd:
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			free(reconn);
		}

		pthread_mutex_unlock(&reconn_list.mutex);
		sleep(1);
	}

	return NULL;
}

static int
vhost_user_reconnect_init(void)
{
	int ret;

	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
	if (ret < 0) {
		VHOST_LOG_CONFIG("thread", ERR, "%s: failed to initialize mutex\n", __func__);
		return ret;
	}
	TAILQ_INIT(&reconn_list.head);

	ret = rte_ctrl_thread_create(&reconn_tid, "vhost_reconn", NULL,
			     vhost_user_client_reconnect, NULL);
	if (ret != 0) {
		VHOST_LOG_CONFIG("thread", ERR, "failed to create reconnect thread\n");
		if (pthread_mutex_destroy(&reconn_list.mutex))
			VHOST_LOG_CONFIG("thread", ERR,
				"%s: failed to destroy reconnect mutex\n",
				__func__);
	}

	return ret;
}

static int
vhost_user_start_client(struct vhost_user_socket *vsocket)
{
	int ret;
	int fd = vsocket->socket_fd;
	const char *path = vsocket->path;
	struct vhost_user_reconnect *reconn;

	ret = vhost_user_connect_nonblock(vsocket->path, fd, (struct sockaddr *)&vsocket->un,
					  sizeof(vsocket->un));
	if (ret == 0) {
		vhost_user_add_connection(fd, vsocket);
		return 0;
	}

	VHOST_LOG_CONFIG(path, WARNING, "failed to connect: %s\n", strerror(errno));

	if (ret == -2 || !vsocket->reconnect) {
		close(fd);
		return -1;
	}

	VHOST_LOG_CONFIG(path, INFO, "reconnecting...\n");
	reconn = malloc(sizeof(*reconn));
	if (reconn == NULL) {
		VHOST_LOG_CONFIG(path, ERR, "failed to allocate memory for reconnect\n");
		close(fd);
		return -1;
	}
	reconn->un = vsocket->un;
	reconn->fd = fd;
	reconn->vsocket = vsocket;
	pthread_mutex_lock(&reconn_list.mutex);
	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
	pthread_mutex_unlock(&reconn_list.mutex);

	return 0;
}

static struct vhost_user_socket *
find_vhost_user_socket(const char *path)
{
	int i;

	if (path == NULL)
		return NULL;

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];

		if (!strcmp(vsocket->path, path))
			return vsocket;
	}

	return NULL;
}

int
rte_vhost_driver_attach_vdpa_device(const char *path,
		struct rte_vdpa_device *dev)
{
	struct vhost_user_socket *vsocket;

	if (dev == NULL || path == NULL)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->vdpa_dev = dev;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_detach_vdpa_device(const char *path)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->vdpa_dev = NULL;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

struct rte_vdpa_device *
rte_vhost_driver_get_vdpa_device(const char *path)
{
	struct vhost_user_socket *vsocket;
	struct rte_vdpa_device *dev = NULL;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		dev = vsocket->vdpa_dev;
	pthread_mutex_unlock(&vhost_user.mutex);

	return dev;
}

int
rte_vhost_driver_get_vdpa_dev_type(const char *path, uint32_t *type)
{
	struct vhost_user_socket *vsocket;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		ret = -1;
		goto unlock_exit;
	}

	*type = vdpa_dev->type;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_disable_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);

	/* Note that use_builtin_virtio_net is not affected by this function
	 * since callers may want to selectively disable features of the
	 * built-in vhost net device backend.
	 */

	if (vsocket)
		vsocket->features &= ~features;
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}
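
/*
 * For example, a backend that cannot handle guest TSO could drop it
 * before starting the driver (a sketch; VIRTIO_NET_F_GUEST_TSO4/6 are
 * the standard virtio-net feature bits):
 *
 *	rte_vhost_driver_disable_features(path,
 *			(1ULL << VIRTIO_NET_F_GUEST_TSO4) |
 *			(1ULL << VIRTIO_NET_F_GUEST_TSO6));
 *
 * Only "features" is cleared; "supported_features" keeps the full set.
 */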

int
rte_vhost_driver_enable_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		if ((vsocket->supported_features & features) != features) {
			/*
			 * trying to enable features the driver doesn't
			 * support.
			 */
			pthread_mutex_unlock(&vhost_user.mutex);
			return -1;
		}
		vsocket->features |= features;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_set_features(const char *path, uint64_t features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		vsocket->supported_features = features;
		vsocket->features = features;

		/* Anyone setting feature bits is implementing their own vhost
		 * device backend.
		 */
		vsocket->use_builtin_virtio_net = false;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_get_features(const char *path, uint64_t *features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*features = vsocket->features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_features(vdpa_dev, &vdpa_features) < 0) {
		VHOST_LOG_CONFIG(path, ERR, "failed to get vdpa features for socket file.\n");
		ret = -1;
		goto unlock_exit;
	}

	*features = vsocket->features & vdpa_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_set_protocol_features(const char *path,
		uint64_t protocol_features)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket)
		vsocket->protocol_features = protocol_features;
	pthread_mutex_unlock(&vhost_user.mutex);
	return vsocket ? 0 : -1;
}

int
rte_vhost_driver_get_protocol_features(const char *path,
		uint64_t *protocol_features)
{
	struct vhost_user_socket *vsocket;
	uint64_t vdpa_protocol_features;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*protocol_features = vsocket->protocol_features;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_protocol_features(vdpa_dev,
				&vdpa_protocol_features) < 0) {
		VHOST_LOG_CONFIG(path, ERR, "failed to get vdpa protocol features.\n");
		ret = -1;
		goto unlock_exit;
	}

	*protocol_features = vsocket->protocol_features
		& vdpa_protocol_features;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
{
	struct vhost_user_socket *vsocket;
	uint32_t vdpa_queue_num;
	struct rte_vdpa_device *vdpa_dev;
	int ret = 0;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
		ret = -1;
		goto unlock_exit;
	}

	vdpa_dev = vsocket->vdpa_dev;
	if (!vdpa_dev) {
		*queue_num = vsocket->max_queue_pairs;
		goto unlock_exit;
	}

	if (vdpa_dev->ops->get_queue_num(vdpa_dev, &vdpa_queue_num) < 0) {
		VHOST_LOG_CONFIG(path, ERR, "failed to get vdpa queue number.\n");
		ret = -1;
		goto unlock_exit;
	}

	*queue_num = RTE_MIN(vsocket->max_queue_pairs, vdpa_queue_num);

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

int
rte_vhost_driver_set_max_queue_num(const char *path, uint32_t max_queue_pairs)
{
	struct vhost_user_socket *vsocket;
	int ret = 0;

	VHOST_LOG_CONFIG(path, INFO, "Setting max queue pairs to %u\n", max_queue_pairs);

	if (max_queue_pairs > VHOST_MAX_QUEUE_PAIRS) {
		VHOST_LOG_CONFIG(path, ERR, "Library only supports up to %u queue pairs\n",
				VHOST_MAX_QUEUE_PAIRS);
		return -1;
	}

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (!vsocket) {
		VHOST_LOG_CONFIG(path, ERR, "socket file is not registered yet.\n");
		ret = -1;
		goto unlock_exit;
	}

	vsocket->max_queue_pairs = max_queue_pairs;

unlock_exit:
	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;
}

static void
vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
{
	if (vsocket == NULL)
		return;

	free(vsocket->path);
	free(vsocket->malloc_notify_ops);
	free(vsocket);
}

/*
 * Register a new vhost-user socket; here we act as server
 * (the default case), or client (when the RTE_VHOST_USER_CLIENT
 * flag is set).
 */
int
rte_vhost_driver_register(const char *path, uint64_t flags)
{
	int ret = -1;
	struct vhost_user_socket *vsocket;

	if (!path)
		return -1;

	pthread_mutex_lock(&vhost_user.mutex);

	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
		VHOST_LOG_CONFIG(path, ERR, "the number of vhost sockets reaches maximum\n");
		goto out;
	}

	vsocket = malloc(sizeof(struct vhost_user_socket));
	if (!vsocket)
		goto out;
	memset(vsocket, 0, sizeof(struct vhost_user_socket));
	vsocket->path = strdup(path);
	if (vsocket->path == NULL) {
		VHOST_LOG_CONFIG(path, ERR, "failed to copy socket path string\n");
		vhost_user_socket_mem_free(vsocket);
		goto out;
	}
	TAILQ_INIT(&vsocket->conn_list);
	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
	if (ret) {
		VHOST_LOG_CONFIG(path, ERR, "failed to init connection mutex\n");
		goto out_free;
	}

	if (!strncmp("/dev/vduse/", path, strlen("/dev/vduse/")))
		vsocket->is_vduse = true;

	vsocket->vdpa_dev = NULL;
	vsocket->max_queue_pairs = VHOST_MAX_QUEUE_PAIRS;
	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
	vsocket->stats_enabled = flags & RTE_VHOST_USER_NET_STATS_ENABLE;
	if (vsocket->is_vduse)
		vsocket->iommu_support = true;
	else
		vsocket->iommu_support = flags & RTE_VHOST_USER_IOMMU_SUPPORT;

	if (vsocket->async_copy && (vsocket->iommu_support ||
				(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
		VHOST_LOG_CONFIG(path, ERR, "async copy with IOMMU or post-copy not supported\n");
		goto out_mutex;
	}

	/*
	 * Set the supported features correctly for the builtin vhost-user
	 * net driver.
	 *
	 * Applications know nothing about the features the builtin virtio
	 * net driver (virtio_net.c) supports, thus it's not possible for
	 * them to invoke rte_vhost_driver_set_features(). To work around
	 * that, we set it unconditionally here. If the application wants
	 * to implement another vhost-user backend (say, SCSI), it should
	 * call rte_vhost_driver_set_features(), which will overwrite the
	 * following two values.
	 */
	vsocket->use_builtin_virtio_net = true;
	if (vsocket->is_vduse) {
		vsocket->supported_features = VDUSE_NET_SUPPORTED_FEATURES;
		vsocket->features           = VDUSE_NET_SUPPORTED_FEATURES;
	} else {
		vsocket->supported_features = VHOST_USER_NET_SUPPORTED_FEATURES;
		vsocket->features           = VHOST_USER_NET_SUPPORTED_FEATURES;
		vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
	}
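
	/*
	 * E.g. a non-net backend would override these right after
	 * registration (a sketch; MY_SCSI_FEATURES is hypothetical):
	 *
	 *	rte_vhost_driver_register(path, 0);
	 *	rte_vhost_driver_set_features(path, MY_SCSI_FEATURES);
	 */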

	if (vsocket->async_copy) {
		vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
		vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
		VHOST_LOG_CONFIG(path, INFO, "logging feature is disabled in async copy mode\n");
	}

	/*
	 * In linear-buffer mode without external buffers, we cannot
	 * receive a buffer from the guest if it does not fit in a single
	 * mbuf, which is likely when segmentation offloading is enabled.
	 */
	if (vsocket->linearbuf && !vsocket->extbuf) {
		uint64_t seg_offload_features =
				(1ULL << VIRTIO_NET_F_HOST_TSO4) |
				(1ULL << VIRTIO_NET_F_HOST_TSO6) |
				(1ULL << VIRTIO_NET_F_HOST_UFO);

		VHOST_LOG_CONFIG(path, INFO, "Linear buffers requested without external buffers,\n");
		VHOST_LOG_CONFIG(path, INFO, "disabling host segmentation offloading support\n");
		vsocket->supported_features &= ~seg_offload_features;
		vsocket->features &= ~seg_offload_features;
	}

	if (!vsocket->iommu_support) {
		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
	}

	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
		vsocket->protocol_features &=
			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
	} else {
#ifndef RTE_LIBRTE_VHOST_POSTCOPY
		VHOST_LOG_CONFIG(path, ERR, "Postcopy requested but not compiled\n");
		ret = -1;
		goto out_mutex;
#endif
	}

	if (!vsocket->is_vduse) {
		if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
			vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
			if (vsocket->reconnect && reconn_tid == 0) {
				if (vhost_user_reconnect_init() != 0)
					goto out_mutex;
			}
		} else {
			vsocket->is_server = true;
		}
		ret = create_unix_socket(vsocket);
		if (ret < 0)
			goto out_mutex;
	}

	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;

	pthread_mutex_unlock(&vhost_user.mutex);
	return ret;

out_mutex:
	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
		VHOST_LOG_CONFIG(path, ERR, "failed to destroy connection mutex\n");
	}
out_free:
	vhost_user_socket_mem_free(vsocket);
out:
	pthread_mutex_unlock(&vhost_user.mutex);

	return ret;
}

static bool
vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
{
	bool found = false;
	struct vhost_user_reconnect *reconn, *next;

	pthread_mutex_lock(&reconn_list.mutex);

	for (reconn = TAILQ_FIRST(&reconn_list.head);
	     reconn != NULL; reconn = next) {
		next = TAILQ_NEXT(reconn, next);

		if (reconn->vsocket == vsocket) {
			TAILQ_REMOVE(&reconn_list.head, reconn, next);
			close(reconn->fd);
			free(reconn);
			found = true;
			break;
		}
	}
	pthread_mutex_unlock(&reconn_list.mutex);
	return found;
}

/**
 * Unregister the specified vhost socket
 */
int
rte_vhost_driver_unregister(const char *path)
{
	int i;
	int count;
	struct vhost_user_connection *conn, *next;

	if (path == NULL)
		return -1;

again:
	pthread_mutex_lock(&vhost_user.mutex);

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
		if (strcmp(vsocket->path, path))
			continue;

		if (vsocket->is_vduse) {
			vduse_device_destroy(path);
		} else if (vsocket->is_server) {
			/*
			 * If the read/write callback is executing, release
			 * vhost_user's mutex and try again, since the
			 * callback may take the same mutex.
			 */
			if (fdset_try_del(&vhost_user.fdset, vsocket->socket_fd) == -1) {
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}
		} else if (vsocket->reconnect) {
			vhost_user_remove_reconnect(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		for (conn = TAILQ_FIRST(&vsocket->conn_list);
			 conn != NULL;
			 conn = next) {
			next = TAILQ_NEXT(conn, next);

			/*
			 * If the read/write callback is executing, release
			 * vsocket's conn_mutex and vhost_user's mutex and
			 * try again, since the callback may take these
			 * same locks.
			 */
			if (fdset_try_del(&vhost_user.fdset,
					  conn->connfd) == -1) {
				pthread_mutex_unlock(&vsocket->conn_mutex);
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}

			VHOST_LOG_CONFIG(path, INFO, "free connfd %d\n", conn->connfd);
			close(conn->connfd);
			vhost_destroy_device(conn->vid);
			TAILQ_REMOVE(&vsocket->conn_list, conn, next);
			free(conn);
		}
		pthread_mutex_unlock(&vsocket->conn_mutex);

		if (vsocket->is_server) {
			close(vsocket->socket_fd);
			unlink(path);
		}

		pthread_mutex_destroy(&vsocket->conn_mutex);
		vhost_user_socket_mem_free(vsocket);

		count = --vhost_user.vsocket_cnt;
		vhost_user.vsockets[i] = vhost_user.vsockets[count];
		vhost_user.vsockets[count] = NULL;
		pthread_mutex_unlock(&vhost_user.mutex);
		return 0;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return -1;
}

/*
 * Register ops so that we can add/remove devices to/from the data core.
 */
static int
vhost_driver_callback_register(const char *path,
	struct rte_vhost_device_ops const * const ops,
	struct rte_vhost_device_ops *malloc_ops)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	if (vsocket) {
		vsocket->notify_ops = ops;
		free(vsocket->malloc_notify_ops);
		vsocket->malloc_notify_ops = malloc_ops;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? 0 : -1;
}

int __vsym
rte_vhost_driver_callback_register_v24(const char *path,
	struct rte_vhost_device_ops const * const ops)
{
	return vhost_driver_callback_register(path, ops, NULL);
}

int __vsym
rte_vhost_driver_callback_register_v23(const char *path,
	struct rte_vhost_device_ops const * const ops)
{
	int ret;

	/*
	 * Although the ops structure is const, we do need to override the
	 * guest_notify operation. This is because with the previous APIs
	 * that field was "reserved", and if any garbage value was passed
	 * in it, it could crash the application.
	 */
	if (ops && !ops->guest_notify) {
		struct rte_vhost_device_ops *new_ops;

		new_ops = malloc(sizeof(*new_ops));
		if (new_ops == NULL)
			return -1;

		memcpy(new_ops, ops, sizeof(*new_ops));
		new_ops->guest_notify = NULL;

		ret = vhost_driver_callback_register(path, new_ops, new_ops);
	} else {
		ret = vhost_driver_callback_register(path, ops, NULL);
	}

	return ret;
}

/* Mark the v23 function as the old version, and v24 as the default version. */
VERSION_SYMBOL(rte_vhost_driver_callback_register, _v23, 23);
BIND_DEFAULT_SYMBOL(rte_vhost_driver_callback_register, _v24, 24);
MAP_STATIC_SYMBOL(int rte_vhost_driver_callback_register(const char *path,
		struct rte_vhost_device_ops const * const ops),
		rte_vhost_driver_callback_register_v24);

struct rte_vhost_device_ops const *
vhost_driver_callback_get(const char *path)
{
	struct vhost_user_socket *vsocket;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	return vsocket ? vsocket->notify_ops : NULL;
}

int
rte_vhost_driver_start(const char *path)
{
	struct vhost_user_socket *vsocket;
	static pthread_t fdset_tid;

	pthread_mutex_lock(&vhost_user.mutex);
	vsocket = find_vhost_user_socket(path);
	pthread_mutex_unlock(&vhost_user.mutex);

	if (!vsocket)
		return -1;

	if (vsocket->is_vduse)
		return vduse_device_create(path, vsocket->net_compliant_ol_flags);

	if (fdset_tid == 0) {
		/*
		 * Create a pipe that the poll loop waits on; a write to it
		 * wakes poll so the wait list can be rebuilt.
		 */
		if (fdset_pipe_init(&vhost_user.fdset) < 0) {
			VHOST_LOG_CONFIG(path, ERR, "failed to create pipe for vhost fdset\n");
			return -1;
		}

		int ret = rte_ctrl_thread_create(&fdset_tid,
			"vhost-events", NULL, fdset_event_dispatch,
			&vhost_user.fdset);
		if (ret != 0) {
			VHOST_LOG_CONFIG(path, ERR, "failed to create fdset handling thread\n");
			fdset_pipe_uninit(&vhost_user.fdset);
			return -1;
		}
	}

	if (vsocket->is_server)
		return vhost_user_start_server(vsocket);
	else
		return vhost_user_start_client(vsocket);
}