xref: /dpdk/lib/vhost/socket.c (revision 8484d74bd656bc0e951a3ed4e0816ee0fea5e593)
1 /* SPDX-License-Identifier: BSD-3-Clause
2  * Copyright(c) 2010-2016 Intel Corporation
3  */
4 
5 #include <stdint.h>
6 #include <stdio.h>
7 #include <limits.h>
8 #include <stdlib.h>
9 #include <unistd.h>
10 #include <string.h>
11 #include <sys/socket.h>
12 #include <sys/un.h>
13 #include <sys/queue.h>
14 #include <errno.h>
15 #include <fcntl.h>
16 
17 #include <rte_thread.h>
18 #include <rte_log.h>
19 
20 #include "fd_man.h"
21 #include "vduse.h"
22 #include "vhost.h"
23 #include "vhost_user.h"
24 
25 
/* List head type for the connections that belong to one vhost-user socket. */
TAILQ_HEAD(vhost_user_connection_list, vhost_user_connection);

/*
 * Every time rte_vhost_driver_register() is invoked, an associated
 * vhost_user_socket struct will be created.
 */
struct vhost_user_socket {
	/* Live connections; modified with conn_mutex held. */
	struct vhost_user_connection_list conn_list;
	pthread_mutex_t conn_mutex;
	/* Heap copy (strdup) of the path given at registration. */
	char *path;
	/* Descriptor created by create_unix_socket(). */
	int socket_fd;
	/* AF_UNIX address built from path by create_unix_socket(). */
	struct sockaddr_un un;
	/* True when RTE_VHOST_USER_CLIENT was not requested at registration. */
	bool is_server;
	/* True when path starts with "/dev/vduse/". */
	bool is_vduse;
	/* Client mode only: set unless RTE_VHOST_USER_NO_RECONNECT was given. */
	bool reconnect;
	/* From RTE_VHOST_USER_IOMMU_SUPPORT; forced to true for VDUSE. */
	bool iommu_support;
	/* Set at registration, cleared by rte_vhost_driver_set_features(). */
	bool use_builtin_virtio_net;
	/* The following flags mirror the registration flags of the same name. */
	bool extbuf;
	bool linearbuf;
	bool async_copy;
	bool net_compliant_ol_flags;
	bool stats_enabled;
	bool async_connect;

	/*
	 * The "supported_features" indicates the feature bits the
	 * vhost driver supports. The "features" indicates the feature
	 * bits after the rte_vhost_driver_features_disable/enable().
	 * It is also the final feature bits used for vhost-user
	 * features negotiation.
	 */
	uint64_t supported_features;
	uint64_t features;

	uint64_t protocol_features;

	/* Defaults to VHOST_MAX_QUEUE_PAIRS; only changed for VDUSE sockets. */
	uint32_t max_queue_pairs;

	/* Attached vDPA device, or NULL when none is attached. */
	struct rte_vdpa_device *vdpa_dev;

	/* Application callbacks set by rte_vhost_driver_callback_register(). */
	struct rte_vhost_device_ops const *notify_ops;
};
68 
/* One established connection on a vhost-user socket. */
struct vhost_user_connection {
	/* Socket this connection belongs to. */
	struct vhost_user_socket *vsocket;
	/* Connected descriptor registered in the fdset. */
	int connfd;
	/* Device id returned by vhost_user_new_device() for this connection. */
	int vid;

	/* Linkage in vsocket->conn_list. */
	TAILQ_ENTRY(vhost_user_connection) next;
};
76 
/* Upper bound on the number of sockets that can be registered at once. */
#define MAX_VHOST_SOCKET 1024
/* Registry of all registered vhost-user sockets. */
struct vhost_user {
	/* Entries [0, vsocket_cnt) are valid; kept densely packed. */
	struct vhost_user_socket *vsockets[MAX_VHOST_SOCKET];
	/* Event dispatcher, created on the first rte_vhost_driver_start(). */
	struct fdset *fdset;
	int vsocket_cnt;
	/* Held by the public API while looking up or editing the registry. */
	pthread_mutex_t mutex;
};
84 
/* Backlog passed to listen() for server-mode sockets. */
#define MAX_VIRTIO_BACKLOG 128

/* Forward declarations for functions used before their definition. */
static void vhost_user_server_new_connection(int fd, void *data, int *remove);
static void vhost_user_read_cb(int fd, void *dat, int *remove);
static int create_unix_socket(struct vhost_user_socket *vsocket);
static int vhost_user_start_client(struct vhost_user_socket *vsocket);

/* The single registry instance; fdset starts out NULL. */
static struct vhost_user vhost_user = {
	.vsocket_cnt = 0,
	.mutex = PTHREAD_MUTEX_INITIALIZER,
};
96 
97 /*
98  * return bytes# of read on success or negative val on failure. Update fdnum
99  * with number of fds read.
100  */
101 int
102 read_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int max_fds,
103 		int *fd_num)
104 {
105 	struct iovec iov;
106 	struct msghdr msgh;
107 	char control[CMSG_SPACE(max_fds * sizeof(int))];
108 	struct cmsghdr *cmsg;
109 	int got_fds = 0;
110 	int ret;
111 
112 	*fd_num = 0;
113 
114 	memset(&msgh, 0, sizeof(msgh));
115 	iov.iov_base = buf;
116 	iov.iov_len  = buflen;
117 
118 	msgh.msg_iov = &iov;
119 	msgh.msg_iovlen = 1;
120 	msgh.msg_control = control;
121 	msgh.msg_controllen = sizeof(control);
122 
123 	ret = recvmsg(sockfd, &msgh, 0);
124 	if (ret <= 0) {
125 		if (ret)
126 			VHOST_CONFIG_LOG(ifname, ERR, "recvmsg failed on fd %d (%s)",
127 				sockfd, strerror(errno));
128 		return ret;
129 	}
130 
131 	if (msgh.msg_flags & MSG_TRUNC)
132 		VHOST_CONFIG_LOG(ifname, ERR, "truncated msg (fd %d)", sockfd);
133 
134 	/* MSG_CTRUNC may be caused by LSM misconfiguration */
135 	if (msgh.msg_flags & MSG_CTRUNC)
136 		VHOST_CONFIG_LOG(ifname, ERR, "truncated control data (fd %d)", sockfd);
137 
138 	for (cmsg = CMSG_FIRSTHDR(&msgh); cmsg != NULL;
139 		cmsg = CMSG_NXTHDR(&msgh, cmsg)) {
140 		if ((cmsg->cmsg_level == SOL_SOCKET) &&
141 			(cmsg->cmsg_type == SCM_RIGHTS)) {
142 			got_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
143 			*fd_num = got_fds;
144 			memcpy(fds, CMSG_DATA(cmsg), got_fds * sizeof(int));
145 			break;
146 		}
147 	}
148 
149 	/* Clear out unused file descriptors */
150 	while (got_fds < max_fds)
151 		fds[got_fds++] = -1;
152 
153 	return ret;
154 }
155 
156 int
157 send_fd_message(char *ifname, int sockfd, char *buf, int buflen, int *fds, int fd_num)
158 {
159 
160 	struct iovec iov;
161 	struct msghdr msgh;
162 	size_t fdsize = fd_num * sizeof(int);
163 	char control[CMSG_SPACE(fdsize)];
164 	struct cmsghdr *cmsg;
165 	int ret;
166 
167 	memset(&msgh, 0, sizeof(msgh));
168 	iov.iov_base = buf;
169 	iov.iov_len = buflen;
170 
171 	msgh.msg_iov = &iov;
172 	msgh.msg_iovlen = 1;
173 
174 	if (fds && fd_num > 0) {
175 		msgh.msg_control = control;
176 		msgh.msg_controllen = sizeof(control);
177 		cmsg = CMSG_FIRSTHDR(&msgh);
178 		if (cmsg == NULL) {
179 			VHOST_CONFIG_LOG(ifname, ERR, "cmsg == NULL");
180 			errno = EINVAL;
181 			return -1;
182 		}
183 		cmsg->cmsg_len = CMSG_LEN(fdsize);
184 		cmsg->cmsg_level = SOL_SOCKET;
185 		cmsg->cmsg_type = SCM_RIGHTS;
186 		memcpy(CMSG_DATA(cmsg), fds, fdsize);
187 	} else {
188 		msgh.msg_control = NULL;
189 		msgh.msg_controllen = 0;
190 	}
191 
192 	do {
193 		ret = sendmsg(sockfd, &msgh, MSG_NOSIGNAL);
194 	} while (ret < 0 && errno == EINTR);
195 
196 	if (ret < 0) {
197 		VHOST_CONFIG_LOG(ifname, ERR, "sendmsg error on fd %d (%s)",
198 			sockfd, strerror(errno));
199 		return ret;
200 	}
201 
202 	return ret;
203 }
204 
205 static void
206 vhost_user_add_connection(int fd, struct vhost_user_socket *vsocket)
207 {
208 	int vid;
209 	size_t size;
210 	struct vhost_user_connection *conn;
211 	int ret;
212 	struct virtio_net *dev;
213 
214 	if (vsocket == NULL)
215 		return;
216 
217 	conn = malloc(sizeof(*conn));
218 	if (conn == NULL) {
219 		close(fd);
220 		return;
221 	}
222 
223 	vid = vhost_user_new_device();
224 	if (vid == -1) {
225 		goto err;
226 	}
227 
228 	size = strnlen(vsocket->path, PATH_MAX);
229 	vhost_set_ifname(vid, vsocket->path, size);
230 
231 	vhost_setup_virtio_net(vid, vsocket->use_builtin_virtio_net,
232 		vsocket->net_compliant_ol_flags, vsocket->stats_enabled,
233 		vsocket->iommu_support);
234 
235 	vhost_attach_vdpa_device(vid, vsocket->vdpa_dev);
236 
237 	if (vsocket->extbuf)
238 		vhost_enable_extbuf(vid);
239 
240 	if (vsocket->linearbuf)
241 		vhost_enable_linearbuf(vid);
242 
243 	if (vsocket->async_copy) {
244 		dev = get_device(vid);
245 
246 		if (dev)
247 			dev->async_copy = 1;
248 	}
249 
250 	VHOST_CONFIG_LOG(vsocket->path, INFO, "new device, handle is %d", vid);
251 
252 	if (vsocket->notify_ops->new_connection) {
253 		ret = vsocket->notify_ops->new_connection(vid);
254 		if (ret < 0) {
255 			VHOST_CONFIG_LOG(vsocket->path, ERR,
256 				"failed to add vhost user connection with fd %d",
257 				fd);
258 			goto err_cleanup;
259 		}
260 	}
261 
262 	conn->connfd = fd;
263 	conn->vsocket = vsocket;
264 	conn->vid = vid;
265 	ret = fdset_add(vhost_user.fdset, fd, vhost_user_read_cb,
266 			NULL, conn);
267 	if (ret < 0) {
268 		VHOST_CONFIG_LOG(vsocket->path, ERR,
269 			"failed to add fd %d into vhost server fdset",
270 			fd);
271 
272 		if (vsocket->notify_ops->destroy_connection)
273 			vsocket->notify_ops->destroy_connection(conn->vid);
274 
275 		goto err_cleanup;
276 	}
277 
278 	pthread_mutex_lock(&vsocket->conn_mutex);
279 	TAILQ_INSERT_TAIL(&vsocket->conn_list, conn, next);
280 	pthread_mutex_unlock(&vsocket->conn_mutex);
281 
282 	return;
283 
284 err_cleanup:
285 	vhost_destroy_device(vid);
286 err:
287 	free(conn);
288 	close(fd);
289 }
290 
291 /* call back when there is new vhost-user connection from client  */
292 static void
293 vhost_user_server_new_connection(int fd, void *dat, int *remove __rte_unused)
294 {
295 	struct vhost_user_socket *vsocket = dat;
296 
297 	fd = accept(fd, NULL, NULL);
298 	if (fd < 0)
299 		return;
300 
301 	VHOST_CONFIG_LOG(vsocket->path, INFO, "new vhost user connection is %d", fd);
302 	vhost_user_add_connection(fd, vsocket);
303 }
304 
305 static void
306 vhost_user_read_cb(int connfd, void *dat, int *remove)
307 {
308 	struct vhost_user_connection *conn = dat;
309 	struct vhost_user_socket *vsocket = conn->vsocket;
310 	int ret;
311 
312 	ret = vhost_user_msg_handler(conn->vid, connfd);
313 	if (ret < 0) {
314 		struct virtio_net *dev = get_device(conn->vid);
315 
316 		close(connfd);
317 		*remove = 1;
318 
319 		if (dev)
320 			vhost_destroy_device_notify(dev);
321 
322 		if (vsocket->notify_ops->destroy_connection)
323 			vsocket->notify_ops->destroy_connection(conn->vid);
324 
325 		vhost_destroy_device(conn->vid);
326 
327 		if (vsocket->reconnect) {
328 			create_unix_socket(vsocket);
329 			vhost_user_start_client(vsocket);
330 		}
331 
332 		pthread_mutex_lock(&vsocket->conn_mutex);
333 		TAILQ_REMOVE(&vsocket->conn_list, conn, next);
334 		pthread_mutex_unlock(&vsocket->conn_mutex);
335 
336 		free(conn);
337 	}
338 }
339 
340 static int
341 create_unix_socket(struct vhost_user_socket *vsocket)
342 {
343 	int fd;
344 	struct sockaddr_un *un = &vsocket->un;
345 
346 	fd = socket(AF_UNIX, SOCK_STREAM, 0);
347 	if (fd < 0)
348 		return -1;
349 	VHOST_CONFIG_LOG(vsocket->path, INFO, "vhost-user %s: socket created, fd: %d",
350 		vsocket->is_server ? "server" : "client", fd);
351 
352 	if (!vsocket->is_server && fcntl(fd, F_SETFL, O_NONBLOCK)) {
353 		VHOST_CONFIG_LOG(vsocket->path, ERR,
354 			"vhost-user: can't set nonblocking mode for socket, fd: %d (%s)",
355 			fd, strerror(errno));
356 		close(fd);
357 		return -1;
358 	}
359 
360 	memset(un, 0, sizeof(*un));
361 	un->sun_family = AF_UNIX;
362 	strncpy(un->sun_path, vsocket->path, sizeof(un->sun_path));
363 	un->sun_path[sizeof(un->sun_path) - 1] = '\0';
364 
365 	vsocket->socket_fd = fd;
366 	return 0;
367 }
368 
369 static int
370 vhost_user_start_server(struct vhost_user_socket *vsocket)
371 {
372 	int ret;
373 	int fd = vsocket->socket_fd;
374 	const char *path = vsocket->path;
375 
376 	/*
377 	 * bind () may fail if the socket file with the same name already
378 	 * exists. But the library obviously should not delete the file
379 	 * provided by the user, since we can not be sure that it is not
380 	 * being used by other applications. Moreover, many applications form
381 	 * socket names based on user input, which is prone to errors.
382 	 *
383 	 * The user must ensure that the socket does not exist before
384 	 * registering the vhost driver in server mode.
385 	 */
386 	ret = bind(fd, (struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
387 	if (ret < 0) {
388 		VHOST_CONFIG_LOG(path, ERR, "failed to bind: %s; remove it and try again",
389 			strerror(errno));
390 		goto err;
391 	}
392 	VHOST_CONFIG_LOG(path, INFO, "binding succeeded");
393 
394 	ret = listen(fd, MAX_VIRTIO_BACKLOG);
395 	if (ret < 0)
396 		goto err;
397 
398 	ret = fdset_add(vhost_user.fdset, fd, vhost_user_server_new_connection,
399 		  NULL, vsocket);
400 	if (ret < 0) {
401 		VHOST_CONFIG_LOG(path, ERR, "failed to add listen fd %d to vhost server fdset",
402 			fd);
403 		goto err;
404 	}
405 
406 	return 0;
407 
408 err:
409 	close(fd);
410 	return -1;
411 }
412 
/* A client socket that is waiting to be connected by the reconnect thread. */
struct vhost_user_reconnect {
	/* Copy of the socket address to connect to. */
	struct sockaddr_un un;
	/* Descriptor on which connect() is retried. */
	int fd;
	/* Socket on whose behalf the connection is attempted. */
	struct vhost_user_socket *vsocket;

	/* Linkage in reconn_list.head. */
	TAILQ_ENTRY(vhost_user_reconnect) next;
};
420 
TAILQ_HEAD(vhost_user_reconnect_tailq_list, vhost_user_reconnect);
/* Queue of pending reconnections together with the mutex guarding it. */
struct vhost_user_reconnect_list {
	struct vhost_user_reconnect_tailq_list head;
	pthread_mutex_t mutex;
};

/* Pending reconnections; initialised by vhost_user_reconnect_init(). */
static struct vhost_user_reconnect_list reconn_list;
/* Reconnect thread id; opaque_id == 0 is used as "not created yet". */
static rte_thread_t reconn_tid;
429 
430 static int
431 vhost_user_connect_nonblock(char *path, int fd, struct sockaddr *un, size_t sz)
432 {
433 	int ret, flags;
434 
435 	ret = connect(fd, un, sz);
436 	if (ret < 0 && errno != EISCONN)
437 		return -1;
438 
439 	flags = fcntl(fd, F_GETFL, 0);
440 	if (flags < 0) {
441 		VHOST_CONFIG_LOG(path, ERR, "can't get flags for connfd %d (%s)",
442 			fd, strerror(errno));
443 		return -2;
444 	}
445 	if ((flags & O_NONBLOCK) && fcntl(fd, F_SETFL, flags & ~O_NONBLOCK)) {
446 		VHOST_CONFIG_LOG(path, ERR, "can't disable nonblocking on fd %d", fd);
447 		return -2;
448 	}
449 	return 0;
450 }
451 
452 static uint32_t
453 vhost_user_client_reconnect(void *arg __rte_unused)
454 {
455 	int ret;
456 	struct vhost_user_reconnect *reconn, *next;
457 
458 	while (1) {
459 		pthread_mutex_lock(&reconn_list.mutex);
460 
461 		/*
462 		 * An equal implementation of TAILQ_FOREACH_SAFE,
463 		 * which does not exist on all platforms.
464 		 */
465 		for (reconn = TAILQ_FIRST(&reconn_list.head);
466 		     reconn != NULL; reconn = next) {
467 			next = TAILQ_NEXT(reconn, next);
468 
469 			ret = vhost_user_connect_nonblock(reconn->vsocket->path, reconn->fd,
470 						(struct sockaddr *)&reconn->un,
471 						sizeof(reconn->un));
472 			if (ret == -2) {
473 				close(reconn->fd);
474 				VHOST_CONFIG_LOG(reconn->vsocket->path, ERR,
475 					"reconnection for fd %d failed",
476 					reconn->fd);
477 				goto remove_fd;
478 			}
479 			if (ret == -1)
480 				continue;
481 
482 			VHOST_CONFIG_LOG(reconn->vsocket->path, INFO, "connected");
483 			vhost_user_add_connection(reconn->fd, reconn->vsocket);
484 remove_fd:
485 			TAILQ_REMOVE(&reconn_list.head, reconn, next);
486 			free(reconn);
487 		}
488 
489 		pthread_mutex_unlock(&reconn_list.mutex);
490 		sleep(1);
491 	}
492 
493 	return 0;
494 }
495 
496 static int
497 vhost_user_reconnect_init(void)
498 {
499 	int ret;
500 
501 	ret = pthread_mutex_init(&reconn_list.mutex, NULL);
502 	if (ret < 0) {
503 		VHOST_CONFIG_LOG("thread", ERR, "%s: failed to initialize mutex", __func__);
504 		return ret;
505 	}
506 	TAILQ_INIT(&reconn_list.head);
507 
508 	ret = rte_thread_create_internal_control(&reconn_tid, "vhost-reco",
509 			vhost_user_client_reconnect, NULL);
510 	if (ret != 0) {
511 		VHOST_CONFIG_LOG("thread", ERR, "failed to create reconnect thread");
512 		if (pthread_mutex_destroy(&reconn_list.mutex))
513 			VHOST_CONFIG_LOG("thread", ERR,
514 				"%s: failed to destroy reconnect mutex",
515 				__func__);
516 	}
517 
518 	return ret;
519 }
520 
521 static int
522 vhost_user_start_client(struct vhost_user_socket *vsocket)
523 {
524 	int ret;
525 	int fd = vsocket->socket_fd;
526 	const char *path = vsocket->path;
527 	struct vhost_user_reconnect *reconn;
528 
529 	if (!vsocket->async_connect || !vsocket->reconnect) {
530 		ret = vhost_user_connect_nonblock(vsocket->path, fd,
531 			(struct sockaddr *)&vsocket->un, sizeof(vsocket->un));
532 		if (ret == 0) {
533 			vhost_user_add_connection(fd, vsocket);
534 			return 0;
535 		}
536 
537 		VHOST_CONFIG_LOG(path, WARNING, "failed to connect: %s", strerror(errno));
538 
539 		if (ret == -2 || !vsocket->reconnect) {
540 			close(fd);
541 			return -1;
542 		}
543 
544 		VHOST_CONFIG_LOG(path, INFO, "reconnecting...");
545 	}
546 	reconn = malloc(sizeof(*reconn));
547 	if (reconn == NULL) {
548 		VHOST_CONFIG_LOG(path, ERR, "failed to allocate memory for reconnect");
549 		close(fd);
550 		return -1;
551 	}
552 	reconn->un = vsocket->un;
553 	reconn->fd = fd;
554 	reconn->vsocket = vsocket;
555 	pthread_mutex_lock(&reconn_list.mutex);
556 	TAILQ_INSERT_TAIL(&reconn_list.head, reconn, next);
557 	pthread_mutex_unlock(&reconn_list.mutex);
558 
559 	return 0;
560 }
561 
562 static struct vhost_user_socket *
563 find_vhost_user_socket(const char *path)
564 {
565 	int i;
566 
567 	if (path == NULL)
568 		return NULL;
569 
570 	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
571 		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
572 
573 		if (!strcmp(vsocket->path, path))
574 			return vsocket;
575 	}
576 
577 	return NULL;
578 }
579 
580 int
581 rte_vhost_driver_attach_vdpa_device(const char *path,
582 		struct rte_vdpa_device *dev)
583 {
584 	struct vhost_user_socket *vsocket;
585 
586 	if (dev == NULL || path == NULL)
587 		return -1;
588 
589 	pthread_mutex_lock(&vhost_user.mutex);
590 	vsocket = find_vhost_user_socket(path);
591 	if (vsocket)
592 		vsocket->vdpa_dev = dev;
593 	pthread_mutex_unlock(&vhost_user.mutex);
594 
595 	return vsocket ? 0 : -1;
596 }
597 
598 int
599 rte_vhost_driver_detach_vdpa_device(const char *path)
600 {
601 	struct vhost_user_socket *vsocket;
602 
603 	pthread_mutex_lock(&vhost_user.mutex);
604 	vsocket = find_vhost_user_socket(path);
605 	if (vsocket)
606 		vsocket->vdpa_dev = NULL;
607 	pthread_mutex_unlock(&vhost_user.mutex);
608 
609 	return vsocket ? 0 : -1;
610 }
611 
612 struct rte_vdpa_device *
613 rte_vhost_driver_get_vdpa_device(const char *path)
614 {
615 	struct vhost_user_socket *vsocket;
616 	struct rte_vdpa_device *dev = NULL;
617 
618 	pthread_mutex_lock(&vhost_user.mutex);
619 	vsocket = find_vhost_user_socket(path);
620 	if (vsocket)
621 		dev = vsocket->vdpa_dev;
622 	pthread_mutex_unlock(&vhost_user.mutex);
623 
624 	return dev;
625 }
626 
627 int
628 rte_vhost_driver_get_vdpa_dev_type(const char *path, uint32_t *type)
629 {
630 	struct vhost_user_socket *vsocket;
631 	struct rte_vdpa_device *vdpa_dev;
632 	int ret = 0;
633 
634 	pthread_mutex_lock(&vhost_user.mutex);
635 	vsocket = find_vhost_user_socket(path);
636 	if (!vsocket) {
637 		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
638 		ret = -1;
639 		goto unlock_exit;
640 	}
641 
642 	vdpa_dev = vsocket->vdpa_dev;
643 	if (!vdpa_dev) {
644 		ret = -1;
645 		goto unlock_exit;
646 	}
647 
648 	*type = vdpa_dev->type;
649 
650 unlock_exit:
651 	pthread_mutex_unlock(&vhost_user.mutex);
652 	return ret;
653 }
654 
655 int
656 rte_vhost_driver_disable_features(const char *path, uint64_t features)
657 {
658 	struct vhost_user_socket *vsocket;
659 
660 	pthread_mutex_lock(&vhost_user.mutex);
661 	vsocket = find_vhost_user_socket(path);
662 
663 	/* Note that use_builtin_virtio_net is not affected by this function
664 	 * since callers may want to selectively disable features of the
665 	 * built-in vhost net device backend.
666 	 */
667 
668 	if (vsocket)
669 		vsocket->features &= ~features;
670 	pthread_mutex_unlock(&vhost_user.mutex);
671 
672 	return vsocket ? 0 : -1;
673 }
674 
675 int
676 rte_vhost_driver_enable_features(const char *path, uint64_t features)
677 {
678 	struct vhost_user_socket *vsocket;
679 
680 	pthread_mutex_lock(&vhost_user.mutex);
681 	vsocket = find_vhost_user_socket(path);
682 	if (vsocket) {
683 		if ((vsocket->supported_features & features) != features) {
684 			/*
685 			 * trying to enable features the driver doesn't
686 			 * support.
687 			 */
688 			pthread_mutex_unlock(&vhost_user.mutex);
689 			return -1;
690 		}
691 		vsocket->features |= features;
692 	}
693 	pthread_mutex_unlock(&vhost_user.mutex);
694 
695 	return vsocket ? 0 : -1;
696 }
697 
698 int
699 rte_vhost_driver_set_features(const char *path, uint64_t features)
700 {
701 	struct vhost_user_socket *vsocket;
702 
703 	pthread_mutex_lock(&vhost_user.mutex);
704 	vsocket = find_vhost_user_socket(path);
705 	if (vsocket) {
706 		vsocket->supported_features = features;
707 		vsocket->features = features;
708 
709 		/* Anyone setting feature bits is implementing their own vhost
710 		 * device backend.
711 		 */
712 		vsocket->use_builtin_virtio_net = false;
713 	}
714 	pthread_mutex_unlock(&vhost_user.mutex);
715 
716 	return vsocket ? 0 : -1;
717 }
718 
719 int
720 rte_vhost_driver_get_features(const char *path, uint64_t *features)
721 {
722 	struct vhost_user_socket *vsocket;
723 	uint64_t vdpa_features;
724 	struct rte_vdpa_device *vdpa_dev;
725 	int ret = 0;
726 
727 	pthread_mutex_lock(&vhost_user.mutex);
728 	vsocket = find_vhost_user_socket(path);
729 	if (!vsocket) {
730 		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
731 		ret = -1;
732 		goto unlock_exit;
733 	}
734 
735 	vdpa_dev = vsocket->vdpa_dev;
736 	if (!vdpa_dev) {
737 		*features = vsocket->features;
738 		goto unlock_exit;
739 	}
740 
741 	if (vdpa_dev->ops->get_features(vdpa_dev, &vdpa_features) < 0) {
742 		VHOST_CONFIG_LOG(path, ERR, "failed to get vdpa features for socket file.");
743 		ret = -1;
744 		goto unlock_exit;
745 	}
746 
747 	*features = vsocket->features & vdpa_features;
748 
749 unlock_exit:
750 	pthread_mutex_unlock(&vhost_user.mutex);
751 	return ret;
752 }
753 
754 int
755 rte_vhost_driver_set_protocol_features(const char *path,
756 		uint64_t protocol_features)
757 {
758 	struct vhost_user_socket *vsocket;
759 
760 	pthread_mutex_lock(&vhost_user.mutex);
761 	vsocket = find_vhost_user_socket(path);
762 	if (vsocket)
763 		vsocket->protocol_features = protocol_features;
764 	pthread_mutex_unlock(&vhost_user.mutex);
765 	return vsocket ? 0 : -1;
766 }
767 
768 int
769 rte_vhost_driver_get_protocol_features(const char *path,
770 		uint64_t *protocol_features)
771 {
772 	struct vhost_user_socket *vsocket;
773 	uint64_t vdpa_protocol_features;
774 	struct rte_vdpa_device *vdpa_dev;
775 	int ret = 0;
776 
777 	pthread_mutex_lock(&vhost_user.mutex);
778 	vsocket = find_vhost_user_socket(path);
779 	if (!vsocket) {
780 		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
781 		ret = -1;
782 		goto unlock_exit;
783 	}
784 
785 	vdpa_dev = vsocket->vdpa_dev;
786 	if (!vdpa_dev) {
787 		*protocol_features = vsocket->protocol_features;
788 		goto unlock_exit;
789 	}
790 
791 	if (vdpa_dev->ops->get_protocol_features(vdpa_dev,
792 				&vdpa_protocol_features) < 0) {
793 		VHOST_CONFIG_LOG(path, ERR, "failed to get vdpa protocol features.");
794 		ret = -1;
795 		goto unlock_exit;
796 	}
797 
798 	*protocol_features = vsocket->protocol_features
799 		& vdpa_protocol_features;
800 
801 unlock_exit:
802 	pthread_mutex_unlock(&vhost_user.mutex);
803 	return ret;
804 }
805 
806 int
807 rte_vhost_driver_get_queue_num(const char *path, uint32_t *queue_num)
808 {
809 	struct vhost_user_socket *vsocket;
810 	uint32_t vdpa_queue_num;
811 	struct rte_vdpa_device *vdpa_dev;
812 	int ret = 0;
813 
814 	pthread_mutex_lock(&vhost_user.mutex);
815 	vsocket = find_vhost_user_socket(path);
816 	if (!vsocket) {
817 		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
818 		ret = -1;
819 		goto unlock_exit;
820 	}
821 
822 	vdpa_dev = vsocket->vdpa_dev;
823 	if (!vdpa_dev) {
824 		*queue_num = vsocket->max_queue_pairs;
825 		goto unlock_exit;
826 	}
827 
828 	if (vdpa_dev->ops->get_queue_num(vdpa_dev, &vdpa_queue_num) < 0) {
829 		VHOST_CONFIG_LOG(path, ERR, "failed to get vdpa queue number.");
830 		ret = -1;
831 		goto unlock_exit;
832 	}
833 
834 	*queue_num = RTE_MIN(vsocket->max_queue_pairs, vdpa_queue_num);
835 
836 unlock_exit:
837 	pthread_mutex_unlock(&vhost_user.mutex);
838 	return ret;
839 }
840 
841 int
842 rte_vhost_driver_set_max_queue_num(const char *path, uint32_t max_queue_pairs)
843 {
844 	struct vhost_user_socket *vsocket;
845 	int ret = 0;
846 
847 	VHOST_CONFIG_LOG(path, INFO, "Setting max queue pairs to %u", max_queue_pairs);
848 
849 	if (max_queue_pairs > VHOST_MAX_QUEUE_PAIRS) {
850 		VHOST_CONFIG_LOG(path, ERR, "Library only supports up to %u queue pairs",
851 				VHOST_MAX_QUEUE_PAIRS);
852 		return -1;
853 	}
854 
855 	pthread_mutex_lock(&vhost_user.mutex);
856 	vsocket = find_vhost_user_socket(path);
857 	if (!vsocket) {
858 		VHOST_CONFIG_LOG(path, ERR, "socket file is not registered yet.");
859 		ret = -1;
860 		goto unlock_exit;
861 	}
862 
863 	/*
864 	 * This is only useful for VDUSE for which number of virtqueues is set
865 	 * by the backend. For Vhost-user, the number of virtqueues is defined
866 	 * by the frontend.
867 	 */
868 	if (!vsocket->is_vduse) {
869 		VHOST_CONFIG_LOG(path, DEBUG,
870 				"Keeping %u max queue pairs for Vhost-user backend",
871 				VHOST_MAX_QUEUE_PAIRS);
872 		goto unlock_exit;
873 	}
874 
875 	vsocket->max_queue_pairs = max_queue_pairs;
876 
877 unlock_exit:
878 	pthread_mutex_unlock(&vhost_user.mutex);
879 	return ret;
880 }
881 
882 static void
883 vhost_user_socket_mem_free(struct vhost_user_socket *vsocket)
884 {
885 	if (vsocket == NULL)
886 		return;
887 
888 	free(vsocket->path);
889 	free(vsocket);
890 }
891 
892 /*
893  * Register a new vhost-user socket; here we could act as server
894  * (the default case), or client (when RTE_VHOST_USER_CLIENT) flag
895  * is set.
896  */
897 int
898 rte_vhost_driver_register(const char *path, uint64_t flags)
899 {
900 	int ret = -1;
901 	struct vhost_user_socket *vsocket;
902 
903 	if (!path)
904 		return -1;
905 
906 	pthread_mutex_lock(&vhost_user.mutex);
907 
908 	if (vhost_user.vsocket_cnt == MAX_VHOST_SOCKET) {
909 		VHOST_CONFIG_LOG(path, ERR, "the number of vhost sockets reaches maximum");
910 		goto out;
911 	}
912 
913 	vsocket = malloc(sizeof(struct vhost_user_socket));
914 	if (!vsocket)
915 		goto out;
916 	memset(vsocket, 0, sizeof(struct vhost_user_socket));
917 	vsocket->path = strdup(path);
918 	if (vsocket->path == NULL) {
919 		VHOST_CONFIG_LOG(path, ERR, "failed to copy socket path string");
920 		vhost_user_socket_mem_free(vsocket);
921 		goto out;
922 	}
923 	TAILQ_INIT(&vsocket->conn_list);
924 	ret = pthread_mutex_init(&vsocket->conn_mutex, NULL);
925 	if (ret) {
926 		VHOST_CONFIG_LOG(path, ERR, "failed to init connection mutex");
927 		goto out_free;
928 	}
929 
930 	if (!strncmp("/dev/vduse/", path, strlen("/dev/vduse/")))
931 		vsocket->is_vduse = true;
932 
933 	vsocket->vdpa_dev = NULL;
934 	vsocket->max_queue_pairs = VHOST_MAX_QUEUE_PAIRS;
935 	vsocket->extbuf = flags & RTE_VHOST_USER_EXTBUF_SUPPORT;
936 	vsocket->linearbuf = flags & RTE_VHOST_USER_LINEARBUF_SUPPORT;
937 	vsocket->async_copy = flags & RTE_VHOST_USER_ASYNC_COPY;
938 	vsocket->net_compliant_ol_flags = flags & RTE_VHOST_USER_NET_COMPLIANT_OL_FLAGS;
939 	vsocket->stats_enabled = flags & RTE_VHOST_USER_NET_STATS_ENABLE;
940 	vsocket->async_connect = flags & RTE_VHOST_USER_ASYNC_CONNECT;
941 	if (vsocket->is_vduse)
942 		vsocket->iommu_support = true;
943 	else
944 		vsocket->iommu_support = flags & RTE_VHOST_USER_IOMMU_SUPPORT;
945 
946 	if (vsocket->async_copy && (vsocket->iommu_support ||
947 				(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT))) {
948 		VHOST_CONFIG_LOG(path, ERR, "async copy with IOMMU or post-copy not supported");
949 		goto out_mutex;
950 	}
951 
952 	/*
953 	 * Set the supported features correctly for the builtin vhost-user
954 	 * net driver.
955 	 *
956 	 * Applications know nothing about features the builtin virtio net
957 	 * driver (virtio_net.c) supports, thus it's not possible for them
958 	 * to invoke rte_vhost_driver_set_features(). To workaround it, here
959 	 * we set it unconditionally. If the application want to implement
960 	 * another vhost-user driver (say SCSI), it should call the
961 	 * rte_vhost_driver_set_features(), which will overwrite following
962 	 * two values.
963 	 */
964 	vsocket->use_builtin_virtio_net = true;
965 	if (vsocket->is_vduse) {
966 		vsocket->supported_features = VDUSE_NET_SUPPORTED_FEATURES;
967 		vsocket->features           = VDUSE_NET_SUPPORTED_FEATURES;
968 	} else {
969 		vsocket->supported_features = VHOST_USER_NET_SUPPORTED_FEATURES;
970 		vsocket->features           = VHOST_USER_NET_SUPPORTED_FEATURES;
971 		vsocket->protocol_features  = VHOST_USER_PROTOCOL_FEATURES;
972 	}
973 
974 	if (vsocket->async_copy) {
975 		vsocket->supported_features &= ~(1ULL << VHOST_F_LOG_ALL);
976 		vsocket->features &= ~(1ULL << VHOST_F_LOG_ALL);
977 		VHOST_CONFIG_LOG(path, INFO, "logging feature is disabled in async copy mode");
978 	}
979 
980 	/*
981 	 * We'll not be able to receive a buffer from guest in linear mode
982 	 * without external buffer if it will not fit in a single mbuf, which is
983 	 * likely if segmentation offloading enabled.
984 	 */
985 	if (vsocket->linearbuf && !vsocket->extbuf) {
986 		uint64_t seg_offload_features =
987 				(1ULL << VIRTIO_NET_F_HOST_TSO4) |
988 				(1ULL << VIRTIO_NET_F_HOST_TSO6) |
989 				(1ULL << VIRTIO_NET_F_HOST_UFO);
990 
991 		VHOST_CONFIG_LOG(path, INFO, "Linear buffers requested without external buffers,");
992 		VHOST_CONFIG_LOG(path, INFO, "disabling host segmentation offloading support");
993 		vsocket->supported_features &= ~seg_offload_features;
994 		vsocket->features &= ~seg_offload_features;
995 	}
996 
997 	if (!vsocket->iommu_support) {
998 		vsocket->supported_features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
999 		vsocket->features &= ~(1ULL << VIRTIO_F_IOMMU_PLATFORM);
1000 	}
1001 
1002 	if (!(flags & RTE_VHOST_USER_POSTCOPY_SUPPORT)) {
1003 		vsocket->protocol_features &=
1004 			~(1ULL << VHOST_USER_PROTOCOL_F_PAGEFAULT);
1005 	} else {
1006 #ifndef RTE_LIBRTE_VHOST_POSTCOPY
1007 		VHOST_CONFIG_LOG(path, ERR, "Postcopy requested but not compiled");
1008 		ret = -1;
1009 		goto out_mutex;
1010 #endif
1011 	}
1012 
1013 	if (!vsocket->is_vduse) {
1014 		if ((flags & RTE_VHOST_USER_CLIENT) != 0) {
1015 			vsocket->reconnect = !(flags & RTE_VHOST_USER_NO_RECONNECT);
1016 			if (vsocket->reconnect && reconn_tid.opaque_id == 0) {
1017 				if (vhost_user_reconnect_init() != 0)
1018 					goto out_mutex;
1019 			}
1020 		} else {
1021 			vsocket->is_server = true;
1022 		}
1023 		ret = create_unix_socket(vsocket);
1024 		if (ret < 0)
1025 			goto out_mutex;
1026 	}
1027 
1028 	vhost_user.vsockets[vhost_user.vsocket_cnt++] = vsocket;
1029 
1030 	pthread_mutex_unlock(&vhost_user.mutex);
1031 	return ret;
1032 
1033 out_mutex:
1034 	if (pthread_mutex_destroy(&vsocket->conn_mutex)) {
1035 		VHOST_CONFIG_LOG(path, ERR, "failed to destroy connection mutex");
1036 	}
1037 out_free:
1038 	vhost_user_socket_mem_free(vsocket);
1039 out:
1040 	pthread_mutex_unlock(&vhost_user.mutex);
1041 
1042 	return ret;
1043 }
1044 
1045 static bool
1046 vhost_user_remove_reconnect(struct vhost_user_socket *vsocket)
1047 {
1048 	int found = false;
1049 	struct vhost_user_reconnect *reconn, *next;
1050 
1051 	pthread_mutex_lock(&reconn_list.mutex);
1052 
1053 	for (reconn = TAILQ_FIRST(&reconn_list.head);
1054 	     reconn != NULL; reconn = next) {
1055 		next = TAILQ_NEXT(reconn, next);
1056 
1057 		if (reconn->vsocket == vsocket) {
1058 			TAILQ_REMOVE(&reconn_list.head, reconn, next);
1059 			close(reconn->fd);
1060 			free(reconn);
1061 			found = true;
1062 			break;
1063 		}
1064 	}
1065 	pthread_mutex_unlock(&reconn_list.mutex);
1066 	return found;
1067 }
1068 
/**
 * Unregister the specified vhost socket.
 *
 * Stops event handling for the socket, closes and destroys all of its
 * connections, releases the socket object and removes it from the
 * registry. The whole operation is restarted from the beginning whenever
 * an fdset callback for one of the involved descriptors is currently
 * executing.
 *
 * @return 0 on success, -1 when path is NULL or not registered.
 */
int
rte_vhost_driver_unregister(const char *path)
{
	int i;
	int count;
	struct vhost_user_connection *conn, *next;

	if (path == NULL)
		return -1;

again:
	pthread_mutex_lock(&vhost_user.mutex);

	for (i = 0; i < vhost_user.vsocket_cnt; i++) {
		struct vhost_user_socket *vsocket = vhost_user.vsockets[i];
		if (strcmp(vsocket->path, path))
			continue;

		if (vsocket->is_vduse) {
			vduse_device_destroy(path);
		} else if (vsocket->is_server) {
			/*
			 * If r/wcb is executing, release vhost_user's
			 * mutex lock, and try again since the r/wcb
			 * may use the mutex lock.
			 */
			if (fdset_try_del(vhost_user.fdset, vsocket->socket_fd) == -1) {
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}
		} else if (vsocket->reconnect) {
			/* Drop a connection attempt that is still pending. */
			vhost_user_remove_reconnect(vsocket);
		}

		pthread_mutex_lock(&vsocket->conn_mutex);
		/* The successor is read first because conn is freed below. */
		for (conn = TAILQ_FIRST(&vsocket->conn_list);
			 conn != NULL;
			 conn = next) {
			next = TAILQ_NEXT(conn, next);

			/*
			 * If r/wcb is executing, release vsocket's
			 * conn_mutex and vhost_user's mutex locks, and
			 * try again since the r/wcb may use the
			 * conn_mutex and mutex locks.
			 */
			if (fdset_try_del(vhost_user.fdset,
					  conn->connfd) == -1) {
				pthread_mutex_unlock(&vsocket->conn_mutex);
				pthread_mutex_unlock(&vhost_user.mutex);
				goto again;
			}

			VHOST_CONFIG_LOG(path, INFO, "free connfd %d", conn->connfd);
			close(conn->connfd);
			vhost_destroy_device(conn->vid);
			TAILQ_REMOVE(&vsocket->conn_list, conn, next);
			free(conn);
		}
		pthread_mutex_unlock(&vsocket->conn_mutex);

		if (vsocket->is_server) {
			close(vsocket->socket_fd);
			unlink(path);
		}

		pthread_mutex_destroy(&vsocket->conn_mutex);
		vhost_user_socket_mem_free(vsocket);

		/* Keep the array dense: move the last entry into the freed slot. */
		count = --vhost_user.vsocket_cnt;
		vhost_user.vsockets[i] = vhost_user.vsockets[count];
		vhost_user.vsockets[count] = NULL;
		pthread_mutex_unlock(&vhost_user.mutex);
		return 0;
	}
	pthread_mutex_unlock(&vhost_user.mutex);

	return -1;
}
1151 
1152 /*
1153  * Register ops so that we can add/remove device to data core.
1154  */
1155 int
1156 rte_vhost_driver_callback_register(const char *path,
1157 	struct rte_vhost_device_ops const * const ops)
1158 {
1159 	struct vhost_user_socket *vsocket;
1160 
1161 	pthread_mutex_lock(&vhost_user.mutex);
1162 	vsocket = find_vhost_user_socket(path);
1163 	if (vsocket)
1164 		vsocket->notify_ops = ops;
1165 	pthread_mutex_unlock(&vhost_user.mutex);
1166 
1167 	return vsocket ? 0 : -1;
1168 }
1169 
1170 struct rte_vhost_device_ops const *
1171 vhost_driver_callback_get(const char *path)
1172 {
1173 	struct vhost_user_socket *vsocket;
1174 
1175 	pthread_mutex_lock(&vhost_user.mutex);
1176 	vsocket = find_vhost_user_socket(path);
1177 	pthread_mutex_unlock(&vhost_user.mutex);
1178 
1179 	return vsocket ? vsocket->notify_ops : NULL;
1180 }
1181 
1182 int
1183 rte_vhost_driver_start(const char *path)
1184 {
1185 	struct vhost_user_socket *vsocket;
1186 
1187 	pthread_mutex_lock(&vhost_user.mutex);
1188 	vsocket = find_vhost_user_socket(path);
1189 	pthread_mutex_unlock(&vhost_user.mutex);
1190 
1191 	if (!vsocket)
1192 		return -1;
1193 
1194 	if (vsocket->is_vduse)
1195 		return vduse_device_create(path, vsocket->net_compliant_ol_flags);
1196 
1197 	if (vhost_user.fdset == NULL) {
1198 		vhost_user.fdset = fdset_init("vhost-evt");
1199 		if (vhost_user.fdset == NULL) {
1200 			VHOST_CONFIG_LOG(path, ERR, "failed to init Vhost-user fdset");
1201 			return -1;
1202 		}
1203 	}
1204 
1205 	if (vsocket->is_server)
1206 		return vhost_user_start_server(vsocket);
1207 	else
1208 		return vhost_user_start_client(vsocket);
1209 }
1210