xref: /netbsd-src/lib/librumpuser/rumpuser_sp.c (revision 1b9578b8c2c1f848eeb16dabbfd7d1f0d9fdefbd)
1 /*      $NetBSD: rumpuser_sp.c,v 1.45 2011/03/08 15:34:37 pooka Exp $	*/
2 
3 /*
4  * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Sysproxy routines.  This provides system RPC support over host sockets.
30  * The most notable limitation is that the client and server must share
31  * the same ABI.  This does not mean that they have to be the same
32  * machine or that they need to run the same version of the host OS,
33  * just that they must agree on the data structures.  This even *might*
34  * work correctly from one hardware architecture to another.
35  */
36 
37 #include <sys/cdefs.h>
38 __RCSID("$NetBSD: rumpuser_sp.c,v 1.45 2011/03/08 15:34:37 pooka Exp $");
39 
40 #include <sys/types.h>
41 #include <sys/atomic.h>
42 #include <sys/mman.h>
43 #include <sys/socket.h>
44 
45 #include <arpa/inet.h>
46 #include <netinet/in.h>
47 #include <netinet/tcp.h>
48 
49 #include <assert.h>
50 #include <errno.h>
51 #include <fcntl.h>
52 #include <poll.h>
53 #include <pthread.h>
54 #include <stdarg.h>
55 #include <stdio.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <unistd.h>
59 
60 #include <rump/rump.h> /* XXX: for rfork flags */
61 #include <rump/rumpuser.h>
62 #include "rumpuser_int.h"
63 
64 #include "sp_common.c"
65 
66 #ifndef MAXCLI
67 #define MAXCLI 256
68 #endif
69 #ifndef MAXWORKER
70 #define MAXWORKER 128
71 #endif
72 #ifndef IDLEWORKER
73 #define IDLEWORKER 16
74 #endif
75 int rumpsp_maxworker = MAXWORKER;
76 int rumpsp_idleworker = IDLEWORKER;
77 
78 static struct pollfd pfdlist[MAXCLI];
79 static struct spclient spclist[MAXCLI];
80 static unsigned int disco;
81 static volatile int spfini;
82 
83 static struct rumpuser_sp_ops spops;
84 
85 static char banner[MAXBANNER];
86 
87 #define PROTOMAJOR 0
88 #define PROTOMINOR 3
89 
/*
 * Bookkeeping for a process forked on behalf of a client before the
 * new connection has authenticated itself (RUMPSP_PREFORK handshake).
 * The forking client hands the auth cookie to its child, which echoes
 * it back over the new connection to claim pf_lwp.
 */
struct prefork {
	uint32_t pf_auth[AUTHLEN];	/* random cookie the child must echo */
	struct lwp *pf_lwp;		/* main lwp of the preforked process */

	LIST_ENTRY(prefork) pf_entries;		/* global list */
	LIST_ENTRY(prefork) pf_spcentries;	/* linked from forking spc */
};
static LIST_HEAD(, prefork) preforks = LIST_HEAD_INITIALIZER(preforks);
static pthread_mutex_t pfmtx;	/* protects preforks and per-spc pf lists */
99 
100 /*
101  * This version is for the server.  It's optimized for multiple threads
102  * and is *NOT* reentrant wrt to signals.
103  */
static int
waitresp(struct spclient *spc, struct respwait *rw)
{
	int spcstate;
	/* NOTE(review): rv is never set after init here; the check below
	 * is vestigial (kept for symmetry with the client-side variant). */
	int rv = 0;

	pthread_mutex_lock(&spc->spc_mtx);
	/* release the send lock only once we are registered on spc_respwait */
	sendunlockl(spc);
	while (!rw->rw_done && spc->spc_state != SPCSTATE_DYING) {
		pthread_cond_wait(&rw->rw_cv, &spc->spc_mtx);
	}
	TAILQ_REMOVE(&spc->spc_respwait, rw, rw_entries);
	/* sample state under the lock; it may change after unlock */
	spcstate = spc->spc_state;
	pthread_mutex_unlock(&spc->spc_mtx);

	pthread_cond_destroy(&rw->rw_cv);

	if (rv)
		return rv;
	if (spcstate == SPCSTATE_DYING)
		return ENOTCONN;
	return rw->rw_error;
}
127 
128 /*
129  * Manual wrappers, since librump does not have access to the
130  * user namespace wrapped interfaces.
131  */
132 
/*
 * Switch the current host thread to rump kernel lwp 'l', bracketed by
 * the hypervisor schedule/unschedule hooks.  Callers in this file pass
 * NULL to switch away from the current lwp.
 */
static void
lwproc_switch(struct lwp *l)
{

	spops.spop_schedule();
	spops.spop_lwproc_switch(l);
	spops.spop_unschedule();
}
141 
/* Release the current rump kernel lwp (scheduled wrapper). */
static void
lwproc_release(void)
{

	spops.spop_schedule();
	spops.spop_lwproc_release();
	spops.spop_unschedule();
}
150 
151 static int
152 lwproc_rfork(struct spclient *spc, int flags, const char *comm)
153 {
154 	int rv;
155 
156 	spops.spop_schedule();
157 	rv = spops.spop_lwproc_rfork(spc, flags, comm);
158 	spops.spop_unschedule();
159 
160 	return rv;
161 }
162 
163 static int
164 lwproc_newlwp(pid_t pid)
165 {
166 	int rv;
167 
168 	spops.spop_schedule();
169 	rv = spops.spop_lwproc_newlwp(pid);
170 	spops.spop_unschedule();
171 
172 	return rv;
173 }
174 
175 static struct lwp *
176 lwproc_curlwp(void)
177 {
178 	struct lwp *l;
179 
180 	spops.spop_schedule();
181 	l = spops.spop_lwproc_curlwp();
182 	spops.spop_unschedule();
183 
184 	return l;
185 }
186 
187 static pid_t
188 lwproc_getpid(void)
189 {
190 	pid_t p;
191 
192 	spops.spop_schedule();
193 	p = spops.spop_getpid();
194 	spops.spop_unschedule();
195 
196 	return p;
197 }
198 
/* Notify the rump kernel that the current process exec'd as 'comm'. */
static void
lwproc_execnotify(const char *comm)
{

	spops.spop_schedule();
	spops.spop_execnotify(comm);
	spops.spop_unschedule();
}
207 
/* Ask the rump kernel to start draining lwps of the current process. */
static void
lwproc_lwpexit(void)
{

	spops.spop_schedule();
	spops.spop_lwpexit();
	spops.spop_unschedule();
}
216 
217 static int
218 rumpsyscall(int sysnum, void *data, register_t *retval)
219 {
220 	int rv;
221 
222 	spops.spop_schedule();
223 	rv = spops.spop_syscall(sysnum, data, retval);
224 	spops.spop_unschedule();
225 
226 	return rv;
227 }
228 
229 static uint64_t
230 nextreq(struct spclient *spc)
231 {
232 	uint64_t nw;
233 
234 	pthread_mutex_lock(&spc->spc_mtx);
235 	nw = spc->spc_nextreq++;
236 	pthread_mutex_unlock(&spc->spc_mtx);
237 
238 	return nw;
239 }
240 
241 /*
242  * XXX: we send responses with "blocking" I/O.  This is not
243  * ok for the main thread.  XXXFIXME
244  */
245 
/*
 * Send a bare RUMPSP_ERROR frame for request 'reqno'.  Best-effort:
 * the send result is deliberately ignored (nothing useful to do on
 * failure; disconnect handling will notice a dead peer).
 */
static void
send_error_resp(struct spclient *spc, uint64_t reqno, int error)
{
	struct rsp_hdr rhdr;
	struct iovec iov[1];

	rhdr.rsp_len = sizeof(rhdr);
	rhdr.rsp_reqno = reqno;
	rhdr.rsp_class = RUMPSP_ERROR;
	rhdr.rsp_type = 0;
	rhdr.rsp_error = error;

	IOVPUT(iov[0], rhdr);

	sendlock(spc);
	(void)SENDIOV(spc, iov);
	sendunlock(spc);
}
264 
/*
 * Send a handshake response.  The handshake status is carried as an
 * int payload after the header (rsp_error stays 0; the payload is the
 * authoritative result).  Returns the SENDIOV result.
 */
static int
send_handshake_resp(struct spclient *spc, uint64_t reqno, int error)
{
	struct rsp_hdr rhdr;
	struct iovec iov[2];
	int rv;

	rhdr.rsp_len = sizeof(rhdr) + sizeof(error);
	rhdr.rsp_reqno = reqno;
	rhdr.rsp_class = RUMPSP_RESP;
	rhdr.rsp_type = RUMPSP_HANDSHAKE;
	rhdr.rsp_error = 0;

	IOVPUT(iov[0], rhdr);
	IOVPUT(iov[1], error);

	sendlock(spc);
	rv = SENDIOV(spc, iov);
	sendunlock(spc);

	return rv;
}
287 
/*
 * Send a syscall response: header plus an rsp_sysresp payload carrying
 * the syscall error and both return-value registers.  Returns the
 * SENDIOV result.
 */
static int
send_syscall_resp(struct spclient *spc, uint64_t reqno, int error,
	register_t *retval)
{
	struct rsp_hdr rhdr;
	struct rsp_sysresp sysresp;
	struct iovec iov[2];
	int rv;

	rhdr.rsp_len = sizeof(rhdr) + sizeof(sysresp);
	rhdr.rsp_reqno = reqno;
	rhdr.rsp_class = RUMPSP_RESP;
	rhdr.rsp_type = RUMPSP_SYSCALL;
	rhdr.rsp_sysnum = 0;

	sysresp.rsys_error = error;
	memcpy(sysresp.rsys_retval, retval, sizeof(sysresp.rsys_retval));

	IOVPUT(iov[0], rhdr);
	IOVPUT(iov[1], sysresp);

	sendlock(spc);
	rv = SENDIOV(spc, iov);
	sendunlock(spc);

	return rv;
}
315 
/*
 * Send a prefork response carrying the AUTHLEN-word auth cookie the
 * client's child must present when it connects.  Returns the SENDIOV
 * result.
 */
static int
send_prefork_resp(struct spclient *spc, uint64_t reqno, uint32_t *auth)
{
	struct rsp_hdr rhdr;
	struct iovec iov[2];
	int rv;

	rhdr.rsp_len = sizeof(rhdr) + AUTHLEN*sizeof(*auth);
	rhdr.rsp_reqno = reqno;
	rhdr.rsp_class = RUMPSP_RESP;
	rhdr.rsp_type = RUMPSP_PREFORK;
	rhdr.rsp_sysnum = 0;

	IOVPUT(iov[0], rhdr);
	IOVPUT_WITHSIZE(iov[1], auth, AUTHLEN*sizeof(*auth));

	sendlock(spc);
	rv = SENDIOV(spc, iov);
	sendunlock(spc);

	return rv;
}
338 
/*
 * Request a copyin (or copyinstr when 'wantstr') of *dlen bytes from
 * client address 'remaddr' and block until the response arrives.
 * On success *resp points to malloc'd data owned by the caller; for
 * copyinstr, *dlen is updated to the actual string length returned.
 * Note: rsp_reqno is filled in by putwait() before the frame is sent.
 */
static int
copyin_req(struct spclient *spc, const void *remaddr, size_t *dlen,
	int wantstr, void **resp)
{
	struct rsp_hdr rhdr;
	struct rsp_copydata copydata;
	struct respwait rw;
	struct iovec iov[2];
	int rv;

	DPRINTF(("copyin_req: %zu bytes from %p\n", *dlen, remaddr));

	rhdr.rsp_len = sizeof(rhdr) + sizeof(copydata);
	rhdr.rsp_class = RUMPSP_REQ;
	if (wantstr)
		rhdr.rsp_type = RUMPSP_COPYINSTR;
	else
		rhdr.rsp_type = RUMPSP_COPYIN;
	rhdr.rsp_sysnum = 0;

	copydata.rcp_addr = __UNCONST(remaddr);
	copydata.rcp_len = *dlen;

	IOVPUT(iov[0], rhdr);
	IOVPUT(iov[1], copydata);

	/* register for the response before sending to avoid a race */
	putwait(spc, &rw, &rhdr);
	rv = SENDIOV(spc, iov);
	if (rv) {
		unputwait(spc, &rw);
		return rv;
	}

	rv = waitresp(spc, &rw);

	DPRINTF(("copyin: response %d\n", rv));

	*resp = rw.rw_data;
	if (wantstr)
		*dlen = rw.rw_dlen;

	return rv;

}
383 
/*
 * Fire-and-forget copyout: ship 'dlen' bytes of 'data' to client
 * address 'remaddr'.  No response is awaited ("async"); the return
 * value only reflects whether the frame was sent.
 */
static int
send_copyout_req(struct spclient *spc, const void *remaddr,
	const void *data, size_t dlen)
{
	struct rsp_hdr rhdr;
	struct rsp_copydata copydata;
	struct iovec iov[3];
	int rv;

	DPRINTF(("copyout_req (async): %zu bytes to %p\n", dlen, remaddr));

	rhdr.rsp_len = sizeof(rhdr) + sizeof(copydata) + dlen;
	rhdr.rsp_reqno = nextreq(spc);
	rhdr.rsp_class = RUMPSP_REQ;
	rhdr.rsp_type = RUMPSP_COPYOUT;
	rhdr.rsp_sysnum = 0;

	copydata.rcp_addr = __UNCONST(remaddr);
	copydata.rcp_len = dlen;

	IOVPUT(iov[0], rhdr);
	IOVPUT(iov[1], copydata);
	IOVPUT_WITHSIZE(iov[2], __UNCONST(data), dlen);

	sendlock(spc);
	rv = SENDIOV(spc, iov);
	sendunlock(spc);

	return rv;
}
414 
/*
 * Ask the client to anonymously mmap 'howmuch' bytes in its address
 * space and wait for the reply.  On success *resp is malloc'd data
 * (owned by the caller) whose first word is the mapped address.
 * NOTE(review): the DPRINTF below dereferences *resp even when rv != 0,
 * in which case rw_data may be NULL — only an issue in debug builds;
 * verify before enabling DPRINTF.
 */
static int
anonmmap_req(struct spclient *spc, size_t howmuch, void **resp)
{
	struct rsp_hdr rhdr;
	struct respwait rw;
	struct iovec iov[2];
	int rv;

	DPRINTF(("anonmmap_req: %zu bytes\n", howmuch));

	rhdr.rsp_len = sizeof(rhdr) + sizeof(howmuch);
	rhdr.rsp_class = RUMPSP_REQ;
	rhdr.rsp_type = RUMPSP_ANONMMAP;
	rhdr.rsp_sysnum = 0;

	IOVPUT(iov[0], rhdr);
	IOVPUT(iov[1], howmuch);

	/* register for the response before sending to avoid a race */
	putwait(spc, &rw, &rhdr);
	rv = SENDIOV(spc, iov);
	if (rv) {
		unputwait(spc, &rw);
		return rv;
	}

	rv = waitresp(spc, &rw);

	*resp = rw.rw_data;

	DPRINTF(("anonmmap: mapped at %p\n", **(void ***)resp));

	return rv;
}
448 
/*
 * Ask the client to raise signal 'signo' in itself.  Header-only
 * frame; no response is awaited.  Returns the SENDIOV result.
 */
static int
send_raise_req(struct spclient *spc, int signo)
{
	struct rsp_hdr rhdr;
	struct iovec iov[1];
	int rv;

	rhdr.rsp_len = sizeof(rhdr);
	rhdr.rsp_class = RUMPSP_REQ;
	rhdr.rsp_type = RUMPSP_RAISE;
	rhdr.rsp_signo = signo;

	IOVPUT(iov[0], rhdr);

	sendlock(spc);
	rv = SENDIOV(spc, iov);
	sendunlock(spc);

	return rv;
}
469 
/* Take a reference on 'spc' (spc_mtx-serialized). */
static void
spcref(struct spclient *spc)
{

	pthread_mutex_lock(&spc->spc_mtx);
	spc->spc_refcnt++;
	pthread_mutex_unlock(&spc->spc_mtx);
}
478 
/*
 * Drop a reference on 'spc'.  When the last reference goes away, tear
 * down the client: release its main lwp, close the descriptor and mark
 * the slot reusable.  The exec path waits for refcnt to drop to 2
 * (connection + exec worker), hence the broadcast below.
 */
static void
spcrelease(struct spclient *spc)
{
	int ref;

	pthread_mutex_lock(&spc->spc_mtx);
	ref = --spc->spc_refcnt;
	if (__predict_false(spc->spc_inexec && ref <= 2))
		pthread_cond_broadcast(&spc->spc_cv);
	pthread_mutex_unlock(&spc->spc_mtx);

	if (ref > 0)
		return;

	DPRINTF(("rump_sp: spcrelease: spc %p fd %d\n", spc, spc->spc_fd));

	_DIAGASSERT(TAILQ_EMPTY(&spc->spc_respwait));
	_DIAGASSERT(spc->spc_buf == NULL);

	if (spc->spc_mainlwp) {
		lwproc_switch(spc->spc_mainlwp);
		lwproc_release();
	}
	spc->spc_mainlwp = NULL;

	close(spc->spc_fd);
	spc->spc_fd = -1;
	spc->spc_state = SPCSTATE_NEW;

	/* tell the mainloop a pollfd slot was vacated (g/c'd lazily) */
	atomic_inc_uint(&disco);
}
510 
/*
 * Handle a client disconnect: mark the spc dying, wake all waiters,
 * drain the client's lwps (unless an exec worker owns them), wipe
 * the send/receive state and drop the connection's reference.
 */
static void
serv_handledisco(unsigned int idx)
{
	struct spclient *spc = &spclist[idx];
	int dolwpexit;

	DPRINTF(("rump_sp: disconnecting [%u]\n", idx));

	pfdlist[idx].fd = -1;
	pfdlist[idx].revents = 0;
	pthread_mutex_lock(&spc->spc_mtx);
	spc->spc_state = SPCSTATE_DYING;
	kickall(spc);
	sendunlockl(spc);
	/* exec uses mainlwp in another thread, but also nuked all lwps */
	dolwpexit = !spc->spc_inexec;
	pthread_mutex_unlock(&spc->spc_mtx);

	if (dolwpexit && spc->spc_mainlwp) {
		lwproc_switch(spc->spc_mainlwp);
		lwproc_lwpexit();
		lwproc_switch(NULL);
	}

	/*
	 * Nobody's going to attempt to send/receive anymore,
	 * so reinit info relevant to that.
	 */
	/*LINTED:pointer casts may be ok*/
	memset((char *)spc + SPC_ZEROFF, 0, sizeof(*spc) - SPC_ZEROFF);

	spcrelease(spc);
}
544 
/*
 * Shut down all client connections (slot 0 is the listening socket,
 * hence the loop starts at 1).
 * NOTE(review): serv_handledisco() already drops the connection's
 * reference via spcrelease(); the extra spcrelease() below looks like
 * it could underflow spc_refcnt — verify against the spcref/spcrelease
 * pairing before changing.
 */
static void
serv_shutdown(void)
{
	struct spclient *spc;
	unsigned int i;

	for (i = 1; i < MAXCLI; i++) {
		spc = &spclist[i];
		if (spc->spc_fd == -1)
			continue;

		shutdown(spc->spc_fd, SHUT_RDWR);
		serv_handledisco(i);

		spcrelease(spc);
	}
}
562 
/*
 * Accept a new client connection: make it non-blocking, run the
 * transport-specific connection hook, write the protocol banner and
 * claim a free slot in pfdlist/spclist.  Returns the slot index, or
 * 0 on failure (slot 0 is the listen socket, so 0 is never a valid
 * client index).  'busy' rejects the connection when the table is full.
 */
static unsigned
serv_handleconn(int fd, connecthook_fn connhook, int busy)
{
	struct sockaddr_storage ss;
	socklen_t sl = sizeof(ss);
	int newfd, flags;
	unsigned i;

	/*LINTED: cast ok */
	newfd = accept(fd, (struct sockaddr *)&ss, &sl);
	if (newfd == -1)
		return 0;

	if (busy) {
		close(newfd); /* EBUSY */
		return 0;
	}

	flags = fcntl(newfd, F_GETFL, 0);
	if (fcntl(newfd, F_SETFL, flags | O_NONBLOCK) == -1) {
		close(newfd);
		return 0;
	}

	if (connhook(newfd) != 0) {
		close(newfd);
		return 0;
	}

	/* write out a banner for the client */
	if (send(newfd, banner, strlen(banner), MSG_NOSIGNAL)
	    != (ssize_t)strlen(banner)) {
		close(newfd);
		return 0;
	}

	/* find empty slot the simple way */
	for (i = 0; i < MAXCLI; i++) {
		if (pfdlist[i].fd == -1 && spclist[i].spc_state == SPCSTATE_NEW)
			break;
	}

	/* caller passes busy when nfds == MAXCLI, so a slot must exist */
	assert(i < MAXCLI);

	pfdlist[i].fd = newfd;
	spclist[i].spc_fd = newfd;
	spclist[i].spc_istatus = SPCSTATUS_BUSY; /* dedicated receiver */
	spclist[i].spc_refcnt = 1;

	TAILQ_INIT(&spclist[i].spc_respwait);

	DPRINTF(("rump_sp: added new connection fd %d at idx %u\n", newfd, i));

	return i;
}
618 
/*
 * Execute one client syscall in a fresh lwp of the client's process
 * and send the response.  Runs in a worker thread.
 * NOTE(review): the second DPRINTF prints register_t values with %d —
 * verify the format matches register_t's width on this platform.
 */
static void
serv_handlesyscall(struct spclient *spc, struct rsp_hdr *rhdr, uint8_t *data)
{
	register_t retval[2] = {0, 0};
	int rv, sysnum;

	sysnum = (int)rhdr->rsp_sysnum;
	DPRINTF(("rump_sp: handling syscall %d from client %d\n",
	    sysnum, spc->spc_pid));

	if (__predict_false((rv = lwproc_newlwp(spc->spc_pid)) != 0)) {
		retval[0] = -1;
		send_syscall_resp(spc, rhdr->rsp_reqno, rv, retval);
		return;
	}
	/* remember the in-flight reqno so sp_fini can answer it on exit */
	spc->spc_syscallreq = rhdr->rsp_reqno;
	rv = rumpsyscall(sysnum, data, retval);
	spc->spc_syscallreq = 0;
	lwproc_release();

	DPRINTF(("rump_sp: got return value %d & %d/%d\n",
	    rv, retval[0], retval[1]));

	send_syscall_resp(spc, rhdr->rsp_reqno, rv, retval);
}
644 
/*
 * Finish an exec handshake in a worker thread: wait until only the
 * connection's and our own references remain (i.e. all other workers
 * for this client have drained), then notify the rump kernel of the
 * new command name and ack the client.
 */
static void
serv_handleexec(struct spclient *spc, struct rsp_hdr *rhdr, char *comm)
{
	size_t commlen = rhdr->rsp_len - HDRSZ;

	pthread_mutex_lock(&spc->spc_mtx);
	/* one for the connection and one for us */
	while (spc->spc_refcnt > 2)
		pthread_cond_wait(&spc->spc_cv, &spc->spc_mtx);
	pthread_mutex_unlock(&spc->spc_mtx);

	/*
	 * ok, all the threads are dead (or one is still alive and
	 * the connection is dead, in which case this doesn't matter
	 * very much).  proceed with exec.
	 */

	/* ensure comm is 0-terminated */
	/* TODO: make sure it contains sensible chars? */
	comm[commlen] = '\0';

	lwproc_switch(spc->spc_mainlwp);
	lwproc_execnotify(comm);
	lwproc_switch(NULL);

	pthread_mutex_lock(&spc->spc_mtx);
	spc->spc_inexec = 0;
	pthread_mutex_unlock(&spc->spc_mtx);
	send_handshake_resp(spc, rhdr->rsp_reqno, 0);
}
675 
/* kind of work item bounced to a worker thread */
enum sbatype { SBA_SYSCALL, SBA_EXEC };

/* one unit of work queued from the receiver to the worker pool */
struct servbouncearg {
	struct spclient *sba_spc;	/* referenced client */
	struct rsp_hdr sba_hdr;		/* copy of the request header */
	enum sbatype sba_type;		/* syscall or exec */
	uint8_t *sba_data;		/* detached request payload (freed by worker) */

	TAILQ_ENTRY(servbouncearg) sba_entries;
};
/* worker-pool state; all protected by sbamtx / signalled via sbacv */
static pthread_mutex_t sbamtx;
static pthread_cond_t sbacv;
static int nworker, idleworker, nwork;
static TAILQ_HEAD(, servbouncearg) wrklist = TAILQ_HEAD_INITIALIZER(wrklist);
690 
/*
 * Worker thread body: pull work items off wrklist and dispatch them to
 * the syscall/exec handlers.  A worker retires itself when the idle
 * pool (minus pending work) exceeds rumpsp_idleworker.
 */
/*ARGSUSED*/
static void *
serv_workbouncer(void *arg)
{
	struct servbouncearg *sba;

	for (;;) {
		pthread_mutex_lock(&sbamtx);
		/* too many idle threads for the pending work?  retire. */
		if (__predict_false(idleworker - nwork >= rumpsp_idleworker)) {
			nworker--;
			pthread_mutex_unlock(&sbamtx);
			break;
		}
		idleworker++;
		while (TAILQ_EMPTY(&wrklist)) {
			_DIAGASSERT(nwork == 0);
			pthread_cond_wait(&sbacv, &sbamtx);
		}
		idleworker--;

		sba = TAILQ_FIRST(&wrklist);
		TAILQ_REMOVE(&wrklist, sba, sba_entries);
		nwork--;
		pthread_mutex_unlock(&sbamtx);

		if (__predict_true(sba->sba_type == SBA_SYSCALL)) {
			serv_handlesyscall(sba->sba_spc,
			    &sba->sba_hdr, sba->sba_data);
		} else {
			_DIAGASSERT(sba->sba_type == SBA_EXEC);
			serv_handleexec(sba->sba_spc, &sba->sba_hdr,
			    (char *)sba->sba_data);
		}
		/* drop the reference taken by schedulework() */
		spcrelease(sba->sba_spc);
		free(sba->sba_data);
		free(sba);
	}

	return NULL;
}
731 
/*
 * Copy data in from the client: drop the rump kernel lock(s), issue a
 * (possibly string) copyin request, and copy the reply into 'laddr'.
 * For the string variant, *len is updated by copyin_req().  Any failure
 * is reported to the caller as EFAULT.
 */
static int
sp_copyin(void *arg, const void *raddr, void *laddr, size_t *len, int wantstr)
{
	struct spclient *spc = arg;
	void *rdata = NULL; /* XXXuninit */
	int error, nlocks;

	rumpuser__kunlock(0, &nlocks, NULL);

	error = copyin_req(spc, raddr, len, wantstr, &rdata);
	if (error == 0) {
		memcpy(laddr, rdata, *len);
		free(rdata);
	}

	rumpuser__klock(nlocks, NULL);

	return error ? EFAULT : 0;
}
754 
/* Hypercall: fixed-length copyin from client address space. */
int
rumpuser_sp_copyin(void *arg, const void *raddr, void *laddr, size_t len)
{

	return sp_copyin(arg, raddr, laddr, &len, 0);
}
761 
/* Hypercall: string copyin from client address space; *len gets actual length. */
int
rumpuser_sp_copyinstr(void *arg, const void *raddr, void *laddr, size_t *len)
{

	return sp_copyin(arg, raddr, laddr, len, 1);
}
768 
769 static int
770 sp_copyout(void *arg, const void *laddr, void *raddr, size_t dlen)
771 {
772 	struct spclient *spc = arg;
773 	int nlocks, rv;
774 
775 	rumpuser__kunlock(0, &nlocks, NULL);
776 	rv = send_copyout_req(spc, raddr, laddr, dlen);
777 	rumpuser__klock(nlocks, NULL);
778 
779 	if (rv)
780 		return EFAULT;
781 	return 0;
782 }
783 
/* Hypercall: fixed-length copyout to client address space. */
int
rumpuser_sp_copyout(void *arg, const void *laddr, void *raddr, size_t dlen)
{

	return sp_copyout(arg, laddr, raddr, dlen);
}
790 
/* Hypercall: string copyout; *dlen is used as the byte count to send. */
int
rumpuser_sp_copyoutstr(void *arg, const void *laddr, void *raddr, size_t *dlen)
{

	return sp_copyout(arg, laddr, raddr, *dlen);
}
797 
/*
 * Hypercall: ask the client to anonymously map 'howmuch' bytes in its
 * address space; the client-side address is returned in *addr.  A
 * transport failure maps to EFAULT, a NULL mapping to ENOMEM.
 */
int
rumpuser_sp_anonmmap(void *arg, size_t howmuch, void **addr)
{
	struct spclient *spc = arg;
	void *resp, *rdata;
	int nlocks, rv;

	rumpuser__kunlock(0, &nlocks, NULL);

	rv = anonmmap_req(spc, howmuch, &rdata);
	if (rv) {
		rv = EFAULT;
	} else {
		/* reply payload is the mapped address; we own rdata */
		resp = *(void **)rdata;
		free(rdata);
		if (resp == NULL)
			rv = ENOMEM;
		*addr = resp;
	}

	rumpuser__klock(nlocks, NULL);

	return rv;
}
829 
/* Hypercall: deliver signal 'signo' to the client process. */
int
rumpuser_sp_raise(void *arg, int signo)
{
	struct spclient *spc = arg;
	int rv, nlocks;

	rumpuser__kunlock(0, &nlocks, NULL);
	rv = send_raise_req(spc, signo);
	rumpuser__klock(nlocks, NULL);

	return rv;
}
842 
843 static pthread_attr_t pattr_detached;
/*
 * Queue a syscall/exec work item for the worker pool, taking ownership
 * of spc's current request buffer and a reference on spc (released by
 * the worker).  Wakes an idle worker or spawns a new one up to
 * rumpsp_maxworker; on allocation failure answers the client EAGAIN.
 */
static void
schedulework(struct spclient *spc, enum sbatype sba_type)
{
	struct servbouncearg *sba;
	pthread_t pt;
	uint64_t reqno;
	int retries = 0;

	reqno = spc->spc_hdr.rsp_reqno;
	while ((sba = malloc(sizeof(*sba))) == NULL) {
		if (nworker == 0 || retries > 10) {
			send_error_resp(spc, reqno, EAGAIN);
			spcfreebuf(spc);
			return;
		}
		/* slim chance of more memory? */
		usleep(10000);
	}

	sba->sba_spc = spc;
	sba->sba_type = sba_type;
	sba->sba_hdr = spc->spc_hdr;
	sba->sba_data = spc->spc_buf;
	/* detach the buffer: the worker now owns and frees it */
	spcresetbuf(spc);

	spcref(spc);

	pthread_mutex_lock(&sbamtx);
	TAILQ_INSERT_TAIL(&wrklist, sba, sba_entries);
	nwork++;
	if (nwork <= idleworker) {
		/* do we have a daemon's tool (i.e. idle threads)? */
		pthread_cond_signal(&sbacv);
	} else if (nworker < rumpsp_maxworker) {
		/*
		 * Else, need to create one
		 * (if we can, otherwise just expect another
		 * worker to pick up the syscall)
		 */
		if (pthread_create(&pt, &pattr_detached,
		    serv_workbouncer, NULL) == 0) {
			nworker++;
		}
	}
	pthread_mutex_unlock(&sbamtx);
}
890 
891 /*
892  *
893  * Startup routines and mainloop for server.
894  *
895  */
896 
/* argument bundle handed to the spserver() mainloop thread */
struct spservarg {
	int sps_sock;			/* bound+listening server socket */
	connecthook_fn sps_connhook;	/* transport-specific accept hook */
};
901 
902 static void
903 handlereq(struct spclient *spc)
904 {
905 	uint64_t reqno;
906 	int error, i;
907 
908 	reqno = spc->spc_hdr.rsp_reqno;
909 	if (__predict_false(spc->spc_state == SPCSTATE_NEW)) {
910 		if (spc->spc_hdr.rsp_type != RUMPSP_HANDSHAKE) {
911 			send_error_resp(spc, reqno, EAUTH);
912 			shutdown(spc->spc_fd, SHUT_RDWR);
913 			spcfreebuf(spc);
914 			return;
915 		}
916 
917 		if (spc->spc_hdr.rsp_handshake == HANDSHAKE_GUEST) {
918 			char *comm = (char *)spc->spc_buf;
919 			size_t commlen = spc->spc_hdr.rsp_len - HDRSZ;
920 
921 			/* ensure it's 0-terminated */
922 			/* XXX make sure it contains sensible chars? */
923 			comm[commlen] = '\0';
924 
925 			if ((error = lwproc_rfork(spc,
926 			    RUMP_RFCFDG, comm)) != 0) {
927 				shutdown(spc->spc_fd, SHUT_RDWR);
928 			}
929 
930 			spcfreebuf(spc);
931 			if (error)
932 				return;
933 
934 			spc->spc_mainlwp = lwproc_curlwp();
935 
936 			send_handshake_resp(spc, reqno, 0);
937 		} else if (spc->spc_hdr.rsp_handshake == HANDSHAKE_FORK) {
938 			struct lwp *tmpmain;
939 			struct prefork *pf;
940 			struct handshake_fork *rfp;
941 			int cancel;
942 
943 			if (spc->spc_off-HDRSZ != sizeof(*rfp)) {
944 				send_error_resp(spc, reqno, EINVAL);
945 				shutdown(spc->spc_fd, SHUT_RDWR);
946 				spcfreebuf(spc);
947 				return;
948 			}
949 
950 			/*LINTED*/
951 			rfp = (void *)spc->spc_buf;
952 			cancel = rfp->rf_cancel;
953 
954 			pthread_mutex_lock(&pfmtx);
955 			LIST_FOREACH(pf, &preforks, pf_entries) {
956 				if (memcmp(rfp->rf_auth, pf->pf_auth,
957 				    sizeof(rfp->rf_auth)) == 0) {
958 					LIST_REMOVE(pf, pf_entries);
959 					LIST_REMOVE(pf, pf_spcentries);
960 					break;
961 				}
962 			}
963 			pthread_mutex_lock(&pfmtx);
964 			spcfreebuf(spc);
965 
966 			if (!pf) {
967 				send_error_resp(spc, reqno, ESRCH);
968 				shutdown(spc->spc_fd, SHUT_RDWR);
969 				return;
970 			}
971 
972 			tmpmain = pf->pf_lwp;
973 			free(pf);
974 			lwproc_switch(tmpmain);
975 			if (cancel) {
976 				lwproc_release();
977 				shutdown(spc->spc_fd, SHUT_RDWR);
978 				return;
979 			}
980 
981 			/*
982 			 * So, we forked already during "prefork" to save
983 			 * the file descriptors from a parent exit
984 			 * race condition.  But now we need to fork
985 			 * a second time since the initial fork has
986 			 * the wrong spc pointer.  (yea, optimize
987 			 * interfaces some day if anyone cares)
988 			 */
989 			if ((error = lwproc_rfork(spc, 0, NULL)) != 0) {
990 				send_error_resp(spc, reqno, error);
991 				shutdown(spc->spc_fd, SHUT_RDWR);
992 				lwproc_release();
993 				return;
994 			}
995 			spc->spc_mainlwp = lwproc_curlwp();
996 			lwproc_switch(tmpmain);
997 			lwproc_release();
998 			lwproc_switch(spc->spc_mainlwp);
999 
1000 			send_handshake_resp(spc, reqno, 0);
1001 		} else {
1002 			send_error_resp(spc, reqno, EAUTH);
1003 			shutdown(spc->spc_fd, SHUT_RDWR);
1004 			spcfreebuf(spc);
1005 			return;
1006 		}
1007 
1008 		spc->spc_pid = lwproc_getpid();
1009 
1010 		DPRINTF(("rump_sp: handshake for client %p complete, pid %d\n",
1011 		    spc, spc->spc_pid));
1012 
1013 		lwproc_switch(NULL);
1014 		spc->spc_state = SPCSTATE_RUNNING;
1015 		return;
1016 	}
1017 
1018 	if (__predict_false(spc->spc_hdr.rsp_type == RUMPSP_PREFORK)) {
1019 		struct prefork *pf;
1020 		uint32_t auth[AUTHLEN];
1021 		int inexec;
1022 
1023 		DPRINTF(("rump_sp: prefork handler executing for %p\n", spc));
1024 		spcfreebuf(spc);
1025 
1026 		pthread_mutex_lock(&spc->spc_mtx);
1027 		inexec = spc->spc_inexec;
1028 		pthread_mutex_unlock(&spc->spc_mtx);
1029 		if (inexec) {
1030 			send_error_resp(spc, reqno, EBUSY);
1031 			shutdown(spc->spc_fd, SHUT_RDWR);
1032 			return;
1033 		}
1034 
1035 		pf = malloc(sizeof(*pf));
1036 		if (pf == NULL) {
1037 			send_error_resp(spc, reqno, ENOMEM);
1038 			return;
1039 		}
1040 
1041 		/*
1042 		 * Use client main lwp to fork.  this is never used by
1043 		 * worker threads (except in exec, but we checked for that
1044 		 * above) so we can safely use it here.
1045 		 */
1046 		lwproc_switch(spc->spc_mainlwp);
1047 		if ((error = lwproc_rfork(spc, RUMP_RFFDG, NULL)) != 0) {
1048 			DPRINTF(("rump_sp: fork failed: %d (%p)\n",error, spc));
1049 			send_error_resp(spc, reqno, error);
1050 			lwproc_switch(NULL);
1051 			free(pf);
1052 			return;
1053 		}
1054 
1055 		/* Ok, we have a new process context and a new curlwp */
1056 		for (i = 0; i < AUTHLEN; i++) {
1057 			pf->pf_auth[i] = auth[i] = arc4random();
1058 		}
1059 		pf->pf_lwp = lwproc_curlwp();
1060 		lwproc_switch(NULL);
1061 
1062 		pthread_mutex_lock(&pfmtx);
1063 		LIST_INSERT_HEAD(&preforks, pf, pf_entries);
1064 		LIST_INSERT_HEAD(&spc->spc_pflist, pf, pf_spcentries);
1065 		pthread_mutex_unlock(&pfmtx);
1066 
1067 		DPRINTF(("rump_sp: prefork handler success %p\n", spc));
1068 
1069 		send_prefork_resp(spc, reqno, auth);
1070 		return;
1071 	}
1072 
1073 	if (__predict_false(spc->spc_hdr.rsp_type == RUMPSP_HANDSHAKE)) {
1074 		int inexec;
1075 
1076 		if (spc->spc_hdr.rsp_handshake != HANDSHAKE_EXEC) {
1077 			send_error_resp(spc, reqno, EINVAL);
1078 			shutdown(spc->spc_fd, SHUT_RDWR);
1079 			spcfreebuf(spc);
1080 			return;
1081 		}
1082 
1083 		pthread_mutex_lock(&spc->spc_mtx);
1084 		inexec = spc->spc_inexec;
1085 		pthread_mutex_unlock(&spc->spc_mtx);
1086 		if (inexec) {
1087 			send_error_resp(spc, reqno, EBUSY);
1088 			shutdown(spc->spc_fd, SHUT_RDWR);
1089 			spcfreebuf(spc);
1090 			return;
1091 		}
1092 
1093 		pthread_mutex_lock(&spc->spc_mtx);
1094 		spc->spc_inexec = 1;
1095 		pthread_mutex_unlock(&spc->spc_mtx);
1096 
1097 		/*
1098 		 * start to drain lwps.  we will wait for it to finish
1099 		 * in another thread
1100 		 */
1101 		lwproc_switch(spc->spc_mainlwp);
1102 		lwproc_lwpexit();
1103 		lwproc_switch(NULL);
1104 
1105 		/*
1106 		 * exec has to wait for lwps to drain, so finish it off
1107 		 * in another thread
1108 		 */
1109 		schedulework(spc, SBA_EXEC);
1110 		return;
1111 	}
1112 
1113 	if (__predict_false(spc->spc_hdr.rsp_type != RUMPSP_SYSCALL)) {
1114 		send_error_resp(spc, reqno, EINVAL);
1115 		spcfreebuf(spc);
1116 		return;
1117 	}
1118 
1119 	schedulework(spc, SBA_SYSCALL);
1120 }
1121 
/*
 * Server mainloop thread: initialize all client slots, then poll the
 * listening socket (slot 0) plus client sockets.  Slot 0 activity
 * accepts new connections (or triggers shutdown when spfini is set);
 * client activity reads a frame and dispatches responses/requests.
 * maxidx tracks the highest live slot so poll() scans a minimal range;
 * 'disco' counts slots vacated by spcrelease() for lazy compaction.
 */
static void *
spserver(void *arg)
{
	struct spservarg *sarg = arg;
	struct spclient *spc;
	unsigned idx;
	int seen;
	int rv;
	unsigned int nfds, maxidx;

	for (idx = 0; idx < MAXCLI; idx++) {
		pfdlist[idx].fd = -1;
		pfdlist[idx].events = POLLIN;

		spc = &spclist[idx];
		pthread_mutex_init(&spc->spc_mtx, NULL);
		pthread_cond_init(&spc->spc_cv, NULL);
		spc->spc_fd = -1;
	}
	pfdlist[0].fd = spclist[0].spc_fd = sarg->sps_sock;
	pfdlist[0].events = POLLIN;
	nfds = 1;
	maxidx = 0;

	pthread_attr_init(&pattr_detached);
	pthread_attr_setdetachstate(&pattr_detached, PTHREAD_CREATE_DETACHED);
	/* XXX: doesn't stacksize currently work on NetBSD */
	pthread_attr_setstacksize(&pattr_detached, 32*1024);

	pthread_mutex_init(&sbamtx, NULL);
	pthread_cond_init(&sbacv, NULL);

	DPRINTF(("rump_sp: server mainloop\n"));

	for (;;) {
		int discoed;

		/* g/c hangarounds (eventually) */
		discoed = atomic_swap_uint(&disco, 0);
		while (discoed--) {
			nfds--;
			idx = maxidx;
			/* shrink maxidx down to the highest live slot */
			while (idx) {
				if (pfdlist[idx].fd != -1) {
					maxidx = idx;
					break;
				}
				idx--;
			}
			DPRINTF(("rump_sp: set maxidx to [%u]\n",
			    maxidx));
		}

		DPRINTF(("rump_sp: loop nfd %d\n", maxidx+1));
		seen = 0;
		rv = poll(pfdlist, maxidx+1, INFTIM);
		assert(maxidx+1 <= MAXCLI);
		assert(rv != 0);
		if (rv == -1) {
			if (errno == EINTR)
				continue;
			fprintf(stderr, "rump_spserver: poll returned %d\n",
			    errno);
			break;
		}

		for (idx = 0; seen < rv && idx < MAXCLI; idx++) {
			if ((pfdlist[idx].revents & POLLIN) == 0)
				continue;

			seen++;
			DPRINTF(("rump_sp: activity at [%u] %d/%d\n",
			    idx, seen, rv));
			if (idx > 0) {
				spc = &spclist[idx];
				DPRINTF(("rump_sp: mainloop read [%u]\n", idx));
				switch (readframe(spc)) {
				case 0:
					/* partial frame; wait for more */
					break;
				case -1:
					serv_handledisco(idx);
					break;
				default:
					switch (spc->spc_hdr.rsp_class) {
					case RUMPSP_RESP:
						kickwaiter(spc);
						break;
					case RUMPSP_REQ:
						handlereq(spc);
						break;
					default:
						send_error_resp(spc,
						    spc->spc_hdr.rsp_reqno,
						    ENOENT);
						spcfreebuf(spc);
						break;
					}
					break;
				}

			} else {
				DPRINTF(("rump_sp: mainloop new connection\n"));

				if (__predict_false(spfini)) {
					close(spclist[0].spc_fd);
					serv_shutdown();
					goto out;
				}

				idx = serv_handleconn(pfdlist[0].fd,
				    sarg->sps_connhook, nfds == MAXCLI);
				if (idx)
					nfds++;
				if (idx > maxidx)
					maxidx = idx;
				DPRINTF(("rump_sp: maxid now %d\n", maxidx));
			}
		}
	}

 out:
	return NULL;
}
1245 
1246 static unsigned cleanupidx;
1247 static struct sockaddr *cleanupsa;
1248 int
1249 rumpuser_sp_init(const char *url, const struct rumpuser_sp_ops *spopsp,
1250 	const char *ostype, const char *osrelease, const char *machine)
1251 {
1252 	pthread_t pt;
1253 	struct spservarg *sarg;
1254 	struct sockaddr *sap;
1255 	char *p;
1256 	unsigned idx;
1257 	int error, s;
1258 
1259 	p = strdup(url);
1260 	if (p == NULL)
1261 		return ENOMEM;
1262 	error = parseurl(p, &sap, &idx, 1);
1263 	free(p);
1264 	if (error)
1265 		return error;
1266 
1267 	snprintf(banner, sizeof(banner), "RUMPSP-%d.%d-%s-%s/%s\n",
1268 	    PROTOMAJOR, PROTOMINOR, ostype, osrelease, machine);
1269 
1270 	s = socket(parsetab[idx].domain, SOCK_STREAM, 0);
1271 	if (s == -1)
1272 		return errno;
1273 
1274 	spops = *spopsp;
1275 	sarg = malloc(sizeof(*sarg));
1276 	if (sarg == NULL) {
1277 		close(s);
1278 		return ENOMEM;
1279 	}
1280 
1281 	sarg->sps_sock = s;
1282 	sarg->sps_connhook = parsetab[idx].connhook;
1283 
1284 	cleanupidx = idx;
1285 	cleanupsa = sap;
1286 
1287 	/* sloppy error recovery */
1288 
1289 	/*LINTED*/
1290 	if (bind(s, sap, sap->sa_len) == -1) {
1291 		fprintf(stderr, "rump_sp: server bind failed\n");
1292 		return errno;
1293 	}
1294 
1295 	if (listen(s, MAXCLI) == -1) {
1296 		fprintf(stderr, "rump_sp: server listen failed\n");
1297 		return errno;
1298 	}
1299 
1300 	if ((error = pthread_create(&pt, NULL, spserver, sarg)) != 0) {
1301 		fprintf(stderr, "rump_sp: cannot create wrkr thread\n");
1302 		return errno;
1303 	}
1304 	pthread_detach(pt);
1305 
1306 	return 0;
1307 }
1308 
/*
 * Process-exit cleanup: remove transport artifacts (e.g. a unix socket
 * path), answer any syscall this thread still has in flight so the
 * client doesn't hang, and tell the mainloop to shut down.
 */
void
rumpuser_sp_fini(void *arg)
{
	struct spclient *spc = arg;
	register_t retval[2] = {0, 0};

	if (spclist[0].spc_fd) {
		parsetab[cleanupidx].cleanup(cleanupsa);
	}

	/*
	 * stuff response into the socket, since this process is just
	 * about to exit
	 */
	if (spc && spc->spc_syscallreq)
		send_syscall_resp(spc, spc->spc_syscallreq, 0, retval);

	if (spclist[0].spc_fd) {
		/* wake the mainloop's poll; it checks spfini on slot 0 */
		shutdown(spclist[0].spc_fd, SHUT_RDWR);
		spfini = 1;
	}
}
1331