xref: /netbsd-src/lib/librumpuser/rumpuser_sp.c (revision daf6c4152fcddc27c445489775ed1f66ab4ea9a9)
1 /*      $NetBSD: rumpuser_sp.c,v 1.42 2011/02/15 16:10:41 pooka Exp $	*/
2 
3 /*
4  * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Sysproxy routines.  This provides system RPC support over host sockets.
30  * The most notable limitation is that the client and server must share
31  * the same ABI.  This does not mean that they have to be the same
32  * machine or that they need to run the same version of the host OS,
33  * just that they must agree on the data structures.  This even *might*
34  * work correctly from one hardware architecture to another.
35  */
36 
37 #include <sys/cdefs.h>
38 __RCSID("$NetBSD: rumpuser_sp.c,v 1.42 2011/02/15 16:10:41 pooka Exp $");
39 
40 #include <sys/types.h>
41 #include <sys/atomic.h>
42 #include <sys/mman.h>
43 #include <sys/socket.h>
44 
45 #include <arpa/inet.h>
46 #include <netinet/in.h>
47 #include <netinet/tcp.h>
48 
49 #include <assert.h>
50 #include <errno.h>
51 #include <fcntl.h>
52 #include <poll.h>
53 #include <pthread.h>
54 #include <stdarg.h>
55 #include <stdio.h>
56 #include <stdlib.h>
57 #include <string.h>
58 #include <unistd.h>
59 
60 #include <rump/rump.h> /* XXX: for rfork flags */
61 #include <rump/rumpuser.h>
62 #include "rumpuser_int.h"
63 
64 #include "sp_common.c"
65 
/*
 * Compile-time tunables; each may be overridden from the build.
 * MAXCLI sizes the pollfd/spclient tables below (slot 0 holds the
 * listening socket) and is also used as the listen() backlog.
 */
#ifndef MAXCLI
#define MAXCLI 256
#endif
#ifndef MAXWORKER
#define MAXWORKER 128
#endif
#ifndef IDLEWORKER
#define IDLEWORKER 16
#endif
/* handlereq() creates a new worker thread only while nworker is below this */
int rumpsp_maxworker = MAXWORKER;
/* a worker thread exits once this many workers are already idle */
int rumpsp_idleworker = IDLEWORKER;

/* Parallel tables indexed by connection slot. */
static struct pollfd pfdlist[MAXCLI];
static struct spclient spclist[MAXCLI];
/* bumped by spcrelease() on final release; drained by the server loop */
static unsigned int disco;
/* set by rumpuser_sp_fini(); checked by the server loop on listener activity */
static volatile int spfini;

/* copy of the operations vector handed to rumpuser_sp_init() */
static struct rumpuser_sp_ops spops;

/* greeting formatted in rumpuser_sp_init() and sent to every new connection */
static char banner[MAXBANNER];

/* protocol version advertised in the banner */
#define PROTOMAJOR 0
#define PROTOMINOR 3
89 
/*
 * Record of a pending fork.  Created by the RUMPSP_PREFORK handler in
 * handlereq() and consumed by a later HANDSHAKE_FORK handshake that
 * presents a matching pf_auth.
 */
struct prefork {
	/* random cookie returned to the client in the prefork response */
	uint32_t pf_auth[AUTHLEN];
	/* lwp obtained from the rfork done at prefork time */
	struct lwp *pf_lwp;

	LIST_ENTRY(prefork) pf_entries;		/* global list */
	LIST_ENTRY(prefork) pf_spcentries;	/* linked from forking spc */
};
/* All outstanding preforks; list updates are done with pfmtx held. */
static LIST_HEAD(, prefork) preforks = LIST_HEAD_INITIALIZER(preforks);
/* NOTE(review): no pthread_mutex_init() for pfmtx is visible in this file. */
static pthread_mutex_t pfmtx;
99 
100 /*
101  * This version is for the server.  It's optimized for multiple threads
102  * and is *NOT* reentrant wrt to signals.
103  */
104 static int
105 waitresp(struct spclient *spc, struct respwait *rw)
106 {
107 	int spcstate;
108 	int rv = 0;
109 
110 	pthread_mutex_lock(&spc->spc_mtx);
111 	sendunlockl(spc);
112 	while (!rw->rw_done && spc->spc_state != SPCSTATE_DYING) {
113 		pthread_cond_wait(&rw->rw_cv, &spc->spc_mtx);
114 	}
115 	TAILQ_REMOVE(&spc->spc_respwait, rw, rw_entries);
116 	spcstate = spc->spc_state;
117 	pthread_mutex_unlock(&spc->spc_mtx);
118 
119 	pthread_cond_destroy(&rw->rw_cv);
120 
121 	if (rv)
122 		return rv;
123 	if (spcstate == SPCSTATE_DYING)
124 		return ENOTCONN;
125 	return rw->rw_error;
126 }
127 
128 /*
129  * Manual wrappers, since librump does not have access to the
130  * user namespace wrapped interfaces.
131  */
132 
133 static void
134 lwproc_switch(struct lwp *l)
135 {
136 
137 	spops.spop_schedule();
138 	spops.spop_lwproc_switch(l);
139 	spops.spop_unschedule();
140 }
141 
142 static void
143 lwproc_release(void)
144 {
145 
146 	spops.spop_schedule();
147 	spops.spop_lwproc_release();
148 	spops.spop_unschedule();
149 }
150 
151 static int
152 lwproc_rfork(struct spclient *spc, int flags, const char *comm)
153 {
154 	int rv;
155 
156 	spops.spop_schedule();
157 	rv = spops.spop_lwproc_rfork(spc, flags, comm);
158 	spops.spop_unschedule();
159 
160 	return rv;
161 }
162 
163 static int
164 lwproc_newlwp(pid_t pid)
165 {
166 	int rv;
167 
168 	spops.spop_schedule();
169 	rv = spops.spop_lwproc_newlwp(pid);
170 	spops.spop_unschedule();
171 
172 	return rv;
173 }
174 
175 static struct lwp *
176 lwproc_curlwp(void)
177 {
178 	struct lwp *l;
179 
180 	spops.spop_schedule();
181 	l = spops.spop_lwproc_curlwp();
182 	spops.spop_unschedule();
183 
184 	return l;
185 }
186 
187 static pid_t
188 lwproc_getpid(void)
189 {
190 	pid_t p;
191 
192 	spops.spop_schedule();
193 	p = spops.spop_getpid();
194 	spops.spop_unschedule();
195 
196 	return p;
197 }
198 static void
199 lwproc_execnotify(const char *comm)
200 {
201 
202 	spops.spop_schedule();
203 	spops.spop_execnotify(comm);
204 	spops.spop_unschedule();
205 }
206 
207 static void
208 lwproc_procexit(void)
209 {
210 
211 	spops.spop_schedule();
212 	spops.spop_procexit();
213 	spops.spop_unschedule();
214 }
215 
216 static int
217 rumpsyscall(int sysnum, void *data, register_t *retval)
218 {
219 	int rv;
220 
221 	spops.spop_schedule();
222 	rv = spops.spop_syscall(sysnum, data, retval);
223 	spops.spop_unschedule();
224 
225 	return rv;
226 }
227 
228 static uint64_t
229 nextreq(struct spclient *spc)
230 {
231 	uint64_t nw;
232 
233 	pthread_mutex_lock(&spc->spc_mtx);
234 	nw = spc->spc_nextreq++;
235 	pthread_mutex_unlock(&spc->spc_mtx);
236 
237 	return nw;
238 }
239 
240 static void
241 send_error_resp(struct spclient *spc, uint64_t reqno, int error)
242 {
243 	struct rsp_hdr rhdr;
244 
245 	rhdr.rsp_len = sizeof(rhdr);
246 	rhdr.rsp_reqno = reqno;
247 	rhdr.rsp_class = RUMPSP_ERROR;
248 	rhdr.rsp_type = 0;
249 	rhdr.rsp_error = error;
250 
251 	sendlock(spc);
252 	(void)dosend(spc, &rhdr, sizeof(rhdr));
253 	sendunlock(spc);
254 }
255 
256 static int
257 send_handshake_resp(struct spclient *spc, uint64_t reqno, int error)
258 {
259 	struct rsp_hdr rhdr;
260 	int rv;
261 
262 	rhdr.rsp_len = sizeof(rhdr) + sizeof(error);
263 	rhdr.rsp_reqno = reqno;
264 	rhdr.rsp_class = RUMPSP_RESP;
265 	rhdr.rsp_type = RUMPSP_HANDSHAKE;
266 	rhdr.rsp_error = 0;
267 
268 	sendlock(spc);
269 	rv = dosend(spc, &rhdr, sizeof(rhdr));
270 	rv = dosend(spc, &error, sizeof(error));
271 	sendunlock(spc);
272 
273 	return rv;
274 }
275 
276 static int
277 send_syscall_resp(struct spclient *spc, uint64_t reqno, int error,
278 	register_t *retval)
279 {
280 	struct rsp_hdr rhdr;
281 	struct rsp_sysresp sysresp;
282 	int rv;
283 
284 	rhdr.rsp_len = sizeof(rhdr) + sizeof(sysresp);
285 	rhdr.rsp_reqno = reqno;
286 	rhdr.rsp_class = RUMPSP_RESP;
287 	rhdr.rsp_type = RUMPSP_SYSCALL;
288 	rhdr.rsp_sysnum = 0;
289 
290 	sysresp.rsys_error = error;
291 	memcpy(sysresp.rsys_retval, retval, sizeof(sysresp.rsys_retval));
292 
293 	sendlock(spc);
294 	rv = dosend(spc, &rhdr, sizeof(rhdr));
295 	rv = dosend(spc, &sysresp, sizeof(sysresp));
296 	sendunlock(spc);
297 
298 	return rv;
299 }
300 
301 static int
302 send_prefork_resp(struct spclient *spc, uint64_t reqno, uint32_t *auth)
303 {
304 	struct rsp_hdr rhdr;
305 	int rv;
306 
307 	rhdr.rsp_len = sizeof(rhdr) + AUTHLEN*sizeof(*auth);
308 	rhdr.rsp_reqno = reqno;
309 	rhdr.rsp_class = RUMPSP_RESP;
310 	rhdr.rsp_type = RUMPSP_PREFORK;
311 	rhdr.rsp_sysnum = 0;
312 
313 	sendlock(spc);
314 	rv = dosend(spc, &rhdr, sizeof(rhdr));
315 	rv = dosend(spc, auth, AUTHLEN*sizeof(*auth));
316 	sendunlock(spc);
317 
318 	return rv;
319 }
320 
321 static int
322 copyin_req(struct spclient *spc, const void *remaddr, size_t *dlen,
323 	int wantstr, void **resp)
324 {
325 	struct rsp_hdr rhdr;
326 	struct rsp_copydata copydata;
327 	struct respwait rw;
328 	int rv;
329 
330 	DPRINTF(("copyin_req: %zu bytes from %p\n", *dlen, remaddr));
331 
332 	rhdr.rsp_len = sizeof(rhdr) + sizeof(copydata);
333 	rhdr.rsp_class = RUMPSP_REQ;
334 	if (wantstr)
335 		rhdr.rsp_type = RUMPSP_COPYINSTR;
336 	else
337 		rhdr.rsp_type = RUMPSP_COPYIN;
338 	rhdr.rsp_sysnum = 0;
339 
340 	copydata.rcp_addr = __UNCONST(remaddr);
341 	copydata.rcp_len = *dlen;
342 
343 	putwait(spc, &rw, &rhdr);
344 	rv = dosend(spc, &rhdr, sizeof(rhdr));
345 	rv = dosend(spc, &copydata, sizeof(copydata));
346 	if (rv) {
347 		unputwait(spc, &rw);
348 		return rv;
349 	}
350 
351 	rv = waitresp(spc, &rw);
352 
353 	DPRINTF(("copyin: response %d\n", rv));
354 
355 	*resp = rw.rw_data;
356 	if (wantstr)
357 		*dlen = rw.rw_dlen;
358 
359 	return rv;
360 
361 }
362 
363 static int
364 send_copyout_req(struct spclient *spc, const void *remaddr,
365 	const void *data, size_t dlen)
366 {
367 	struct rsp_hdr rhdr;
368 	struct rsp_copydata copydata;
369 	int rv;
370 
371 	DPRINTF(("copyout_req (async): %zu bytes to %p\n", dlen, remaddr));
372 
373 	rhdr.rsp_len = sizeof(rhdr) + sizeof(copydata) + dlen;
374 	rhdr.rsp_reqno = nextreq(spc);
375 	rhdr.rsp_class = RUMPSP_REQ;
376 	rhdr.rsp_type = RUMPSP_COPYOUT;
377 	rhdr.rsp_sysnum = 0;
378 
379 	copydata.rcp_addr = __UNCONST(remaddr);
380 	copydata.rcp_len = dlen;
381 
382 	sendlock(spc);
383 	rv = dosend(spc, &rhdr, sizeof(rhdr));
384 	rv = dosend(spc, &copydata, sizeof(copydata));
385 	rv = dosend(spc, data, dlen);
386 	sendunlock(spc);
387 
388 	return rv;
389 }
390 
391 static int
392 anonmmap_req(struct spclient *spc, size_t howmuch, void **resp)
393 {
394 	struct rsp_hdr rhdr;
395 	struct respwait rw;
396 	int rv;
397 
398 	DPRINTF(("anonmmap_req: %zu bytes\n", howmuch));
399 
400 	rhdr.rsp_len = sizeof(rhdr) + sizeof(howmuch);
401 	rhdr.rsp_class = RUMPSP_REQ;
402 	rhdr.rsp_type = RUMPSP_ANONMMAP;
403 	rhdr.rsp_sysnum = 0;
404 
405 	putwait(spc, &rw, &rhdr);
406 	rv = dosend(spc, &rhdr, sizeof(rhdr));
407 	rv = dosend(spc, &howmuch, sizeof(howmuch));
408 	if (rv) {
409 		unputwait(spc, &rw);
410 		return rv;
411 	}
412 
413 	rv = waitresp(spc, &rw);
414 
415 	*resp = rw.rw_data;
416 
417 	DPRINTF(("anonmmap: mapped at %p\n", **(void ***)resp));
418 
419 	return rv;
420 }
421 
422 static int
423 send_raise_req(struct spclient *spc, int signo)
424 {
425 	struct rsp_hdr rhdr;
426 	int rv;
427 
428 	rhdr.rsp_len = sizeof(rhdr);
429 	rhdr.rsp_class = RUMPSP_REQ;
430 	rhdr.rsp_type = RUMPSP_RAISE;
431 	rhdr.rsp_signo = signo;
432 
433 	sendlock(spc);
434 	rv = dosend(spc, &rhdr, sizeof(rhdr));
435 	sendunlock(spc);
436 
437 	return rv;
438 }
439 
440 static void
441 spcref(struct spclient *spc)
442 {
443 
444 	pthread_mutex_lock(&spc->spc_mtx);
445 	spc->spc_refcnt++;
446 	pthread_mutex_unlock(&spc->spc_mtx);
447 }
448 
449 static void
450 spcrelease(struct spclient *spc)
451 {
452 	int ref;
453 
454 	pthread_mutex_lock(&spc->spc_mtx);
455 	ref = --spc->spc_refcnt;
456 	pthread_mutex_unlock(&spc->spc_mtx);
457 
458 	if (ref > 0)
459 		return;
460 
461 	DPRINTF(("rump_sp: spcrelease: spc %p fd %d\n", spc, spc->spc_fd));
462 
463 	_DIAGASSERT(TAILQ_EMPTY(&spc->spc_respwait));
464 	_DIAGASSERT(spc->spc_buf == NULL);
465 
466 	if (spc->spc_mainlwp) {
467 		lwproc_switch(spc->spc_mainlwp);
468 		lwproc_release();
469 	}
470 	spc->spc_mainlwp = NULL;
471 
472 	close(spc->spc_fd);
473 	spc->spc_fd = -1;
474 	spc->spc_state = SPCSTATE_NEW;
475 
476 	atomic_inc_uint(&disco);
477 }
478 
479 static void
480 serv_handledisco(unsigned int idx)
481 {
482 	struct spclient *spc = &spclist[idx];
483 
484 	DPRINTF(("rump_sp: disconnecting [%u]\n", idx));
485 
486 	pfdlist[idx].fd = -1;
487 	pfdlist[idx].revents = 0;
488 	pthread_mutex_lock(&spc->spc_mtx);
489 	spc->spc_state = SPCSTATE_DYING;
490 	kickall(spc);
491 	sendunlockl(spc);
492 	pthread_mutex_unlock(&spc->spc_mtx);
493 
494 	if (spc->spc_mainlwp) {
495 		lwproc_switch(spc->spc_mainlwp);
496 		lwproc_procexit();
497 		lwproc_switch(NULL);
498 	}
499 
500 	/*
501 	 * Nobody's going to attempt to send/receive anymore,
502 	 * so reinit info relevant to that.
503 	 */
504 	/*LINTED:pointer casts may be ok*/
505 	memset((char *)spc + SPC_ZEROFF, 0, sizeof(*spc) - SPC_ZEROFF);
506 
507 	spcrelease(spc);
508 }
509 
510 static void
511 serv_shutdown(void)
512 {
513 	struct spclient *spc;
514 	unsigned int i;
515 
516 	for (i = 1; i < MAXCLI; i++) {
517 		spc = &spclist[i];
518 		if (spc->spc_fd == -1)
519 			continue;
520 
521 		shutdown(spc->spc_fd, SHUT_RDWR);
522 		serv_handledisco(i);
523 
524 		spcrelease(spc);
525 	}
526 }
527 
528 static unsigned
529 serv_handleconn(int fd, connecthook_fn connhook, int busy)
530 {
531 	struct sockaddr_storage ss;
532 	socklen_t sl = sizeof(ss);
533 	int newfd, flags;
534 	unsigned i;
535 
536 	/*LINTED: cast ok */
537 	newfd = accept(fd, (struct sockaddr *)&ss, &sl);
538 	if (newfd == -1)
539 		return 0;
540 
541 	if (busy) {
542 		close(newfd); /* EBUSY */
543 		return 0;
544 	}
545 
546 	flags = fcntl(newfd, F_GETFL, 0);
547 	if (fcntl(newfd, F_SETFL, flags | O_NONBLOCK) == -1) {
548 		close(newfd);
549 		return 0;
550 	}
551 
552 	if (connhook(newfd) != 0) {
553 		close(newfd);
554 		return 0;
555 	}
556 
557 	/* write out a banner for the client */
558 	if (send(newfd, banner, strlen(banner), MSG_NOSIGNAL)
559 	    != (ssize_t)strlen(banner)) {
560 		close(newfd);
561 		return 0;
562 	}
563 
564 	/* find empty slot the simple way */
565 	for (i = 0; i < MAXCLI; i++) {
566 		if (pfdlist[i].fd == -1 && spclist[i].spc_state == SPCSTATE_NEW)
567 			break;
568 	}
569 
570 	assert(i < MAXCLI);
571 
572 	pfdlist[i].fd = newfd;
573 	spclist[i].spc_fd = newfd;
574 	spclist[i].spc_istatus = SPCSTATUS_BUSY; /* dedicated receiver */
575 	spclist[i].spc_refcnt = 1;
576 
577 	TAILQ_INIT(&spclist[i].spc_respwait);
578 
579 	DPRINTF(("rump_sp: added new connection fd %d at idx %u\n", newfd, i));
580 
581 	return i;
582 }
583 
584 static void
585 serv_handlesyscall(struct spclient *spc, struct rsp_hdr *rhdr, uint8_t *data)
586 {
587 	register_t retval[2] = {0, 0};
588 	int rv, sysnum;
589 
590 	sysnum = (int)rhdr->rsp_sysnum;
591 	DPRINTF(("rump_sp: handling syscall %d from client %d\n",
592 	    sysnum, spc->spc_pid));
593 
594 	lwproc_newlwp(spc->spc_pid);
595 	spc->spc_syscallreq = rhdr->rsp_reqno;
596 	rv = rumpsyscall(sysnum, data, retval);
597 	spc->spc_syscallreq = 0;
598 	lwproc_release();
599 
600 	DPRINTF(("rump_sp: got return value %d & %d/%d\n",
601 	    rv, retval[0], retval[1]));
602 
603 	send_syscall_resp(spc, rhdr->rsp_reqno, rv, retval);
604 }
605 
/*
 * One queued syscall request, handed from the server loop to a worker
 * thread.  The worker frees both sba_data and the structure itself.
 */
struct sysbouncearg {
	/* client the request came from; a reference is held while queued */
	struct spclient *sba_spc;
	/* copy of the request header */
	struct rsp_hdr sba_hdr;
	/* request payload, taken over from the client's receive buffer */
	uint8_t *sba_data;

	TAILQ_ENTRY(sysbouncearg) sba_entries;
};
/* sbamtx guards the queue and counters below; sbacv wakes idle workers */
static pthread_mutex_t sbamtx;
static pthread_cond_t sbacv;
/* worker threads alive / workers waiting for work / queued requests */
static int nworker, idleworker, nwork;
static TAILQ_HEAD(, sysbouncearg) syslist = TAILQ_HEAD_INITIALIZER(syslist);
617 
618 /*ARGSUSED*/
619 static void *
620 serv_syscallbouncer(void *arg)
621 {
622 	struct sysbouncearg *sba;
623 
624 	for (;;) {
625 		pthread_mutex_lock(&sbamtx);
626 		if (__predict_false(idleworker >= rumpsp_idleworker)) {
627 			nworker--;
628 			pthread_mutex_unlock(&sbamtx);
629 			break;
630 		}
631 		idleworker++;
632 		while (TAILQ_EMPTY(&syslist)) {
633 			_DIAGASSERT(nwork == 0);
634 			pthread_cond_wait(&sbacv, &sbamtx);
635 		}
636 		idleworker--;
637 
638 		sba = TAILQ_FIRST(&syslist);
639 		TAILQ_REMOVE(&syslist, sba, sba_entries);
640 		nwork--;
641 		pthread_mutex_unlock(&sbamtx);
642 
643 		serv_handlesyscall(sba->sba_spc,
644 		    &sba->sba_hdr, sba->sba_data);
645 		spcrelease(sba->sba_spc);
646 		free(sba->sba_data);
647 		free(sba);
648 	}
649 
650 	return NULL;
651 }
652 
/*
 * Common backend for copyin and copyinstr.  Fetches data from client
 * address raddr into laddr.  On entry *len is the number of bytes to
 * fetch (for wantstr: the capacity of laddr); for wantstr it is
 * updated to the length reported by the client.
 *
 * The rump kernel lock state is released around the request and
 * re-acquired before returning.
 *
 * Returns 0 on success and EFAULT on any failure.
 */
static int
sp_copyin(void *arg, const void *raddr, void *laddr, size_t *len, int wantstr)
{
	struct spclient *spc = arg;
	void *rdata = NULL; /* XXXuninit */
	size_t maxlen = *len;
	int rv, nlocks;

	rumpuser__kunlock(0, &nlocks, NULL);

	rv = copyin_req(spc, raddr, len, wantstr, &rdata);
	if (rv)
		goto out;

	/*
	 * For strings the client tells us how long the result is.  Do
	 * not trust it to stay within the caller's buffer.
	 */
	if (*len > maxlen) {
		free(rdata);
		rv = EFAULT;
		goto out;
	}

	memcpy(laddr, rdata, *len);
	free(rdata);

 out:
	rumpuser__klock(nlocks, NULL);
	if (rv)
		return EFAULT;
	return 0;
}
675 
/*
 * Copy len bytes from client address raddr to local address laddr.
 * Thin wrapper over sp_copyin() in non-string mode.
 */
int
rumpuser_sp_copyin(void *arg, const void *raddr, void *laddr, size_t len)
{
	size_t nbytes = len;

	return sp_copyin(arg, raddr, laddr, &nbytes, 0);
}
682 
/*
 * Copy a string from client address raddr to local address laddr.
 * Thin wrapper over sp_copyin() in string mode; *len is passed through
 * and may be updated by it.
 */
int
rumpuser_sp_copyinstr(void *arg, const void *raddr, void *laddr, size_t *len)
{
	const int wantstr = 1;

	return sp_copyin(arg, raddr, laddr, len, wantstr);
}
689 
/*
 * Common backend for copyout and copyoutstr.  Sends dlen bytes from
 * local address laddr to be stored at client address raddr, with the
 * rump kernel lock state released around the send.
 *
 * Returns 0 on success and EFAULT if the request could not be sent.
 */
static int
sp_copyout(void *arg, const void *laddr, void *raddr, size_t dlen)
{
	struct spclient *spc = arg;
	int nlocks, error;

	rumpuser__kunlock(0, &nlocks, NULL);
	error = send_copyout_req(spc, raddr, laddr, dlen);
	rumpuser__klock(nlocks, NULL);

	return error ? EFAULT : 0;
}
704 
/*
 * Copy dlen bytes from local address laddr to client address raddr.
 * Thin wrapper over sp_copyout().
 */
int
rumpuser_sp_copyout(void *arg, const void *laddr, void *raddr, size_t dlen)
{
	int error;

	error = sp_copyout(arg, laddr, raddr, dlen);
	return error;
}
711 
/*
 * Copy a string of *dlen bytes from local address laddr to client
 * address raddr.  Thin wrapper over sp_copyout(); *dlen is only read.
 */
int
rumpuser_sp_copyoutstr(void *arg, const void *laddr, void *raddr, size_t *dlen)
{
	const size_t nbytes = *dlen;

	return sp_copyout(arg, laddr, raddr, nbytes);
}
718 
/*
 * Have the client create an anonymous mapping of howmuch bytes, with
 * the rump kernel lock state released around the request.
 *
 * Returns EFAULT if the request failed (*addr is then left untouched).
 * Otherwise *addr receives the address reported by the client and the
 * result is ENOMEM if that address is NULL, else 0.
 */
int
rumpuser_sp_anonmmap(void *arg, size_t howmuch, void **addr)
{
	struct spclient *spc = arg;
	void *reply, *mapped;
	int nlocks, error;

	rumpuser__kunlock(0, &nlocks, NULL);

	if (anonmmap_req(spc, howmuch, &reply) != 0) {
		error = EFAULT;
	} else {
		/* the response payload is the mapped address itself */
		mapped = *(void **)reply;
		free(reply);

		error = (mapped == NULL) ? ENOMEM : 0;
		*addr = mapped;
	}

	rumpuser__klock(nlocks, NULL);

	return error;
}
750 
/*
 * Ask the client behind `arg` to raise signal signo, with the rump
 * kernel lock state released around the send.  Returns the result of
 * send_raise_req().
 */
int
rumpuser_sp_raise(void *arg, int signo)
{
	struct spclient *client = arg;
	int nlocks, error;

	rumpuser__kunlock(0, &nlocks, NULL);
	error = send_raise_req(client, signo);
	rumpuser__klock(nlocks, NULL);

	return error;
}
763 
764 /*
765  *
766  * Startup routines and mainloop for server.
767  *
768  */
769 
/* Startup arguments passed from rumpuser_sp_init() to the server thread. */
struct spservarg {
	int sps_sock;			/* bound, listening socket */
	connecthook_fn sps_connhook;	/* called on every accepted socket */
};

/* attributes for worker threads; set up as detached by the server thread */
static pthread_attr_t pattr_detached;
776 static void
777 handlereq(struct spclient *spc)
778 {
779 	struct sysbouncearg *sba;
780 	pthread_t pt;
781 	uint64_t reqno;
782 	int retries, error, i;
783 
784 	reqno = spc->spc_hdr.rsp_reqno;
785 	if (__predict_false(spc->spc_state == SPCSTATE_NEW)) {
786 		if (spc->spc_hdr.rsp_type != RUMPSP_HANDSHAKE) {
787 			send_error_resp(spc, reqno, EAUTH);
788 			shutdown(spc->spc_fd, SHUT_RDWR);
789 			spcfreebuf(spc);
790 			return;
791 		}
792 
793 		if (spc->spc_hdr.rsp_handshake == HANDSHAKE_GUEST) {
794 			char *comm = (char *)spc->spc_buf;
795 			size_t commlen = spc->spc_hdr.rsp_len - HDRSZ;
796 
797 			/* ensure it's 0-terminated */
798 			/* XXX make sure it contains sensible chars? */
799 			comm[commlen] = '\0';
800 
801 			if ((error = lwproc_rfork(spc,
802 			    RUMP_RFCFDG, comm)) != 0) {
803 				shutdown(spc->spc_fd, SHUT_RDWR);
804 			}
805 
806 			spcfreebuf(spc);
807 			if (error)
808 				return;
809 
810 			spc->spc_mainlwp = lwproc_curlwp();
811 
812 			send_handshake_resp(spc, reqno, 0);
813 		} else if (spc->spc_hdr.rsp_handshake == HANDSHAKE_FORK) {
814 			struct lwp *tmpmain;
815 			struct prefork *pf;
816 			struct handshake_fork *rfp;
817 			int cancel;
818 
819 			if (spc->spc_off-HDRSZ != sizeof(*rfp)) {
820 				send_error_resp(spc, reqno, EINVAL);
821 				shutdown(spc->spc_fd, SHUT_RDWR);
822 				spcfreebuf(spc);
823 				return;
824 			}
825 
826 			/*LINTED*/
827 			rfp = (void *)spc->spc_buf;
828 			cancel = rfp->rf_cancel;
829 
830 			pthread_mutex_lock(&pfmtx);
831 			LIST_FOREACH(pf, &preforks, pf_entries) {
832 				if (memcmp(rfp->rf_auth, pf->pf_auth,
833 				    sizeof(rfp->rf_auth)) == 0) {
834 					LIST_REMOVE(pf, pf_entries);
835 					LIST_REMOVE(pf, pf_spcentries);
836 					break;
837 				}
838 			}
839 			pthread_mutex_lock(&pfmtx);
840 			spcfreebuf(spc);
841 
842 			if (!pf) {
843 				send_error_resp(spc, reqno, ESRCH);
844 				shutdown(spc->spc_fd, SHUT_RDWR);
845 				return;
846 			}
847 
848 			tmpmain = pf->pf_lwp;
849 			free(pf);
850 			lwproc_switch(tmpmain);
851 			if (cancel) {
852 				lwproc_release();
853 				shutdown(spc->spc_fd, SHUT_RDWR);
854 				return;
855 			}
856 
857 			/*
858 			 * So, we forked already during "prefork" to save
859 			 * the file descriptors from a parent exit
860 			 * race condition.  But now we need to fork
861 			 * a second time since the initial fork has
862 			 * the wrong spc pointer.  (yea, optimize
863 			 * interfaces some day if anyone cares)
864 			 */
865 			if ((error = lwproc_rfork(spc, 0, NULL)) != 0) {
866 				send_error_resp(spc, reqno, error);
867 				shutdown(spc->spc_fd, SHUT_RDWR);
868 				lwproc_release();
869 				return;
870 			}
871 			spc->spc_mainlwp = lwproc_curlwp();
872 			lwproc_switch(tmpmain);
873 			lwproc_release();
874 			lwproc_switch(spc->spc_mainlwp);
875 
876 			send_handshake_resp(spc, reqno, 0);
877 		}
878 
879 		spc->spc_pid = lwproc_getpid();
880 
881 		DPRINTF(("rump_sp: handshake for client %p complete, pid %d\n",
882 		    spc, spc->spc_pid));
883 
884 		lwproc_switch(NULL);
885 		spc->spc_state = SPCSTATE_RUNNING;
886 		return;
887 	}
888 
889 	if (__predict_false(spc->spc_hdr.rsp_type == RUMPSP_PREFORK)) {
890 		struct prefork *pf;
891 		uint32_t auth[AUTHLEN];
892 
893 		DPRINTF(("rump_sp: prefork handler executing for %p\n", spc));
894 		spcfreebuf(spc);
895 
896 		pf = malloc(sizeof(*pf));
897 		if (pf == NULL) {
898 			send_error_resp(spc, reqno, ENOMEM);
899 			return;
900 		}
901 
902 		/*
903 		 * Use client main lwp to fork.  this is never used by
904 		 * worker threads (except if spc refcount goes to 0),
905 		 * so we can safely use it here.
906 		 */
907 		lwproc_switch(spc->spc_mainlwp);
908 		if ((error = lwproc_rfork(spc, RUMP_RFFDG, NULL)) != 0) {
909 			DPRINTF(("rump_sp: fork failed: %d (%p)\n",error, spc));
910 			send_error_resp(spc, reqno, error);
911 			lwproc_switch(NULL);
912 			free(pf);
913 			return;
914 		}
915 
916 		/* Ok, we have a new process context and a new curlwp */
917 		for (i = 0; i < AUTHLEN; i++) {
918 			pf->pf_auth[i] = auth[i] = arc4random();
919 		}
920 		pf->pf_lwp = lwproc_curlwp();
921 		lwproc_switch(NULL);
922 
923 		pthread_mutex_lock(&pfmtx);
924 		LIST_INSERT_HEAD(&preforks, pf, pf_entries);
925 		LIST_INSERT_HEAD(&spc->spc_pflist, pf, pf_spcentries);
926 		pthread_mutex_unlock(&pfmtx);
927 
928 		DPRINTF(("rump_sp: prefork handler success %p\n", spc));
929 
930 		send_prefork_resp(spc, reqno, auth);
931 		return;
932 	}
933 
934 	if (__predict_false(spc->spc_hdr.rsp_type == RUMPSP_HANDSHAKE)) {
935 		char *comm = (char *)spc->spc_buf;
936 		size_t commlen = spc->spc_hdr.rsp_len - HDRSZ;
937 
938 		if (spc->spc_hdr.rsp_handshake != HANDSHAKE_EXEC) {
939 			send_error_resp(spc, reqno, EINVAL);
940 			spcfreebuf(spc);
941 			return;
942 		}
943 
944 		/* ensure it's 0-terminated */
945 		/* XXX make sure it contains sensible chars? */
946 		comm[commlen] = '\0';
947 
948 		lwproc_switch(spc->spc_mainlwp);
949 		lwproc_execnotify(comm);
950 		lwproc_switch(NULL);
951 
952 		send_handshake_resp(spc, reqno, 0);
953 		spcfreebuf(spc);
954 		return;
955 	}
956 
957 	if (__predict_false(spc->spc_hdr.rsp_type != RUMPSP_SYSCALL)) {
958 		send_error_resp(spc, reqno, EINVAL);
959 		spcfreebuf(spc);
960 		return;
961 	}
962 
963 	retries = 0;
964 	while ((sba = malloc(sizeof(*sba))) == NULL) {
965 		if (nworker == 0 || retries > 10) {
966 			send_error_resp(spc, reqno, EAGAIN);
967 			spcfreebuf(spc);
968 			return;
969 		}
970 		/* slim chance of more memory? */
971 		usleep(10000);
972 	}
973 
974 	sba->sba_spc = spc;
975 	sba->sba_hdr = spc->spc_hdr;
976 	sba->sba_data = spc->spc_buf;
977 	spcresetbuf(spc);
978 
979 	spcref(spc);
980 
981 	pthread_mutex_lock(&sbamtx);
982 	TAILQ_INSERT_TAIL(&syslist, sba, sba_entries);
983 	nwork++;
984 	if (nwork <= idleworker) {
985 		/* do we have a daemon's tool (i.e. idle threads)? */
986 		pthread_cond_signal(&sbacv);
987 	} else if (nworker < rumpsp_maxworker) {
988 		/*
989 		 * Else, need to create one
990 		 * (if we can, otherwise just expect another
991 		 * worker to pick up the syscall)
992 		 */
993 		if (pthread_create(&pt, &pattr_detached,
994 		    serv_syscallbouncer, NULL) == 0) {
995 			nworker++;
996 		}
997 	}
998 	pthread_mutex_unlock(&sbamtx);
999 }
1000 
1001 static void *
1002 spserver(void *arg)
1003 {
1004 	struct spservarg *sarg = arg;
1005 	struct spclient *spc;
1006 	unsigned idx;
1007 	int seen;
1008 	int rv;
1009 	unsigned int nfds, maxidx;
1010 
1011 	for (idx = 0; idx < MAXCLI; idx++) {
1012 		pfdlist[idx].fd = -1;
1013 		pfdlist[idx].events = POLLIN;
1014 
1015 		spc = &spclist[idx];
1016 		pthread_mutex_init(&spc->spc_mtx, NULL);
1017 		pthread_cond_init(&spc->spc_cv, NULL);
1018 		spc->spc_fd = -1;
1019 	}
1020 	pfdlist[0].fd = spclist[0].spc_fd = sarg->sps_sock;
1021 	pfdlist[0].events = POLLIN;
1022 	nfds = 1;
1023 	maxidx = 0;
1024 
1025 	pthread_attr_init(&pattr_detached);
1026 	pthread_attr_setdetachstate(&pattr_detached, PTHREAD_CREATE_DETACHED);
1027 	/* XXX: doesn't stacksize currently work on NetBSD */
1028 	pthread_attr_setstacksize(&pattr_detached, 32*1024);
1029 
1030 	pthread_mutex_init(&sbamtx, NULL);
1031 	pthread_cond_init(&sbacv, NULL);
1032 
1033 	DPRINTF(("rump_sp: server mainloop\n"));
1034 
1035 	for (;;) {
1036 		int discoed;
1037 
1038 		/* g/c hangarounds (eventually) */
1039 		discoed = atomic_swap_uint(&disco, 0);
1040 		while (discoed--) {
1041 			nfds--;
1042 			idx = maxidx;
1043 			while (idx) {
1044 				if (pfdlist[idx].fd != -1) {
1045 					maxidx = idx;
1046 					break;
1047 				}
1048 				idx--;
1049 			}
1050 			DPRINTF(("rump_sp: set maxidx to [%u]\n",
1051 			    maxidx));
1052 		}
1053 
1054 		DPRINTF(("rump_sp: loop nfd %d\n", maxidx+1));
1055 		seen = 0;
1056 		rv = poll(pfdlist, maxidx+1, INFTIM);
1057 		assert(maxidx+1 <= MAXCLI);
1058 		assert(rv != 0);
1059 		if (rv == -1) {
1060 			if (errno == EINTR)
1061 				continue;
1062 			fprintf(stderr, "rump_spserver: poll returned %d\n",
1063 			    errno);
1064 			break;
1065 		}
1066 
1067 		for (idx = 0; seen < rv && idx < MAXCLI; idx++) {
1068 			if ((pfdlist[idx].revents & POLLIN) == 0)
1069 				continue;
1070 
1071 			seen++;
1072 			DPRINTF(("rump_sp: activity at [%u] %d/%d\n",
1073 			    idx, seen, rv));
1074 			if (idx > 0) {
1075 				spc = &spclist[idx];
1076 				DPRINTF(("rump_sp: mainloop read [%u]\n", idx));
1077 				switch (readframe(spc)) {
1078 				case 0:
1079 					break;
1080 				case -1:
1081 					serv_handledisco(idx);
1082 					break;
1083 				default:
1084 					switch (spc->spc_hdr.rsp_class) {
1085 					case RUMPSP_RESP:
1086 						kickwaiter(spc);
1087 						break;
1088 					case RUMPSP_REQ:
1089 						handlereq(spc);
1090 						break;
1091 					default:
1092 						send_error_resp(spc,
1093 						    spc->spc_hdr.rsp_reqno,
1094 						    ENOENT);
1095 						spcfreebuf(spc);
1096 						break;
1097 					}
1098 					break;
1099 				}
1100 
1101 			} else {
1102 				DPRINTF(("rump_sp: mainloop new connection\n"));
1103 
1104 				if (__predict_false(spfini)) {
1105 					close(spclist[0].spc_fd);
1106 					serv_shutdown();
1107 					goto out;
1108 				}
1109 
1110 				idx = serv_handleconn(pfdlist[0].fd,
1111 				    sarg->sps_connhook, nfds == MAXCLI);
1112 				if (idx)
1113 					nfds++;
1114 				if (idx > maxidx)
1115 					maxidx = idx;
1116 				DPRINTF(("rump_sp: maxid now %d\n", maxidx));
1117 			}
1118 		}
1119 	}
1120 
1121  out:
1122 	return NULL;
1123 }
1124 
1125 static unsigned cleanupidx;
1126 static struct sockaddr *cleanupsa;
1127 int
1128 rumpuser_sp_init(const char *url, const struct rumpuser_sp_ops *spopsp,
1129 	const char *ostype, const char *osrelease, const char *machine)
1130 {
1131 	pthread_t pt;
1132 	struct spservarg *sarg;
1133 	struct sockaddr *sap;
1134 	char *p;
1135 	unsigned idx;
1136 	int error, s;
1137 
1138 	p = strdup(url);
1139 	if (p == NULL)
1140 		return ENOMEM;
1141 	error = parseurl(p, &sap, &idx, 1);
1142 	free(p);
1143 	if (error)
1144 		return error;
1145 
1146 	snprintf(banner, sizeof(banner), "RUMPSP-%d.%d-%s-%s/%s\n",
1147 	    PROTOMAJOR, PROTOMINOR, ostype, osrelease, machine);
1148 
1149 	s = socket(parsetab[idx].domain, SOCK_STREAM, 0);
1150 	if (s == -1)
1151 		return errno;
1152 
1153 	spops = *spopsp;
1154 	sarg = malloc(sizeof(*sarg));
1155 	if (sarg == NULL) {
1156 		close(s);
1157 		return ENOMEM;
1158 	}
1159 
1160 	sarg->sps_sock = s;
1161 	sarg->sps_connhook = parsetab[idx].connhook;
1162 
1163 	cleanupidx = idx;
1164 	cleanupsa = sap;
1165 
1166 	/* sloppy error recovery */
1167 
1168 	/*LINTED*/
1169 	if (bind(s, sap, sap->sa_len) == -1) {
1170 		fprintf(stderr, "rump_sp: server bind failed\n");
1171 		return errno;
1172 	}
1173 
1174 	if (listen(s, MAXCLI) == -1) {
1175 		fprintf(stderr, "rump_sp: server listen failed\n");
1176 		return errno;
1177 	}
1178 
1179 	if ((error = pthread_create(&pt, NULL, spserver, sarg)) != 0) {
1180 		fprintf(stderr, "rump_sp: cannot create wrkr thread\n");
1181 		return errno;
1182 	}
1183 	pthread_detach(pt);
1184 
1185 	return 0;
1186 }
1187 
1188 void
1189 rumpuser_sp_fini(void *arg)
1190 {
1191 	struct spclient *spc = arg;
1192 	register_t retval[2] = {0, 0};
1193 
1194 	if (spclist[0].spc_fd) {
1195 		parsetab[cleanupidx].cleanup(cleanupsa);
1196 	}
1197 
1198 	/*
1199 	 * stuff response into the socket, since this process is just
1200 	 * about to exit
1201 	 */
1202 	if (spc && spc->spc_syscallreq)
1203 		send_syscall_resp(spc, spc->spc_syscallreq, 0, retval);
1204 
1205 	if (spclist[0].spc_fd) {
1206 		shutdown(spclist[0].spc_fd, SHUT_RDWR);
1207 		spfini = 1;
1208 	}
1209 }
1210