xref: /netbsd-src/lib/librumpclient/rumpclient.c (revision 6cf6fe02a981b55727c49c3d37b0d8191a98c0ee)
1 /*      $NetBSD: rumpclient.c,v 1.62 2014/04/25 12:20:12 pooka Exp $	*/
2 
3 /*
4  * Copyright (c) 2010, 2011 Antti Kantee.  All Rights Reserved.
5  *
6  * Redistribution and use in source and binary forms, with or without
7  * modification, are permitted provided that the following conditions
8  * are met:
9  * 1. Redistributions of source code must retain the above copyright
10  *    notice, this list of conditions and the following disclaimer.
11  * 2. Redistributions in binary form must reproduce the above copyright
12  *    notice, this list of conditions and the following disclaimer in the
13  *    documentation and/or other materials provided with the distribution.
14  *
15  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
16  * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
17  * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
18  * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
19  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
20  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
21  * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
22  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
23  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
24  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
25  * SUCH DAMAGE.
26  */
27 
28 /*
29  * Client side routines for rump syscall proxy.
30  */
31 
32 #include <rump/rumpuser_port.h>
33 
34 /*
35  * We use kqueue on NetBSD, poll elsewhere.  Theoretically we could
36  * use kqueue on other BSD's too, but I haven't tested those.  We
37  * want to use kqueue because it will give us the ability to get signal
38  * notifications but defer their handling to a stage where we do not
39  * hold the communication lock.  Taking a signal while holding on to
40  * that lock may cause a deadlock.  Therefore, block signals throughout
41  * the RPC when using poll.  On Linux, we use signalfd in the same role
42  * as kqueue on NetBSD to be able to take signals while waiting for a
43  * response from the server.
44  */
45 
46 #ifdef __NetBSD__
47 #define USE_KQUEUE
48 #endif
49 #if defined(__linux__) && !defined(__ANDROID__)
50 #define USE_SIGNALFD
51 #endif
52 
53 __RCSID("$NetBSD: rumpclient.c,v 1.62 2014/04/25 12:20:12 pooka Exp $");
54 
55 #include <sys/param.h>
56 #include <sys/mman.h>
57 #include <sys/socket.h>
58 #include <sys/time.h>
59 
60 #ifdef USE_KQUEUE
61 #include <sys/event.h>
62 #endif
63 
64 #include <arpa/inet.h>
65 #include <netinet/in.h>
66 #include <netinet/tcp.h>
67 
68 #include <assert.h>
69 #include <dlfcn.h>
70 #include <errno.h>
71 #include <fcntl.h>
72 #include <poll.h>
73 #include <pthread.h>
74 #include <signal.h>
75 #include <stdarg.h>
76 #include <stdbool.h>
77 #include <stdio.h>
78 #include <stdlib.h>
79 #include <string.h>
80 #include <unistd.h>
81 
82 #include <rump/rumpclient.h>
83 
/*
 * Pointers to the host's implementations of the calls this library
 * needs.  They are filled in by rumpclient_init() (FINDSYM) and used
 * instead of calling the libc symbols directly.
 * NOTE(review): HOSTOPS is presumably consumed by the included
 * sp_common.c to select these pointers -- confirm there.
 */
#define HOSTOPS
int	(*host_socket)(int, int, int);
int	(*host_close)(int);
int	(*host_connect)(int, const struct sockaddr *, socklen_t);
int	(*host_fcntl)(int, int, ...);
/* Android declares poll() with a long timeout */
#ifdef __ANDROID__
int	(*host_poll)(struct pollfd *, nfds_t, long);
#else
int	(*host_poll)(struct pollfd *, nfds_t, int);
#endif
ssize_t	(*host_read)(int, void *, size_t);
/* Android's sendmsg() prototype differs in return and flags types */
#ifdef __ANDROID__
int	(*host_sendmsg)(int, const struct msghdr *, unsigned int);
#else
ssize_t (*host_sendmsg)(int, const struct msghdr *, int);
#endif
int	(*host_setsockopt)(int, int, int, const void *, socklen_t);
int	(*host_dup)(int);

/* only needed when the kqueue-based wait is compiled in */
#ifdef USE_KQUEUE
int	(*host_kqueue)(void);
int	(*host_kevent)(int, const struct kevent *, size_t,
		       struct kevent *, size_t, const struct timespec *);
#endif

/* only needed when the signalfd-based wait is compiled in */
#ifdef USE_SIGNALFD
#include <sys/signalfd.h>

int	(*host_signalfd)(int, const sigset_t *, int);
#endif

/* used by rumpclient_exec() to perform the actual exec */
int	(*host_execve)(const char *, char *const[], char *const[]);
116 
117 #include "sp_common.c"
118 #include "rumpuser_sigtrans.c"
119 
/* the single connection to the server; spc_fd == -1 means "no socket" */
static struct spclient clispc = {
	.spc_fd = -1,
};

/*
 * Descriptor used to learn about signals while waiting for a response:
 * a kqueue, a signalfd, or -1 when neither is available (see
 * makeholyfd()).
 */
static int holyfd = -1;
/* all signals; filled in by rumpclient_init(), used to block signals */
static sigset_t fullset;

static int doconnect(void);
static int handshake_req(struct spclient *, int, void *, int, bool);

/*
 * Default: don't retry.  Most clients can't handle it
 * (consider e.g. fds suddenly going missing).
 * Changed via rumpclient_setconnretry().
 */
static time_t retrytimo = 0;
135 
136 /* always defined to nothingness for now */
137 #define ERRLOG(a)
138 
139 static int
140 send_with_recon(struct spclient *spc, struct iovec *iov, size_t iovlen)
141 {
142 	struct timeval starttime, curtime;
143 	time_t prevreconmsg;
144 	unsigned reconretries;
145 	int rv;
146 
147 	for (prevreconmsg = 0, reconretries = 0;;) {
148 		rv = dosend(spc, iov, iovlen);
149 		if (__predict_false(rv == ENOTCONN || rv == EBADF)) {
150 			/* no persistent connections */
151 			if (retrytimo == 0) {
152 				rv = ENOTCONN;
153 				break;
154 			}
155 			if (retrytimo == RUMPCLIENT_RETRYCONN_DIE)
156 				_exit(1);
157 
158 			if (!prevreconmsg) {
159 				prevreconmsg = time(NULL);
160 				gettimeofday(&starttime, NULL);
161 			}
162 			if (reconretries == 1) {
163 				if (retrytimo == RUMPCLIENT_RETRYCONN_ONCE) {
164 					rv = ENOTCONN;
165 					break;
166 				}
167 				fprintf(stderr, "rump_sp: connection to "
168 				    "kernel lost, trying to reconnect ...\n");
169 			} else if (time(NULL) - prevreconmsg > 120) {
170 				fprintf(stderr, "rump_sp: still trying to "
171 				    "reconnect ...\n");
172 				prevreconmsg = time(NULL);
173 			}
174 
175 			/* check that we aren't over the limit */
176 			if (retrytimo > 0) {
177 				time_t tdiff;
178 
179 				gettimeofday(&curtime, NULL);
180 				tdiff = curtime.tv_sec - starttime.tv_sec;
181 				if (starttime.tv_usec > curtime.tv_usec)
182 					tdiff--;
183 				if (tdiff >= retrytimo) {
184 					fprintf(stderr, "rump_sp: reconnect "
185 					    "failed, %lld second timeout\n",
186 					    (long long)retrytimo);
187 					return ENOTCONN;
188 				}
189 			}
190 
191 			/* adhoc backoff timer */
192 			if (reconretries < 10) {
193 				usleep(100000 * reconretries);
194 			} else {
195 				sleep(MIN(10, reconretries-9));
196 			}
197 			reconretries++;
198 
199 			if ((rv = doconnect()) != 0)
200 				continue;
201 			if ((rv = handshake_req(&clispc, HANDSHAKE_GUEST,
202 			    NULL, 0, true)) != 0)
203 				continue;
204 
205 			/*
206 			 * ok, reconnect succesful.  we need to return to
207 			 * the upper layer to get the entire PDU resent.
208 			 */
209 			if (reconretries != 1)
210 				fprintf(stderr, "rump_sp: reconnected!\n");
211 			rv = EAGAIN;
212 			break;
213 		} else {
214 			_DIAGASSERT(errno != EAGAIN);
215 			break;
216 		}
217 	}
218 
219 	return rv;
220 }
221 
222 static int
223 cliwaitresp(struct spclient *spc, struct respwait *rw, sigset_t *mask,
224 	bool keeplock)
225 {
226 	uint64_t mygen;
227 	bool imalive = true;
228 
229 	pthread_mutex_lock(&spc->spc_mtx);
230 	if (!keeplock)
231 		sendunlockl(spc);
232 	mygen = spc->spc_generation;
233 
234 	rw->rw_error = 0;
235 	while (!rw->rw_done && rw->rw_error == 0) {
236 		if (__predict_false(spc->spc_generation != mygen || !imalive))
237 			break;
238 
239 		/* are we free to receive? */
240 		if (spc->spc_istatus == SPCSTATUS_FREE) {
241 			int gotresp, dosig, rv;
242 
243 			spc->spc_istatus = SPCSTATUS_BUSY;
244 			pthread_mutex_unlock(&spc->spc_mtx);
245 
246 			dosig = 0;
247 			for (gotresp = 0; !gotresp; ) {
248 #ifdef USE_KQUEUE
249 				struct kevent kev[8];
250 				int i;
251 
252 				/*
253 				 * typically we don't have a frame waiting
254 				 * when we come in here, so call kevent now
255 				 */
256 				rv = host_kevent(holyfd, NULL, 0,
257 				    kev, __arraycount(kev), NULL);
258 
259 				if (__predict_false(rv == -1)) {
260 					goto activity;
261 				}
262 
263 				/*
264 				 * XXX: don't know how this can happen
265 				 * (timeout cannot expire since there
266 				 * isn't one), but it does happen.
267 				 * treat it as an expectional condition
268 				 * and go through tryread to determine
269 				 * alive status.
270 				 */
271 				if (__predict_false(rv == 0))
272 					goto activity;
273 
274 				for (i = 0; i < rv; i++) {
275 					if (kev[i].filter == EVFILT_SIGNAL)
276 						dosig++;
277 				}
278 				if (dosig)
279 					goto cleanup;
280 
281 				/*
282 				 * ok, activity.  try to read a frame to
283 				 * determine what happens next.
284 				 */
285  activity:
286 #else /* !USE_KQUEUE */
287 				struct pollfd pfd[2];
288 
289 				pfd[0].fd = clispc.spc_fd;
290 				pfd[0].events = POLLIN;
291 				pfd[1].fd = holyfd;
292 				pfd[1].events = POLLIN;
293 
294 				rv = host_poll(pfd, 2, -1);
295 				if (rv >= 1 && pfd[1].revents & POLLIN) {
296 					dosig = 1;
297 					goto cleanup;
298 				}
299 #endif /* !USE_KQUEUE */
300 
301 				switch (readframe(spc)) {
302 				case 0:
303 					continue;
304 				case -1:
305 					imalive = false;
306 					goto cleanup;
307 				default:
308 					/* case 1 */
309 					break;
310 				}
311 
312 				switch (spc->spc_hdr.rsp_class) {
313 				case RUMPSP_RESP:
314 				case RUMPSP_ERROR:
315 					kickwaiter(spc);
316 					gotresp = spc->spc_hdr.rsp_reqno ==
317 					    rw->rw_reqno;
318 					break;
319 				case RUMPSP_REQ:
320 					handlereq(spc);
321 					break;
322 				default:
323 					/* panic */
324 					break;
325 				}
326 			}
327 
328  cleanup:
329 			pthread_mutex_lock(&spc->spc_mtx);
330 			if (spc->spc_istatus == SPCSTATUS_WANTED)
331 				kickall(spc);
332 			spc->spc_istatus = SPCSTATUS_FREE;
333 
334 			/* take one for the team */
335 			if (dosig) {
336 				pthread_mutex_unlock(&spc->spc_mtx);
337 				pthread_sigmask(SIG_SETMASK, mask, NULL);
338 				pthread_sigmask(SIG_SETMASK, &fullset, NULL);
339 				pthread_mutex_lock(&spc->spc_mtx);
340 			}
341 		} else {
342 			spc->spc_istatus = SPCSTATUS_WANTED;
343 			pthread_cond_wait(&rw->rw_cv, &spc->spc_mtx);
344 		}
345 	}
346 	TAILQ_REMOVE(&spc->spc_respwait, rw, rw_entries);
347 	pthread_mutex_unlock(&spc->spc_mtx);
348 	pthread_cond_destroy(&rw->rw_cv);
349 
350 	if (spc->spc_generation != mygen || !imalive) {
351 		return ENOTCONN;
352 	}
353 	return rw->rw_error;
354 }
355 
356 static int
357 syscall_req(struct spclient *spc, sigset_t *omask, int sysnum,
358 	const void *data, size_t dlen, void **resp)
359 {
360 	struct rsp_hdr rhdr;
361 	struct respwait rw;
362 	struct iovec iov[2];
363 	int rv;
364 
365 	rhdr.rsp_len = sizeof(rhdr) + dlen;
366 	rhdr.rsp_class = RUMPSP_REQ;
367 	rhdr.rsp_type = RUMPSP_SYSCALL;
368 	rhdr.rsp_sysnum = sysnum;
369 
370 	IOVPUT(iov[0], rhdr);
371 	IOVPUT_WITHSIZE(iov[1], __UNCONST(data), dlen);
372 
373 	do {
374 		putwait(spc, &rw, &rhdr);
375 		if ((rv = send_with_recon(spc, iov, __arraycount(iov))) != 0) {
376 			unputwait(spc, &rw);
377 			continue;
378 		}
379 
380 		rv = cliwaitresp(spc, &rw, omask, false);
381 		if (rv == ENOTCONN)
382 			rv = EAGAIN;
383 	} while (rv == EAGAIN);
384 
385 	*resp = rw.rw_data;
386 	return rv;
387 }
388 
389 static int
390 handshake_req(struct spclient *spc, int type, void *data,
391 	int cancel, bool haslock)
392 {
393 	struct handshake_fork rf;
394 	const char *myprogname = NULL; /* XXXgcc */
395 	struct rsp_hdr rhdr;
396 	struct respwait rw;
397 	sigset_t omask;
398 	size_t bonus;
399 	struct iovec iov[2];
400 	int rv;
401 
402 	if (type == HANDSHAKE_FORK) {
403 		bonus = sizeof(rf);
404 	} else {
405 #ifdef __NetBSD__
406 		/* would procfs work on NetBSD too? */
407 		myprogname = getprogname();
408 #else
409 		int fd = open("/proc/self/comm", O_RDONLY);
410 		if (fd == -1) {
411 			myprogname = "???";
412 		} else {
413 			static char commname[128];
414 
415 			memset(commname, 0, sizeof(commname));
416 			if (read(fd, commname, sizeof(commname)) > 0) {
417 				char *n;
418 
419 				n = strrchr(commname, '\n');
420 				if (n)
421 					*n = '\0';
422 				myprogname = commname;
423 			} else {
424 				myprogname = "???";
425 			}
426 			close(fd);
427 		}
428 #endif
429 		bonus = strlen(myprogname)+1;
430 	}
431 
432 	/* performs server handshake */
433 	rhdr.rsp_len = sizeof(rhdr) + bonus;
434 	rhdr.rsp_class = RUMPSP_REQ;
435 	rhdr.rsp_type = RUMPSP_HANDSHAKE;
436 	rhdr.rsp_handshake = type;
437 
438 	IOVPUT(iov[0], rhdr);
439 
440 	pthread_sigmask(SIG_SETMASK, &fullset, &omask);
441 	if (haslock)
442 		putwait_locked(spc, &rw, &rhdr);
443 	else
444 		putwait(spc, &rw, &rhdr);
445 	if (type == HANDSHAKE_FORK) {
446 		memcpy(rf.rf_auth, data, sizeof(rf.rf_auth)); /* uh, why? */
447 		rf.rf_cancel = cancel;
448 		IOVPUT(iov[1], rf);
449 	} else {
450 		IOVPUT_WITHSIZE(iov[1], __UNCONST(myprogname), bonus);
451 	}
452 	rv = send_with_recon(spc, iov, __arraycount(iov));
453 	if (rv || cancel) {
454 		if (haslock)
455 			unputwait_locked(spc, &rw);
456 		else
457 			unputwait(spc, &rw);
458 		if (cancel) {
459 			goto out;
460 		}
461 	} else {
462 		rv = cliwaitresp(spc, &rw, &omask, haslock);
463 	}
464 	if (rv)
465 		goto out;
466 
467 	rv = *(int *)rw.rw_data;
468 	free(rw.rw_data);
469 
470  out:
471 	pthread_sigmask(SIG_SETMASK, &omask, NULL);
472 	return rv;
473 }
474 
475 static int
476 prefork_req(struct spclient *spc, sigset_t *omask, void **resp)
477 {
478 	struct rsp_hdr rhdr;
479 	struct respwait rw;
480 	struct iovec iov[1];
481 	int rv;
482 
483 	rhdr.rsp_len = sizeof(rhdr);
484 	rhdr.rsp_class = RUMPSP_REQ;
485 	rhdr.rsp_type = RUMPSP_PREFORK;
486 	rhdr.rsp_error = 0;
487 
488 	IOVPUT(iov[0], rhdr);
489 
490 	do {
491 		putwait(spc, &rw, &rhdr);
492 		rv = send_with_recon(spc, iov, __arraycount(iov));
493 		if (rv != 0) {
494 			unputwait(spc, &rw);
495 			continue;
496 		}
497 
498 		rv = cliwaitresp(spc, &rw, omask, false);
499 		if (rv == ENOTCONN)
500 			rv = EAGAIN;
501 	} while (rv == EAGAIN);
502 
503 	*resp = rw.rw_data;
504 	return rv;
505 }
506 
507 /*
508  * prevent response code from deadlocking with reconnect code
509  */
510 static int
511 resp_sendlock(struct spclient *spc)
512 {
513 	int rv = 0;
514 
515 	pthread_mutex_lock(&spc->spc_mtx);
516 	while (spc->spc_ostatus != SPCSTATUS_FREE) {
517 		if (__predict_false(spc->spc_reconnecting)) {
518 			rv = EBUSY;
519 			goto out;
520 		}
521 		spc->spc_ostatus = SPCSTATUS_WANTED;
522 		pthread_cond_wait(&spc->spc_cv, &spc->spc_mtx);
523 	}
524 	spc->spc_ostatus = SPCSTATUS_BUSY;
525 
526  out:
527 	pthread_mutex_unlock(&spc->spc_mtx);
528 	return rv;
529 }
530 
531 static void
532 send_copyin_resp(struct spclient *spc, uint64_t reqno, void *data, size_t dlen,
533 	int wantstr)
534 {
535 	struct rsp_hdr rhdr;
536 	struct iovec iov[2];
537 
538 	if (wantstr)
539 		dlen = MIN(dlen, strlen(data)+1);
540 
541 	rhdr.rsp_len = sizeof(rhdr) + dlen;
542 	rhdr.rsp_reqno = reqno;
543 	rhdr.rsp_class = RUMPSP_RESP;
544 	rhdr.rsp_type = RUMPSP_COPYIN;
545 	rhdr.rsp_sysnum = 0;
546 
547 	IOVPUT(iov[0], rhdr);
548 	IOVPUT_WITHSIZE(iov[1], data, dlen);
549 
550 	if (resp_sendlock(spc) != 0)
551 		return;
552 	(void)SENDIOV(spc, iov);
553 	sendunlock(spc);
554 }
555 
556 static void
557 send_anonmmap_resp(struct spclient *spc, uint64_t reqno, void *addr)
558 {
559 	struct rsp_hdr rhdr;
560 	struct iovec iov[2];
561 
562 	rhdr.rsp_len = sizeof(rhdr) + sizeof(addr);
563 	rhdr.rsp_reqno = reqno;
564 	rhdr.rsp_class = RUMPSP_RESP;
565 	rhdr.rsp_type = RUMPSP_ANONMMAP;
566 	rhdr.rsp_sysnum = 0;
567 
568 	IOVPUT(iov[0], rhdr);
569 	IOVPUT(iov[1], addr);
570 
571 	if (resp_sendlock(spc) != 0)
572 		return;
573 	(void)SENDIOV(spc, iov);
574 	sendunlock(spc);
575 }
576 
577 int
578 rumpclient_syscall(int sysnum, const void *data, size_t dlen,
579 	register_t *retval)
580 {
581 	struct rsp_sysresp *resp;
582 	sigset_t omask;
583 	void *rdata;
584 	int rv;
585 
586 	pthread_sigmask(SIG_SETMASK, &fullset, &omask);
587 
588 	DPRINTF(("rumpsp syscall_req: syscall %d with %p/%zu\n",
589 	    sysnum, data, dlen));
590 
591 	rv = syscall_req(&clispc, &omask, sysnum, data, dlen, &rdata);
592 	if (rv)
593 		goto out;
594 
595 	resp = rdata;
596 	DPRINTF(("rumpsp syscall_resp: syscall %d error %d, rv: %d/%d\n",
597 	    sysnum, rv, resp->rsys_retval[0], resp->rsys_retval[1]));
598 
599 	memcpy(retval, &resp->rsys_retval, sizeof(resp->rsys_retval));
600 	rv = resp->rsys_error;
601 	free(rdata);
602 
603  out:
604 	pthread_sigmask(SIG_SETMASK, &omask, NULL);
605 	return rv;
606 }
607 
608 static void
609 handlereq(struct spclient *spc)
610 {
611 	struct rsp_copydata *copydata;
612 	struct rsp_hdr *rhdr = &spc->spc_hdr;
613 	void *mapaddr;
614 	size_t maplen;
615 	int reqtype = spc->spc_hdr.rsp_type;
616 	int sig;
617 
618 	switch (reqtype) {
619 	case RUMPSP_COPYIN:
620 	case RUMPSP_COPYINSTR:
621 		/*LINTED*/
622 		copydata = (struct rsp_copydata *)spc->spc_buf;
623 		DPRINTF(("rump_sp handlereq: copyin request: %p/%zu\n",
624 		    copydata->rcp_addr, copydata->rcp_len));
625 		send_copyin_resp(spc, spc->spc_hdr.rsp_reqno,
626 		    copydata->rcp_addr, copydata->rcp_len,
627 		    reqtype == RUMPSP_COPYINSTR);
628 		break;
629 	case RUMPSP_COPYOUT:
630 	case RUMPSP_COPYOUTSTR:
631 		/*LINTED*/
632 		copydata = (struct rsp_copydata *)spc->spc_buf;
633 		DPRINTF(("rump_sp handlereq: copyout request: %p/%zu\n",
634 		    copydata->rcp_addr, copydata->rcp_len));
635 		/*LINTED*/
636 		memcpy(copydata->rcp_addr, copydata->rcp_data,
637 		    copydata->rcp_len);
638 		break;
639 	case RUMPSP_ANONMMAP:
640 		/*LINTED*/
641 		maplen = *(size_t *)spc->spc_buf;
642 		mapaddr = mmap(NULL, maplen, PROT_READ|PROT_WRITE,
643 		    MAP_ANON|MAP_PRIVATE, -1, 0);
644 		if (mapaddr == MAP_FAILED)
645 			mapaddr = NULL;
646 		DPRINTF(("rump_sp handlereq: anonmmap: %p\n", mapaddr));
647 		send_anonmmap_resp(spc, spc->spc_hdr.rsp_reqno, mapaddr);
648 		break;
649 	case RUMPSP_RAISE:
650 		sig = rumpuser__sig_rump2host(rhdr->rsp_signo);
651 		DPRINTF(("rump_sp handlereq: raise sig %d\n", sig));
652 		raise(sig);
653 		/*
654 		 * We most likely have signals blocked, but the signal
655 		 * will be handled soon enough when we return.
656 		 */
657 		break;
658 	default:
659 		printf("PANIC: INVALID TYPE %d\n", reqtype);
660 		abort();
661 		break;
662 	}
663 
664 	spcfreebuf(spc);
665 }
666 
667 static unsigned ptab_idx;
668 static struct sockaddr *serv_sa;
669 
/*
 * dup until we get a "good" fd which does not collide with stdio.
 *
 * myfd is duplicated for as long as the result is <= 2; if mustchange
 * is set, at least one duplication is done regardless.  Intermediate
 * descriptors created along the way are closed again, as is the
 * original myfd unless mustchange was set (the caller still owns it
 * in that case).  Returns the new descriptor, or -1 (passing -1 in is
 * allowed and simply returned).  If a dup fails after descriptors
 * were recorded for closing, errno from the dup is preserved.
 */
static int
dupgood(int myfd, int mustchange)
{
	int ofds[4];
	int sverrno = 0;
	unsigned int nold = 0;

	while ((myfd <= 2 || mustchange) && myfd != -1) {
		assert(nold < __arraycount(ofds));
		ofds[nold] = myfd;
		myfd = host_dup(myfd);
		if (mustchange) {
			/* prevent closing old fd: do not keep the slot */
			mustchange = 0;
		} else {
			nold++;
		}
	}

	if (myfd == -1 && nold > 0)
		sverrno = errno;

	/* get rid of the unwanted descriptors, newest first */
	for (; nold > 0; nold--)
		host_close(ofds[nold-1]);

	if (sverrno)
		errno = sverrno;

	return myfd;
}
701 
#if defined(USE_KQUEUE)

/*
 * Create the descriptor used while waiting for a response.
 * kqueue flavour: a kqueue which reports every signal as well as
 * readability of the server socket.  Returns the descriptor or -1.
 */
static int
makeholyfd(void)
{
	struct kevent kev[NSIG+1];
	int kq, signo;

	/* setup kqueue, we want all signals and the fd */
	kq = dupgood(host_kqueue(), 0);
	if (kq == -1) {
		ERRLOG(("rump_sp: cannot setup kqueue"));
		return -1;
	}

	for (signo = 1; signo <= NSIG; signo++) {
		EV_SET(&kev[signo-1], signo,
		    EVFILT_SIGNAL, EV_ADD|EV_ENABLE, 0, 0, 0);
	}
	EV_SET(&kev[NSIG], clispc.spc_fd,
	    EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, 0);

	if (host_kevent(kq, kev, NSIG+1, NULL, 0, NULL) != -1)
		return kq;

	ERRLOG(("rump_sp: kevent() failed"));
	host_close(kq);
	return -1;
}

#elif defined(USE_SIGNALFD) /* !USE_KQUEUE */

/*
 * Create the descriptor used while waiting for a response.
 * signalfd flavour: a signalfd covering all signals.
 * Returns the descriptor or -1.
 */
static int
makeholyfd(void)
{
	int sfd;

	sfd = host_signalfd(-1, &fullset, 0);
	return sfd;
}

#else /* !USE_KQUEUE && !USE_SIGNALFD */

/*
 * No way to get signal notifications via a descriptor on this
 * platform: there is no holy fd.
 */
static int
makeholyfd(void)
{
	const int nofd = -1;

	return nofd;
}

#endif
749 
750 static int
751 doconnect(void)
752 {
753 	struct respwait rw;
754 	struct rsp_hdr rhdr;
755 	char banner[MAXBANNER];
756 	int s, error, flags;
757 	ssize_t n;
758 
759 	if (holyfd != -1)
760 		host_close(holyfd);
761 	holyfd = -1;
762 	s = -1;
763 
764 	if (clispc.spc_fd != -1)
765 		host_close(clispc.spc_fd);
766 	clispc.spc_fd = -1;
767 
768 	/*
769 	 * for reconnect, gate everyone out of the receiver code
770 	 */
771 	putwait_locked(&clispc, &rw, &rhdr);
772 
773 	pthread_mutex_lock(&clispc.spc_mtx);
774 	clispc.spc_reconnecting = 1;
775 	pthread_cond_broadcast(&clispc.spc_cv);
776 	clispc.spc_generation++;
777 	while (clispc.spc_istatus != SPCSTATUS_FREE) {
778 		clispc.spc_istatus = SPCSTATUS_WANTED;
779 		pthread_cond_wait(&rw.rw_cv, &clispc.spc_mtx);
780 	}
781 	kickall(&clispc);
782 
783 	/*
784 	 * we can release it already since we hold the
785 	 * send lock during reconnect
786 	 * XXX: assert it
787 	 */
788 	clispc.spc_istatus = SPCSTATUS_FREE;
789 	pthread_mutex_unlock(&clispc.spc_mtx);
790 	unputwait_locked(&clispc, &rw);
791 
792 	free(clispc.spc_buf);
793 	clispc.spc_off = 0;
794 
795 	s = dupgood(host_socket(parsetab[ptab_idx].domain, SOCK_STREAM, 0), 0);
796 	if (s == -1)
797 		return -1;
798 
799 	while (host_connect(s, serv_sa, parsetab[ptab_idx].slen) == -1) {
800 		if (errno == EINTR)
801 			continue;
802 		ERRLOG(("rump_sp: client connect failed: %s\n",
803 		    strerror(errno)));
804 		return -1;
805 	}
806 
807 	if ((error = parsetab[ptab_idx].connhook(s)) != 0) {
808 		ERRLOG(("rump_sp: connect hook failed\n"));
809 		return -1;
810 	}
811 
812 	if ((n = host_read(s, banner, sizeof(banner)-1)) <= 0) {
813 		ERRLOG(("rump_sp: failed to read banner\n"));
814 		return -1;
815 	}
816 
817 	if (banner[n-1] != '\n') {
818 		ERRLOG(("rump_sp: invalid banner\n"));
819 		return -1;
820 	}
821 	banner[n] = '\0';
822 	/* XXX parse the banner some day */
823 
824 	flags = host_fcntl(s, F_GETFL, 0);
825 	if (host_fcntl(s, F_SETFL, flags | O_NONBLOCK) == -1) {
826 		ERRLOG(("rump_sp: socket fd NONBLOCK: %s\n", strerror(errno)));
827 		return -1;
828 	}
829 	clispc.spc_fd = s;
830 	clispc.spc_state = SPCSTATE_RUNNING;
831 	clispc.spc_reconnecting = 0;
832 	holyfd = makeholyfd();
833 
834 	return 0;
835 }
836 
837 static int
838 doinit(void)
839 {
840 
841 	TAILQ_INIT(&clispc.spc_respwait);
842 	pthread_mutex_init(&clispc.spc_mtx, NULL);
843 	pthread_cond_init(&clispc.spc_cv, NULL);
844 
845 	return 0;
846 }
847 
#ifdef RTLD_NEXT
/*
 * Default symbol lookup: a plain dlsym().  rumphijack_dlsym is a weak
 * alias for it, so another definition of rumphijack_dlsym takes
 * precedence when one is linked in.
 */
void *rumpclient__dlsym(void *, const char *);
void *
rumpclient__dlsym(void *handle, const char *symbol)
{
	void *sym;

	sym = dlsym(handle, symbol);
	return sym;
}
void *rumphijack_dlsym(void *, const char *)
    __attribute__((__weak__, alias("rumpclient__dlsym")));
#endif
859 
860 static pid_t init_done = 0;
861 
862 int
863 rumpclient_init(void)
864 {
865 	char *p;
866 	int error;
867 	int rv = -1;
868 	int hstype;
869 	pid_t mypid;
870 
871 	/*
872 	 * Make sure we're not riding the context of a previous
873 	 * host fork.  Note: it's *possible* that after n>1 forks
874 	 * we have the same pid as one of our exited parents, but
875 	 * I'm pretty sure there are 0 practical implications, since
876 	 * it means generations would have to skip rumpclient init.
877 	 */
878 	if (init_done == (mypid = getpid()))
879 		return 0;
880 
881 #ifdef USE_KQUEUE
882 	/* kq does not traverse fork() */
883 	holyfd = -1;
884 #endif
885 	init_done = mypid;
886 
887 	sigfillset(&fullset);
888 
889 	/*
890 	 * sag mir, wo die symbols sind.  zogen fort, der krieg beginnt.
891 	 * wann wird man je verstehen?  wann wird man je verstehen?
892 	 */
893 #ifdef RTLD_NEXT
894 #define FINDSYM2(_name_,_syscall_)					\
895 	if ((host_##_name_ = rumphijack_dlsym(RTLD_NEXT,		\
896 	    #_syscall_)) == NULL) {					\
897 		if (rumphijack_dlsym == rumpclient__dlsym)		\
898 			host_##_name_ = _name_; /* static fallback */	\
899 		if (host_##_name_ == NULL) {				\
900 			fprintf(stderr,"cannot find %s: %s", #_syscall_,\
901 			    dlerror());					\
902 			exit(1);					\
903 		}							\
904 	}
905 #else
906 #define FINDSYM2(_name_,_syscall)					\
907 	host_##_name_ = _name_;
908 #endif
909 #define FINDSYM(_name_) FINDSYM2(_name_,_name_)
910 #ifdef __NetBSD__
911 	FINDSYM2(socket,__socket30)
912 #else
913 	FINDSYM(socket)
914 #endif
915 
916 	FINDSYM(close)
917 	FINDSYM(connect)
918 	FINDSYM(fcntl)
919 	FINDSYM(poll)
920 	FINDSYM(read)
921 	FINDSYM(sendmsg)
922 	FINDSYM(setsockopt)
923 	FINDSYM(dup)
924 	FINDSYM(execve)
925 
926 #ifdef USE_KQUEUE
927 	FINDSYM(kqueue)
928 #if !__NetBSD_Prereq__(5,99,7)
929 	FINDSYM(kevent)
930 #else
931 	FINDSYM2(kevent,_sys___kevent50)
932 #endif
933 #endif /* USE_KQUEUE */
934 
935 #ifdef USE_SIGNALFD
936 	FINDSYM(signalfd)
937 #endif
938 
939 #undef	FINDSYM
940 #undef	FINDSY2
941 
942 	if ((p = getenv("RUMP__PARSEDSERVER")) == NULL) {
943 		if ((p = getenv("RUMP_SERVER")) == NULL) {
944 			fprintf(stderr, "error: RUMP_SERVER not set\n");
945 			errno = ENOENT;
946 			goto out;
947 		}
948 	}
949 
950 	if ((error = parseurl(p, &serv_sa, &ptab_idx, 0)) != 0) {
951 		errno = error;
952 		goto out;
953 	}
954 
955 	if (doinit() == -1)
956 		goto out;
957 
958 	if ((p = getenv("RUMPCLIENT__EXECFD")) != NULL) {
959 		sscanf(p, "%d,%d", &clispc.spc_fd, &holyfd);
960 		unsetenv("RUMPCLIENT__EXECFD");
961 		hstype = HANDSHAKE_EXEC;
962 	} else {
963 		if (doconnect() == -1)
964 			goto out;
965 		hstype = HANDSHAKE_GUEST;
966 	}
967 
968 	error = handshake_req(&clispc, hstype, NULL, 0, false);
969 	if (error) {
970 		pthread_mutex_destroy(&clispc.spc_mtx);
971 		pthread_cond_destroy(&clispc.spc_cv);
972 		if (clispc.spc_fd != -1)
973 			host_close(clispc.spc_fd);
974 		errno = error;
975 		goto out;
976 	}
977 	rv = 0;
978 
979  out:
980 	if (rv == -1)
981 		init_done = 0;
982 	return rv;
983 }
984 
/*
 * State carried over a fork: created by rumpclient_prefork() and
 * consumed by rumpclient_fork_init() / rumpclient_fork_vparent().
 */
struct rumpclient_fork {
	/* authentication data from the server's prefork response */
	uint32_t fork_auth[AUTHLEN];
	/* copy of the connection state at prefork time */
	struct spclient fork_spc;
	/* value of holyfd at prefork time */
	int fork_holyfd;
};
990 
991 struct rumpclient_fork *
992 rumpclient_prefork(void)
993 {
994 	struct rumpclient_fork *rpf;
995 	sigset_t omask;
996 	void *resp;
997 	int rv;
998 
999 	pthread_sigmask(SIG_SETMASK, &fullset, &omask);
1000 	rpf = malloc(sizeof(*rpf));
1001 	if (rpf == NULL)
1002 		goto out;
1003 
1004 	if ((rv = prefork_req(&clispc, &omask, &resp)) != 0) {
1005 		free(rpf);
1006 		errno = rv;
1007 		rpf = NULL;
1008 		goto out;
1009 	}
1010 
1011 	memcpy(rpf->fork_auth, resp, sizeof(rpf->fork_auth));
1012 	free(resp);
1013 
1014 	rpf->fork_spc = clispc;
1015 	rpf->fork_holyfd = holyfd;
1016 
1017  out:
1018 	pthread_sigmask(SIG_SETMASK, &omask, NULL);
1019 	return rpf;
1020 }
1021 
1022 int
1023 rumpclient_fork_init(struct rumpclient_fork *rpf)
1024 {
1025 	int error;
1026 	int osock;
1027 
1028 	osock = clispc.spc_fd;
1029 	memset(&clispc, 0, sizeof(clispc));
1030 	clispc.spc_fd = osock;
1031 
1032 #ifdef USE_KQUEUE
1033 	holyfd = -1; /* kqueue descriptor is not copied over fork() */
1034 #else
1035 	if (holyfd != -1) {
1036 		host_close(holyfd);
1037 		holyfd = -1;
1038 	}
1039 #endif
1040 
1041 	if (doinit() == -1)
1042 		return -1;
1043 	if (doconnect() == -1)
1044 		return -1;
1045 
1046 	error = handshake_req(&clispc, HANDSHAKE_FORK, rpf->fork_auth,
1047 	    0, false);
1048 	if (error) {
1049 		pthread_mutex_destroy(&clispc.spc_mtx);
1050 		pthread_cond_destroy(&clispc.spc_cv);
1051 		errno = error;
1052 		return -1;
1053 	}
1054 
1055 	return 0;
1056 }
1057 
/*
 * Cancel a fork prepared with rumpclient_prefork().
 * Not implemented: the call currently does nothing.
 */
/*ARGSUSED*/
void
rumpclient_fork_cancel(struct rumpclient_fork *rpf)
{

	/* EUNIMPL */
	(void)rpf;
}
1065 
1066 void
1067 rumpclient_fork_vparent(struct rumpclient_fork *rpf)
1068 {
1069 
1070 	clispc = rpf->fork_spc;
1071 	holyfd = rpf->fork_holyfd;
1072 }
1073 
1074 void
1075 rumpclient_setconnretry(time_t timeout)
1076 {
1077 
1078 	if (timeout < RUMPCLIENT_RETRYCONN_DIE)
1079 		return; /* gigo */
1080 
1081 	retrytimo = timeout;
1082 }
1083 
1084 int
1085 rumpclient__closenotify(int *fdp, enum rumpclient_closevariant variant)
1086 {
1087 	int fd = *fdp;
1088 	int untilfd, rv;
1089 	int newfd;
1090 
1091 	switch (variant) {
1092 	case RUMPCLIENT_CLOSE_FCLOSEM:
1093 		untilfd = MAX(clispc.spc_fd, holyfd);
1094 		for (; fd <= untilfd; fd++) {
1095 			if (fd == clispc.spc_fd || fd == holyfd)
1096 				continue;
1097 			rv = host_close(fd);
1098 			if (rv == -1)
1099 				return -1;
1100 		}
1101 		*fdp = fd;
1102 		break;
1103 
1104 	case RUMPCLIENT_CLOSE_CLOSE:
1105 	case RUMPCLIENT_CLOSE_DUP2:
1106 		if (fd == clispc.spc_fd) {
1107 			newfd = dupgood(clispc.spc_fd, 1);
1108 			if (newfd == -1)
1109 				return -1;
1110 
1111 #ifdef USE_KQUEUE
1112 			{
1113 			struct kevent kev[2];
1114 
1115 			/*
1116 			 * now, we have a new socket number, so change
1117 			 * the file descriptor that kqueue is
1118 			 * monitoring.  remove old and add new.
1119 			 */
1120 			EV_SET(&kev[0], clispc.spc_fd,
1121 			    EVFILT_READ, EV_DELETE, 0, 0, 0);
1122 			EV_SET(&kev[1], newfd,
1123 			    EVFILT_READ, EV_ADD|EV_ENABLE, 0, 0, 0);
1124 			if (host_kevent(holyfd, kev, 2, NULL, 0, NULL) == -1) {
1125 				int sverrno = errno;
1126 				host_close(newfd);
1127 				errno = sverrno;
1128 				return -1;
1129 			}}
1130 #endif /* !USE_KQUEUE */
1131 			clispc.spc_fd = newfd;
1132 		}
1133 		if (holyfd != -1 && fd == holyfd) {
1134 			newfd = dupgood(holyfd, 1);
1135 			if (newfd == -1)
1136 				return -1;
1137 			holyfd = newfd;
1138 		}
1139 		break;
1140 	}
1141 
1142 	return 0;
1143 }
1144 
/*
 * fork() for rump clients: runs the host fork() through
 * rumpclient__dofork() and returns its result.
 */
pid_t
rumpclient_fork(void)
{
	pid_t pid;

	pid = rumpclient__dofork(fork);
	return pid;
}
1151 
1152 /*
1153  * Process is about to exec.  Save info about our existing connection
1154  * in the env.  rumpclient will check for this info in init().
1155  * This is mostly for the benefit of rumphijack, but regular applications
1156  * may use it as well.
1157  */
1158 int
1159 rumpclient_exec(const char *path, char *const argv[], char *const envp[])
1160 {
1161 	char buf[4096];
1162 	char **newenv;
1163 	char *envstr, *envstr2;
1164 	size_t nelem;
1165 	int rv, sverrno;
1166 
1167 	snprintf(buf, sizeof(buf), "RUMPCLIENT__EXECFD=%d,%d",
1168 	    clispc.spc_fd, holyfd);
1169 	envstr = malloc(strlen(buf)+1);
1170 	if (envstr == NULL) {
1171 		return ENOMEM;
1172 	}
1173 	strcpy(envstr, buf);
1174 
1175 	/* do we have a fully parsed url we want to forward in the env? */
1176 	if (*parsedurl != '\0') {
1177 		snprintf(buf, sizeof(buf),
1178 		    "RUMP__PARSEDSERVER=%s", parsedurl);
1179 		envstr2 = malloc(strlen(buf)+1);
1180 		if (envstr2 == NULL) {
1181 			free(envstr);
1182 			return ENOMEM;
1183 		}
1184 		strcpy(envstr2, buf);
1185 	} else {
1186 		envstr2 = NULL;
1187 	}
1188 
1189 	for (nelem = 0; envp && envp[nelem]; nelem++)
1190 		continue;
1191 
1192 	newenv = malloc(sizeof(*newenv) * (nelem+3));
1193 	if (newenv == NULL) {
1194 		free(envstr2);
1195 		free(envstr);
1196 		return ENOMEM;
1197 	}
1198 	memcpy(&newenv[0], envp, nelem*sizeof(*envp));
1199 
1200 	newenv[nelem] = envstr;
1201 	newenv[nelem+1] = envstr2;
1202 	newenv[nelem+2] = NULL;
1203 
1204 	rv = host_execve(path, argv, newenv);
1205 
1206 	_DIAGASSERT(rv != 0);
1207 	sverrno = errno;
1208 	free(envstr2);
1209 	free(envstr);
1210 	free(newenv);
1211 	errno = sverrno;
1212 	return rv;
1213 }
1214 
/*
 * daemon() is handwritten for the benefit of platforms which
 * do not support daemon().
 *
 * Forks (the parent exits), creates a new session, optionally changes
 * the working directory to "/" (unless nochdir) and redirects stdio
 * to /dev/null (unless noclose), then re-establishes the server
 * connection in the child.
 *
 * Returns 0 in the daemonized child, -1 with errno set on failure.
 */
int
rumpclient_daemon(int nochdir, int noclose)
{
	struct rumpclient_fork *rf;
	int sverrno;
	int rv;

	if ((rf = rumpclient_prefork()) == NULL)
		return -1;

	switch (fork()) {
	case 0:
		break;
	case -1:
		goto daemonerr;
	default:
		_exit(0);
	}

	if (setsid() == -1)
		goto daemonerr;
	if (!nochdir && chdir("/") == -1)
		goto daemonerr;
	if (!noclose) {
		int fd = open("/dev/null", O_RDWR);

		/* best effort: leave stdio alone if /dev/null is missing */
		if (fd != -1) {
			dup2(fd, 0);
			dup2(fd, 1);
			dup2(fd, 2);
			if (fd > 2)
				close(fd);
		}
	}

	/* note: fork is either completed or cancelled by the call */
	rv = rumpclient_fork_init(rf);

	/* the fork context is ours and no longer needed */
	sverrno = errno;
	free(rf);
	errno = sverrno;

	return rv == -1 ? -1 : 0;

 daemonerr:
	sverrno = errno;
	rumpclient_fork_cancel(rf);
	/* cancel does not take ownership of the context */
	free(rf);
	errno = sverrno;
	return -1;
}
1262