xref: /netbsd-src/sys/kern/kern_syscall.c (revision 6cf6fe02a981b55727c49c3d37b0d8191a98c0ee)
1 /*	$NetBSD: kern_syscall.c,v 1.9 2013/12/14 06:27:57 pgoyette Exp $	*/
2 
3 /*-
4  * Copyright (c) 2008 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software developed for The NetBSD Foundation
8  * by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: kern_syscall.c,v 1.9 2013/12/14 06:27:57 pgoyette Exp $");
34 
35 #ifdef _KERNEL_OPT
36 #include "opt_modular.h"
37 #include "opt_syscall_debug.h"
38 #include "opt_ktrace.h"
39 #include "opt_ptrace.h"
40 #endif
41 
42 /* XXX To get syscall prototypes. */
43 #define SYSVSHM
44 #define SYSVSEM
45 #define SYSVMSG
46 
47 #include <sys/param.h>
48 #include <sys/module.h>
49 #include <sys/sched.h>
50 #include <sys/syscall.h>
51 #include <sys/syscallargs.h>
52 #include <sys/syscallvar.h>
53 #include <sys/systm.h>
54 #include <sys/xcall.h>
55 #include <sys/ktrace.h>
56 #include <sys/ptrace.h>
57 
58 int
59 sys_nomodule(struct lwp *l, const void *v, register_t *retval)
60 {
61 #ifdef MODULAR
62 	static struct {
63 		u_int		al_code;
64 		const char	*al_module;
65 	} const autoload[] = {
66 	    { SYS_aio_cancel, "aio" },
67 	    { SYS_aio_error, "aio" },
68 	    { SYS_aio_fsync, "aio" },
69 	    { SYS_aio_read, "aio" },
70 	    { SYS_aio_return, "aio" },
71 	    { SYS___aio_suspend50, "aio" },
72 	    { SYS_aio_write, "aio" },
73 	    { SYS_lio_listio, "aio" },
74 	    { SYS_mq_open, "mqueue" },
75 	    { SYS_mq_close, "mqueue" },
76 	    { SYS_mq_unlink, "mqueue" },
77 	    { SYS_mq_getattr, "mqueue" },
78 	    { SYS_mq_setattr, "mqueue" },
79 	    { SYS_mq_notify, "mqueue" },
80 	    { SYS_mq_send, "mqueue" },
81 	    { SYS_mq_receive, "mqueue" },
82 	    { SYS___mq_timedsend50, "mqueue" },
83 	    { SYS___mq_timedreceive50, "mqueue" },
84 	    { SYS_compat_43_fstat43, "compat" },
85 	    { SYS_compat_43_lstat43, "compat" },
86 	    { SYS_compat_43_oaccept, "compat" },
87 	    { SYS_compat_43_ocreat, "compat" },
88 	    { SYS_compat_43_oftruncate, "compat" },
89 	    { SYS_compat_43_ogetdirentries, "compat" },
90 	    { SYS_compat_43_ogetdtablesize, "compat" },
91 	    { SYS_compat_43_ogethostid, "compat" },
92 	    { SYS_compat_43_ogethostname, "compat" },
93 	    { SYS_compat_43_ogetkerninfo, "compat" },
94 	    { SYS_compat_43_ogetpagesize, "compat" },
95 	    { SYS_compat_43_ogetpeername, "compat" },
96 	    { SYS_compat_43_ogetrlimit, "compat" },
97 	    { SYS_compat_43_ogetsockname, "compat" },
98 	    { SYS_compat_43_okillpg, "compat" },
99 	    { SYS_compat_43_olseek, "compat" },
100 	    { SYS_compat_43_ommap, "compat" },
101 	    { SYS_compat_43_oquota, "compat" },
102 	    { SYS_compat_43_orecv, "compat" },
103 	    { SYS_compat_43_orecvfrom, "compat" },
104 	    { SYS_compat_43_orecvmsg, "compat" },
105 	    { SYS_compat_43_osend, "compat" },
106 	    { SYS_compat_43_osendmsg, "compat" },
107 	    { SYS_compat_43_osethostid, "compat" },
108 	    { SYS_compat_43_osethostname, "compat" },
109 	    { SYS_compat_43_osetrlimit, "compat" },
110 	    { SYS_compat_43_osigblock, "compat" },
111 	    { SYS_compat_43_osigsetmask, "compat" },
112 	    { SYS_compat_43_osigstack, "compat" },
113 	    { SYS_compat_43_osigvec, "compat" },
114 	    { SYS_compat_43_otruncate, "compat" },
115 	    { SYS_compat_43_owait, "compat" },
116 	    { SYS_compat_43_stat43, "compat" },
117 	    { SYS_compat_09_ogetdomainname, "compat" },
118 	    { SYS_compat_09_osetdomainname, "compat" },
119 	    { SYS_compat_09_ouname, "compat" },
120 #ifndef _LP64
121 	    { SYS_compat_10_omsgsys, "compat" },
122 	    { SYS_compat_10_osemsys, "compat" },
123 	    { SYS_compat_10_oshmsys, "compat" },
124 #endif
125 	    { SYS_compat_12_fstat12, "compat" },
126 	    { SYS_compat_12_getdirentries, "compat" },
127 	    { SYS_compat_12_lstat12, "compat" },
128 	    { SYS_compat_12_msync, "compat" },
129 	    { SYS_compat_12_oreboot, "compat" },
130 	    { SYS_compat_12_oswapon, "compat" },
131 	    { SYS_compat_12_stat12, "compat" },
132 	    { SYS_compat_13_sigaction13, "compat" },
133 	    { SYS_compat_13_sigaltstack13, "compat" },
134 	    { SYS_compat_13_sigpending13, "compat" },
135 	    { SYS_compat_13_sigprocmask13, "compat" },
136 	    { SYS_compat_13_sigreturn13, "compat" },
137 	    { SYS_compat_13_sigsuspend13, "compat" },
138 	    { SYS_compat_14___semctl, "compat" },
139 	    { SYS_compat_14_msgctl, "compat" },
140 	    { SYS_compat_14_shmctl, "compat" },
141 	    { SYS_compat_16___sigaction14, "compat" },
142 	    { SYS_compat_16___sigreturn14, "compat" },
143 	    { SYS_compat_20_fhstatfs, "compat" },
144 	    { SYS_compat_20_fstatfs, "compat" },
145 	    { SYS_compat_20_getfsstat, "compat" },
146 	    { SYS_compat_20_statfs, "compat" },
147 	    { SYS_compat_30___fhstat30, "compat" },
148 	    { SYS_compat_30___fstat13, "compat" },
149 	    { SYS_compat_30___lstat13, "compat" },
150 	    { SYS_compat_30___stat13, "compat" },
151 	    { SYS_compat_30_fhopen, "compat" },
152 	    { SYS_compat_30_fhstat, "compat" },
153 	    { SYS_compat_30_fhstatvfs1, "compat" },
154 	    { SYS_compat_30_getdents, "compat" },
155 	    { SYS_compat_30_getfh, "compat" },
156 	    { SYS_compat_30_socket, "compat" },
157 	    { SYS_compat_40_mount, "compat" },
158 	    { SYS_compat_50_wait4, "compat" },
159 	    { SYS_compat_50_mknod, "compat" },
160 	    { SYS_compat_50_setitimer, "compat" },
161 	    { SYS_compat_50_getitimer, "compat" },
162 	    { SYS_compat_50_select, "compat" },
163 	    { SYS_compat_50_gettimeofday, "compat" },
164 	    { SYS_compat_50_getrusage, "compat" },
165 	    { SYS_compat_50_settimeofday, "compat" },
166 	    { SYS_compat_50_utimes, "compat" },
167 	    { SYS_compat_50_adjtime, "compat" },
168 	    { SYS_compat_50_lfs_segwait, "compat" },
169 	    { SYS_compat_50_futimes, "compat" },
170 	    { SYS_compat_50_clock_gettime, "compat" },
171 	    { SYS_compat_50_clock_settime, "compat" },
172 	    { SYS_compat_50_clock_getres, "compat" },
173 	    { SYS_compat_50_timer_settime, "compat" },
174 	    { SYS_compat_50_timer_gettime, "compat" },
175 	    { SYS_compat_50_nanosleep, "compat" },
176 	    { SYS_compat_50___sigtimedwait, "compat" },
177 	    { SYS_compat_50_mq_timedsend, "compat" },
178 	    { SYS_compat_50_mq_timedreceive, "compat" },
179 	    { SYS_compat_50_lutimes, "compat" },
180 	    { SYS_compat_50_____semctl13, "compat" },
181 	    { SYS_compat_50___msgctl13, "compat" },
182 	    { SYS_compat_50___shmctl13, "compat" },
183 	    { SYS_compat_50__lwp_park, "compat" },
184 	    { SYS_compat_50_kevent, "compat" },
185 	    { SYS_compat_50_pselect, "compat" },
186 	    { SYS_compat_50_pollts, "compat" },
187 	    { SYS_compat_50___stat30, "compat" },
188 	    { SYS_compat_50___fstat30, "compat" },
189 	    { SYS_compat_50___lstat30, "compat" },
190 	    { SYS_compat_50___ntp_gettime30, "compat" },
191 	    { SYS_compat_50___fhstat40, "compat" },
192 	    { SYS_compat_50_aio_suspend, "compat" },
193 	    { SYS_compat_60__lwp_park, "compat" },
194 	    { SYS__ksem_init, "ksem" },
195 	    { SYS__ksem_open, "ksem" },
196 	    { SYS__ksem_unlink, "ksem" },
197 	    { SYS__ksem_close, "ksem" },
198 	    { SYS__ksem_post, "ksem" },
199 	    { SYS__ksem_wait, "ksem" },
200 	    { SYS__ksem_trywait, "ksem" },
201 	    { SYS__ksem_getvalue, "ksem" },
202 	    { SYS__ksem_destroy, "ksem" },
203 	    { SYS__ksem_timedwait, "ksem" },
204 	    { SYS_nfssvc, "nfsserver" },
205 	    { SYS_afssys, "openafs" },
206 	};
207 	const struct sysent *sy;
208 	const struct emul *em;
209 	int code, i;
210 
211 	/*
212 	 * Restart the syscall if we interrupted a module unload that
213 	 * failed.  Acquiring kernconfig_lock delays us until any unload
214 	 * has been completed or rolled back.
215 	 */
216 	kernconfig_lock();
217 	sy = l->l_sysent;
218 	if (sy->sy_call != sys_nomodule) {
219 		kernconfig_unlock();
220 		return ERESTART;
221 	}
222 	/*
223 	 * Try to autoload a module to satisfy the request.  If it
224 	 * works, retry the request.
225 	 */
226 	em = l->l_proc->p_emul;
227 	if (em == &emul_netbsd) {
228 		code = sy - em->e_sysent;
229 		for (i = 0; i < __arraycount(autoload); i++) {
230 			if (autoload[i].al_code != code) {
231 				continue;
232 			}
233 			if (module_autoload(autoload[i].al_module,
234 			    MODULE_CLASS_ANY) != 0 ||
235 			    sy->sy_call == sys_nomodule) {
236 			    	break;
237 			}
238 			kernconfig_unlock();
239 			return ERESTART;
240 		}
241 	}
242 	kernconfig_unlock();
243 #endif	/* MODULAR */
244 
245 	return sys_nosys(l, v, retval);
246 }
247 
248 int
249 syscall_establish(const struct emul *em, const struct syscall_package *sp)
250 {
251 	struct sysent *sy;
252 	int i;
253 
254 	KASSERT(kernconfig_is_held());
255 
256 	if (em == NULL) {
257 		em = &emul_netbsd;
258 	}
259 	sy = em->e_sysent;
260 
261 	/*
262 	 * Ensure that all preconditions are valid, since this is
263 	 * an all or nothing deal.  Once a system call is entered,
264 	 * it can become busy and we could be unable to remove it
265 	 * on error.
266 	 */
267 	for (i = 0; sp[i].sp_call != NULL; i++) {
268 		if (sy[sp[i].sp_code].sy_call != sys_nomodule) {
269 #ifdef DIAGNOSTIC
270 			printf("syscall %d is busy\n", sp[i].sp_code);
271 #endif
272 			return EBUSY;
273 		}
274 	}
275 	/* Everything looks good, patch them in. */
276 	for (i = 0; sp[i].sp_call != NULL; i++) {
277 		sy[sp[i].sp_code].sy_call = sp[i].sp_call;
278 	}
279 
280 	return 0;
281 }
282 
283 int
284 syscall_disestablish(const struct emul *em, const struct syscall_package *sp)
285 {
286 	struct sysent *sy;
287 	uint64_t where;
288 	lwp_t *l;
289 	int i;
290 
291 	KASSERT(kernconfig_is_held());
292 
293 	if (em == NULL) {
294 		em = &emul_netbsd;
295 	}
296 	sy = em->e_sysent;
297 
298 	/*
299 	 * First, patch the system calls to sys_nomodule to gate further
300 	 * activity.
301 	 */
302 	for (i = 0; sp[i].sp_call != NULL; i++) {
303 		KASSERT(sy[sp[i].sp_code].sy_call == sp[i].sp_call);
304 		sy[sp[i].sp_code].sy_call = sys_nomodule;
305 	}
306 
307 	/*
308 	 * Run a cross call to cycle through all CPUs.  This does two
309 	 * things: lock activity provides a barrier and makes our update
310 	 * of sy_call visible to all CPUs, and upon return we can be sure
311 	 * that we see pertinent values of l_sysent posted by remote CPUs.
312 	 */
313 	where = xc_broadcast(0, (xcfunc_t)nullop, NULL, NULL);
314 	xc_wait(where);
315 
316 	/*
317 	 * Now it's safe to check l_sysent.  Run through all LWPs and see
318 	 * if anyone is still using the system call.
319 	 */
320 	for (i = 0; sp[i].sp_call != NULL; i++) {
321 		mutex_enter(proc_lock);
322 		LIST_FOREACH(l, &alllwp, l_list) {
323 			if (l->l_sysent == &sy[sp[i].sp_code]) {
324 				break;
325 			}
326 		}
327 		mutex_exit(proc_lock);
328 		if (l == NULL) {
329 			continue;
330 		}
331 		/*
332 		 * We lose: one or more calls are still in use.  Put back
333 		 * the old entrypoints and act like nothing happened.
334 		 * When we drop kernconfig_lock, any system calls held in
335 		 * sys_nomodule() will be restarted.
336 		 */
337 		for (i = 0; sp[i].sp_call != NULL; i++) {
338 			sy[sp[i].sp_code].sy_call = sp[i].sp_call;
339 		}
340 		return EBUSY;
341 	}
342 
343 	return 0;
344 }
345 
/*
 * trace_is_enabled:
 *
 *	Report whether system call tracing is active for process `p'.
 *
 *	Kernels built with SYSCALL_DEBUG always answer true.  Otherwise
 *	the answer is true when KTRACE is configured and either of the
 *	KTRFAC_SYSCALL / KTRFAC_SYSRET bits is set in p_traceflag, or
 *	when PTRACE is configured and PSL_SYSCALL is set in p_slflag;
 *	false in all remaining cases.
 */
bool
trace_is_enabled(struct proc *p)
{
#if defined(SYSCALL_DEBUG)
	/* Syscall debugging traces every call unconditionally. */
	return true;
#else
	bool enabled = false;

#if defined(KTRACE)
	if (ISSET(p->p_traceflag, (KTRFAC_SYSCALL | KTRFAC_SYSRET))) {
		enabled = true;
	}
#endif
#if defined(PTRACE)
	if (!enabled && ISSET(p->p_slflag, PSL_SYSCALL)) {
		enabled = true;
	}
#endif

	return enabled;
#endif	/* SYSCALL_DEBUG */
}
366 
367 /*
368  * Start trace of particular system call. If process is being traced,
369  * this routine is called by MD syscall dispatch code just before
370  * a system call is actually executed.
371  */
372 int
373 trace_enter(register_t code, const register_t *args, int narg)
374 {
375 	int error = 0;
376 
377 #ifdef SYSCALL_DEBUG
378 	scdebug_call(code, args);
379 #endif /* SYSCALL_DEBUG */
380 
381 	ktrsyscall(code, args, narg);
382 
383 #ifdef PTRACE
384 	if ((curlwp->l_proc->p_slflag & (PSL_SYSCALL|PSL_TRACED)) ==
385 	    (PSL_SYSCALL|PSL_TRACED)) {
386 		process_stoptrace();
387 		if (curlwp->l_proc->p_slflag & PSL_SYSCALLEMU) {
388 			/* tracer will emulate syscall for us */
389 			error = EJUSTRETURN;
390 		}
391 	}
392 #endif
393 	return error;
394 }
395 
396 /*
397  * End trace of particular system call. If process is being traced,
398  * this routine is called by MD syscall dispatch code just after
399  * a system call finishes.
400  * MD caller guarantees the passed 'code' is within the supported
401  * system call number range for emulation the process runs under.
402  */
403 void
404 trace_exit(register_t code, register_t rval[], int error)
405 {
406 #ifdef PTRACE
407 	struct proc *p = curlwp->l_proc;
408 #endif
409 
410 #ifdef SYSCALL_DEBUG
411 	scdebug_ret(code, error, rval);
412 #endif /* SYSCALL_DEBUG */
413 
414 	ktrsysret(code, error, rval);
415 
416 #ifdef PTRACE
417 	if ((p->p_slflag & (PSL_SYSCALL|PSL_TRACED|PSL_SYSCALLEMU)) ==
418 	    (PSL_SYSCALL|PSL_TRACED))
419 		process_stoptrace();
420 	CLR(p->p_slflag, PSL_SYSCALLEMU);
421 #endif
422 }
423