xref: /netbsd-src/sys/compat/linux/arch/i386/linux_machdep.c (revision deb6f0161a9109e7de9b519dc8dfb9478668dcdd)
1 /*	$NetBSD: linux_machdep.c,v 1.165 2017/09/17 09:41:35 maxv Exp $	*/
2 
3 /*-
4  * Copyright (c) 1995, 2000, 2008, 2009 The NetBSD Foundation, Inc.
5  * All rights reserved.
6  *
7  * This code is derived from software contributed to The NetBSD Foundation
8  * by Frank van der Linden, and by Andrew Doran.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
20  * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
21  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
22  * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
23  * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
24  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
25  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
26  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
27  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
28  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
29  * POSSIBILITY OF SUCH DAMAGE.
30  */
31 
32 #include <sys/cdefs.h>
33 __KERNEL_RCSID(0, "$NetBSD: linux_machdep.c,v 1.165 2017/09/17 09:41:35 maxv Exp $");
34 
35 #if defined(_KERNEL_OPT)
36 #include "opt_user_ldt.h"
37 #endif
38 
39 #include <sys/param.h>
40 #include <sys/systm.h>
41 #include <sys/signalvar.h>
42 #include <sys/kernel.h>
43 #include <sys/proc.h>
44 #include <sys/buf.h>
45 #include <sys/reboot.h>
46 #include <sys/conf.h>
47 #include <sys/exec.h>
48 #include <sys/file.h>
49 #include <sys/callout.h>
50 #include <sys/mbuf.h>
51 #include <sys/msgbuf.h>
52 #include <sys/mount.h>
53 #include <sys/vnode.h>
54 #include <sys/device.h>
55 #include <sys/syscallargs.h>
56 #include <sys/filedesc.h>
57 #include <sys/exec_elf.h>
58 #include <sys/disklabel.h>
59 #include <sys/ioctl.h>
60 #include <sys/wait.h>
61 #include <sys/kauth.h>
62 #include <sys/kmem.h>
63 
64 #include <miscfs/specfs/specdev.h>
65 
66 #include <compat/linux/common/linux_types.h>
67 #include <compat/linux/common/linux_signal.h>
68 #include <compat/linux/common/linux_util.h>
69 #include <compat/linux/common/linux_ioctl.h>
70 #include <compat/linux/common/linux_hdio.h>
71 #include <compat/linux/common/linux_exec.h>
72 #include <compat/linux/common/linux_machdep.h>
73 #include <compat/linux/common/linux_errno.h>
74 
75 #include <compat/linux/linux_syscallargs.h>
76 
77 #include <sys/cpu.h>
78 #include <machine/cpufunc.h>
79 #include <machine/psl.h>
80 #include <machine/reg.h>
81 #include <machine/segments.h>
82 #include <machine/specialreg.h>
83 #include <machine/sysarch.h>
84 #include <machine/vmparam.h>
85 
86 #include <x86/fpu.h>
87 
88 /*
89  * To see whether wscons is configured (for virtual console ioctl calls).
90  */
91 #if defined(_KERNEL_OPT)
92 #include "wsdisplay.h"
93 #endif
94 #if (NWSDISPLAY > 0)
95 #include <dev/wscons/wsconsio.h>
96 #include <dev/wscons/wsdisplay_usl_io.h>
97 #if defined(_KERNEL_OPT)
98 #include "opt_xserver.h"
99 #endif
100 #endif
101 
102 #ifdef DEBUG_LINUX
103 #define DPRINTF(a) uprintf a
104 #else
105 #define DPRINTF(a)
106 #endif
107 
108 extern struct disklist *x86_alldisks;
109 
110 static struct biosdisk_info *fd2biosinfo(struct proc *, struct file *);
111 static void linux_save_ucontext(struct lwp *, struct trapframe *,
112     const sigset_t *, struct sigaltstack *, struct linux_ucontext *);
113 static void linux_save_sigcontext(struct lwp *, struct trapframe *,
114     const sigset_t *, struct linux_sigcontext *);
115 static int linux_restore_sigcontext(struct lwp *,
116     struct linux_sigcontext *, register_t *);
117 static void linux_rt_sendsig(const ksiginfo_t *, const sigset_t *);
118 static void linux_old_sendsig(const ksiginfo_t *, const sigset_t *);
119 
120 extern char linux_sigcode[], linux_rt_sigcode[];
121 
122 /*
123  * Deal with some i386-specific things in the Linux emulation code.
124  */
125 
126 void
127 linux_setregs(struct lwp *l, struct exec_package *epp, vaddr_t stack)
128 {
129 	struct trapframe *tf;
130 
131 #ifdef USER_LDT
132 	pmap_ldt_cleanup(l);
133 #endif
134 
135 	fpu_save_area_clear(l, __Linux_NPXCW__);
136 
137 	tf = l->l_md.md_regs;
138 	tf->tf_gs = 0;
139 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
140 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
141 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
142 	tf->tf_edi = 0;
143 	tf->tf_esi = 0;
144 	tf->tf_ebp = 0;
145 	tf->tf_ebx = l->l_proc->p_psstrp;
146 	tf->tf_edx = 0;
147 	tf->tf_ecx = 0;
148 	tf->tf_eax = 0;
149 	tf->tf_eip = epp->ep_entry;
150 	tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
151 	tf->tf_eflags = PSL_USERSET;
152 	tf->tf_esp = stack;
153 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
154 }
155 
156 /*
157  * Send an interrupt to process.
158  *
159  * Stack is set up to allow sigcode stored
160  * in u. to call routine, followed by kcall
161  * to sigreturn routine below.  After sigreturn
162  * resets the signal mask, the stack, and the
163  * frame pointer, it returns to the user
164  * specified pc, psl.
165  */
166 
167 void
168 linux_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
169 {
170 	if (SIGACTION(curproc, ksi->ksi_signo).sa_flags & SA_SIGINFO)
171 		linux_rt_sendsig(ksi, mask);
172 	else
173 		linux_old_sendsig(ksi, mask);
174 }
175 
176 
177 static void
178 linux_save_ucontext(struct lwp *l, struct trapframe *tf, const sigset_t *mask, struct sigaltstack *sas, struct linux_ucontext *uc)
179 {
180 	uc->uc_flags = 0;
181 	uc->uc_link = NULL;
182 	native_to_linux_sigaltstack(&uc->uc_stack, sas);
183 	linux_save_sigcontext(l, tf, mask, &uc->uc_mcontext);
184 	native_to_linux_sigset(&uc->uc_sigmask, mask);
185 	(void)memset(&uc->uc_fpregs_mem, 0, sizeof(uc->uc_fpregs_mem));
186 }
187 
188 static void
189 linux_save_sigcontext(struct lwp *l, struct trapframe *tf,
190     const sigset_t *mask, struct linux_sigcontext *sc)
191 {
192 	struct pcb *pcb = lwp_getpcb(l);
193 
194 	/* Save register context. */
195 	sc->sc_gs = tf->tf_gs;
196 	sc->sc_fs = tf->tf_fs;
197 	sc->sc_es = tf->tf_es;
198 	sc->sc_ds = tf->tf_ds;
199 	sc->sc_eflags = tf->tf_eflags;
200 
201 	sc->sc_edi = tf->tf_edi;
202 	sc->sc_esi = tf->tf_esi;
203 	sc->sc_esp = tf->tf_esp;
204 	sc->sc_ebp = tf->tf_ebp;
205 	sc->sc_ebx = tf->tf_ebx;
206 	sc->sc_edx = tf->tf_edx;
207 	sc->sc_ecx = tf->tf_ecx;
208 	sc->sc_eax = tf->tf_eax;
209 	sc->sc_eip = tf->tf_eip;
210 	sc->sc_cs = tf->tf_cs;
211 	sc->sc_esp_at_signal = tf->tf_esp;
212 	sc->sc_ss = tf->tf_ss;
213 	sc->sc_err = tf->tf_err;
214 	sc->sc_trapno = tf->tf_trapno;
215 	sc->sc_cr2 = pcb->pcb_cr2;
216 	sc->sc_387 = NULL;
217 
218 	/* Save signal stack. */
219 	/* Linux doesn't save the onstack flag in sigframe */
220 
221 	/* Save signal mask. */
222 	native_to_linux_old_sigset(&sc->sc_mask, mask);
223 }
224 
225 static void
226 linux_rt_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
227 {
228 	struct lwp *l = curlwp;
229 	struct proc *p = l->l_proc;
230 	struct trapframe *tf;
231 	struct linux_rt_sigframe *fp, frame;
232 	int onstack, error;
233 	int sig = ksi->ksi_signo;
234 	sig_t catcher = SIGACTION(p, sig).sa_handler;
235 	struct sigaltstack *sas = &l->l_sigstk;
236 
237 	tf = l->l_md.md_regs;
238 	/* Do we need to jump onto the signal stack? */
239 	onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
240 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
241 
242 
243 	/* Allocate space for the signal handler context. */
244 	if (onstack)
245 		fp = (struct linux_rt_sigframe *)((char *)sas->ss_sp +
246 		    sas->ss_size);
247 	else
248 		fp = (struct linux_rt_sigframe *)tf->tf_esp;
249 	fp--;
250 
251 	DPRINTF(("rt: onstack = %d, fp = %p sig = %d eip = 0x%x cr2 = 0x%x\n",
252 	    onstack, fp, sig, tf->tf_eip,
253 	    ((struct pcb *)lwp_getpcb(l))->pcb_cr2));
254 
255 	/* Build stack frame for signal trampoline. */
256 	frame.sf_handler = catcher;
257 	frame.sf_sig = native_to_linux_signo[sig];
258 	frame.sf_sip = &fp->sf_si;
259 	frame.sf_ucp = &fp->sf_uc;
260 
261 	/*
262 	 * XXX: the following code assumes that the constants for
263 	 * siginfo are the same between linux and NetBSD.
264 	 */
265 	native_to_linux_siginfo(&frame.sf_si, &ksi->ksi_info);
266 
267 	/* Save register context. */
268 	linux_save_ucontext(l, tf, mask, sas, &frame.sf_uc);
269 	sendsig_reset(l, sig);
270 
271 	mutex_exit(p->p_lock);
272 	error = copyout(&frame, fp, sizeof(frame));
273 	mutex_enter(p->p_lock);
274 
275 	if (error != 0) {
276 		/*
277 		 * Process has trashed its stack; give it an illegal
278 		 * instruction to halt it in its tracks.
279 		 */
280 		sigexit(l, SIGILL);
281 		/* NOTREACHED */
282 	}
283 
284 	/*
285 	 * Build context to run handler in.
286 	 */
287 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
288 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
289 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
290 	tf->tf_eip = ((int)p->p_sigctx.ps_sigcode) +
291 	    (linux_rt_sigcode - linux_sigcode);
292 	tf->tf_cs = GSEL(GUCODE_SEL, SEL_UPL);
293 	tf->tf_eflags &= ~PSL_CLEARSIG;
294 	tf->tf_esp = (int)fp;
295 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
296 
297 	/* Remember that we're now on the signal stack. */
298 	if (onstack)
299 		sas->ss_flags |= SS_ONSTACK;
300 }
301 
302 static void
303 linux_old_sendsig(const ksiginfo_t *ksi, const sigset_t *mask)
304 {
305 	struct lwp *l = curlwp;
306 	struct proc *p = l->l_proc;
307 	struct trapframe *tf;
308 	struct linux_sigframe *fp, frame;
309 	int onstack, error;
310 	int sig = ksi->ksi_signo;
311 	sig_t catcher = SIGACTION(p, sig).sa_handler;
312 	struct sigaltstack *sas = &l->l_sigstk;
313 
314 	tf = l->l_md.md_regs;
315 
316 	/* Do we need to jump onto the signal stack? */
317 	onstack = (sas->ss_flags & (SS_DISABLE | SS_ONSTACK)) == 0 &&
318 	    (SIGACTION(p, sig).sa_flags & SA_ONSTACK) != 0;
319 
320 	/* Allocate space for the signal handler context. */
321 	if (onstack)
322 		fp = (struct linux_sigframe *) ((char *)sas->ss_sp +
323 		    sas->ss_size);
324 	else
325 		fp = (struct linux_sigframe *)tf->tf_esp;
326 	fp--;
327 
328 	DPRINTF(("old: onstack = %d, fp = %p sig = %d eip = 0x%x cr2 = 0x%x\n",
329 	    onstack, fp, sig, tf->tf_eip,
330 	    ((struct pcb *)lwp_getpcb(l))->pcb_cr2));
331 
332 	/* Build stack frame for signal trampoline. */
333 	frame.sf_handler = catcher;
334 	frame.sf_sig = native_to_linux_signo[sig];
335 
336 	linux_save_sigcontext(l, tf, mask, &frame.sf_sc);
337 	sendsig_reset(l, sig);
338 
339 	mutex_exit(p->p_lock);
340 	error = copyout(&frame, fp, sizeof(frame));
341 	mutex_enter(p->p_lock);
342 
343 	if (error != 0) {
344 		/*
345 		 * Process has trashed its stack; give it an illegal
346 		 * instruction to halt it in its tracks.
347 		 */
348 		sigexit(l, SIGILL);
349 		/* NOTREACHED */
350 	}
351 
352 	/*
353 	 * Build context to run handler in.
354 	 */
355 	tf->tf_fs = GSEL(GUDATA_SEL, SEL_UPL);
356 	tf->tf_es = GSEL(GUDATA_SEL, SEL_UPL);
357 	tf->tf_ds = GSEL(GUDATA_SEL, SEL_UPL);
358 	tf->tf_eip = (int)p->p_sigctx.ps_sigcode;
359 	tf->tf_cs = GSEL(GUCODEBIG_SEL, SEL_UPL);
360 	tf->tf_eflags &= ~PSL_CLEARSIG;
361 	tf->tf_esp = (int)fp;
362 	tf->tf_ss = GSEL(GUDATA_SEL, SEL_UPL);
363 
364 	/* Remember that we're now on the signal stack. */
365 	if (onstack)
366 		sas->ss_flags |= SS_ONSTACK;
367 }
368 
369 /*
370  * System call to cleanup state after a signal
371  * has been taken.  Reset signal mask and
372  * stack state from context left by sendsig (above).
373  * Return to previous pc and psl as specified by
374  * context left by sendsig. Check carefully to
375  * make sure that the user has not modified the
376  * psl to gain improper privileges or to cause
377  * a machine fault.
378  */
379 int
380 linux_sys_rt_sigreturn(struct lwp *l, const struct linux_sys_rt_sigreturn_args *uap, register_t *retval)
381 {
382 	/* {
383 		syscallarg(struct linux_ucontext *) ucp;
384 	} */
385 	struct linux_ucontext context, *ucp = SCARG(uap, ucp);
386 	int error;
387 
388 	/*
389 	 * The trampoline code hands us the context.
390 	 * It is unsafe to keep track of it ourselves, in the event that a
391 	 * program jumps out of a signal handler.
392 	 */
393 	if ((error = copyin(ucp, &context, sizeof(*ucp))) != 0)
394 		return error;
395 
396 	/* XXX XAX we can do better here by using more of the ucontext */
397 	return linux_restore_sigcontext(l, &context.uc_mcontext, retval);
398 }
399 
400 int
401 linux_sys_sigreturn(struct lwp *l, const struct linux_sys_sigreturn_args *uap, register_t *retval)
402 {
403 	/* {
404 		syscallarg(struct linux_sigcontext *) scp;
405 	} */
406 	struct linux_sigcontext context, *scp = SCARG(uap, scp);
407 	int error;
408 
409 	/*
410 	 * The trampoline code hands us the context.
411 	 * It is unsafe to keep track of it ourselves, in the event that a
412 	 * program jumps out of a signal handler.
413 	 */
414 	if ((error = copyin((void *)scp, &context, sizeof(*scp))) != 0)
415 		return error;
416 	return linux_restore_sigcontext(l, &context, retval);
417 }
418 
419 static int
420 linux_restore_sigcontext(struct lwp *l, struct linux_sigcontext *scp,
421     register_t *retval)
422 {
423 	struct proc *p = l->l_proc;
424 	struct sigaltstack *sas = &l->l_sigstk;
425 	struct trapframe *tf;
426 	sigset_t mask;
427 	ssize_t ss_gap;
428 
429 	/* Restore register context. */
430 	tf = l->l_md.md_regs;
431 	DPRINTF(("sigreturn enter esp=0x%x eip=0x%x\n", tf->tf_esp, tf->tf_eip));
432 
433 	/*
434 	 * Check for security violations.  If we're returning to
435 	 * protected mode, the CPU will validate the segment registers
436 	 * automatically and generate a trap on violations.  We handle
437 	 * the trap, rather than doing all of the checking here.
438 	 */
439 	if (((scp->sc_eflags ^ tf->tf_eflags) & PSL_USERSTATIC) != 0 ||
440 	    !USERMODE(scp->sc_cs))
441 		return EINVAL;
442 
443 	tf->tf_gs = scp->sc_gs;
444 	tf->tf_fs = scp->sc_fs;
445 	tf->tf_es = scp->sc_es;
446 	tf->tf_ds = scp->sc_ds;
447 	tf->tf_eflags = scp->sc_eflags;
448 
449 	tf->tf_edi = scp->sc_edi;
450 	tf->tf_esi = scp->sc_esi;
451 	tf->tf_ebp = scp->sc_ebp;
452 	tf->tf_ebx = scp->sc_ebx;
453 	tf->tf_edx = scp->sc_edx;
454 	tf->tf_ecx = scp->sc_ecx;
455 	tf->tf_eax = scp->sc_eax;
456 	tf->tf_eip = scp->sc_eip;
457 	tf->tf_cs = scp->sc_cs;
458 	tf->tf_esp = scp->sc_esp_at_signal;
459 	tf->tf_ss = scp->sc_ss;
460 
461 	/* Restore signal stack. */
462 	/*
463 	 * Linux really does it this way; it doesn't have space in sigframe
464 	 * to save the onstack flag.
465 	 */
466 	mutex_enter(p->p_lock);
467 	ss_gap = (ssize_t)((char *)scp->sc_esp_at_signal - (char *)sas->ss_sp);
468 	if (ss_gap >= 0 && ss_gap < sas->ss_size)
469 		sas->ss_flags |= SS_ONSTACK;
470 	else
471 		sas->ss_flags &= ~SS_ONSTACK;
472 
473 	/* Restore signal mask. */
474 	linux_old_to_native_sigset(&mask, &scp->sc_mask);
475 	(void) sigprocmask1(l, SIG_SETMASK, &mask, 0);
476 	mutex_exit(p->p_lock);
477 
478 	DPRINTF(("sigreturn exit esp=0x%x eip=0x%x\n", tf->tf_esp, tf->tf_eip));
479 	return EJUSTRETURN;
480 }
481 
482 #ifdef USER_LDT
483 
484 static int
485 linux_read_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap,
486     register_t *retval)
487 {
488 	struct x86_get_ldt_args gl;
489 	int error;
490 	union descriptor *ldt_buf;
491 	size_t sz;
492 
493 	/*
494 	 * I've checked the linux code - this function is asymetric with
495 	 * linux_write_ldt, and returns raw ldt entries.
496 	 * NB, the code I saw zerod the spare parts of the user buffer.
497 	 */
498 
499 	DPRINTF(("linux_read_ldt!"));
500 
501 	sz = 8192 * sizeof(*ldt_buf);
502 	ldt_buf = kmem_zalloc(sz, KM_SLEEP);
503 	gl.start = 0;
504 	gl.desc = NULL;
505 	gl.num = SCARG(uap, bytecount) / sizeof(union descriptor);
506 	error = x86_get_ldt1(l, &gl, ldt_buf);
507 	/* NB gl.num might have changed */
508 	if (error == 0) {
509 		*retval = gl.num * sizeof(*ldtstore);
510 		error = copyout(ldt_buf, SCARG(uap, ptr),
511 		    gl.num * sizeof *ldt_buf);
512 	}
513 	kmem_free(ldt_buf, sz);
514 
515 	return error;
516 }
517 
518 struct linux_ldt_info {
519 	u_int entry_number;
520 	u_long base_addr;
521 	u_int limit;
522 	u_int seg_32bit:1;
523 	u_int contents:2;
524 	u_int read_exec_only:1;
525 	u_int limit_in_pages:1;
526 	u_int seg_not_present:1;
527 	u_int useable:1;
528 };
529 
530 static int
531 linux_write_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap,
532     int oldmode)
533 {
534 	struct linux_ldt_info ldt_info;
535 	union descriptor d;
536 	struct x86_set_ldt_args sl;
537 	int error;
538 
539 	DPRINTF(("linux_write_ldt %d\n", oldmode));
540 	if (SCARG(uap, bytecount) != sizeof(ldt_info))
541 		return (EINVAL);
542 	if ((error = copyin(SCARG(uap, ptr), &ldt_info, sizeof(ldt_info))) != 0)
543 		return error;
544 	if (ldt_info.entry_number >= 8192)
545 		return (EINVAL);
546 	if (ldt_info.contents == 3) {
547 		if (oldmode)
548 			return (EINVAL);
549 		if (ldt_info.seg_not_present)
550 			return (EINVAL);
551 	}
552 
553 	if (ldt_info.base_addr == 0 && ldt_info.limit == 0 &&
554 	    (oldmode || (ldt_info.contents == 0 &&
555 	    ldt_info.read_exec_only == 1 && ldt_info.seg_32bit == 0 &&
556 	    ldt_info.limit_in_pages == 0 && ldt_info.seg_not_present == 1 &&
557 	    ldt_info.useable == 0))) {
558 		/* this means you should zero the ldt */
559 		(void)memset(&d, 0, sizeof(d));
560 	} else {
561 		d.sd.sd_lobase = ldt_info.base_addr & 0xffffff;
562 		d.sd.sd_hibase = (ldt_info.base_addr >> 24) & 0xff;
563 		d.sd.sd_lolimit = ldt_info.limit & 0xffff;
564 		d.sd.sd_hilimit = (ldt_info.limit >> 16) & 0xf;
565 		d.sd.sd_type = 16 | (ldt_info.contents << 2) |
566 		    (!ldt_info.read_exec_only << 1);
567 		d.sd.sd_dpl = SEL_UPL;
568 		d.sd.sd_p = !ldt_info.seg_not_present;
569 		d.sd.sd_def32 = ldt_info.seg_32bit;
570 		d.sd.sd_gran = ldt_info.limit_in_pages;
571 		if (!oldmode)
572 			d.sd.sd_xx = ldt_info.useable;
573 		else
574 			d.sd.sd_xx = 0;
575 	}
576 	sl.start = ldt_info.entry_number;
577 	sl.desc = NULL;
578 	sl.num = 1;
579 
580 	DPRINTF(("linux_write_ldt: idx=%d, base=0x%lx, limit=0x%x\n",
581 	    ldt_info.entry_number, ldt_info.base_addr, ldt_info.limit));
582 
583 	return x86_set_ldt1(l, &sl, &d);
584 }
585 
586 #endif /* USER_LDT */
587 
588 int
589 linux_sys_modify_ldt(struct lwp *l, const struct linux_sys_modify_ldt_args *uap, register_t *retval)
590 {
591 	/* {
592 		syscallarg(int) func;
593 		syscallarg(void *) ptr;
594 		syscallarg(size_t) bytecount;
595 	} */
596 
597 	switch (SCARG(uap, func)) {
598 #ifdef USER_LDT
599 	case 0:
600 		return linux_read_ldt(l, (const void *)uap, retval);
601 	case 1:
602 		return linux_write_ldt(l, (const void *)uap, 1);
603 	case 2:
604 #ifdef notyet
605 		return linux_read_default_ldt(l, (const void *)uap, retval);
606 #else
607 		return (ENOSYS);
608 #endif
609 	case 0x11:
610 		return linux_write_ldt(l, (const void *)uap, 0);
611 #endif /* USER_LDT */
612 
613 	default:
614 		return (ENOSYS);
615 	}
616 }
617 
618 /*
619  * XXX Pathetic hack to make svgalib work. This will fake the major
620  * device number of an opened VT so that svgalib likes it. grmbl.
621  * Should probably do it 'wrong the right way' and use a mapping
622  * array for all major device numbers, and map linux_mknod too.
623  */
624 dev_t
625 linux_fakedev(dev_t dev, int raw)
626 {
627 	extern const struct cdevsw ptc_cdevsw, pts_cdevsw;
628 	const struct cdevsw *cd = cdevsw_lookup(dev);
629 
630 	if (raw) {
631 #if (NWSDISPLAY > 0)
632 		extern const struct cdevsw wsdisplay_cdevsw;
633 		if (cd == &wsdisplay_cdevsw)
634 			return makedev(LINUX_CONS_MAJOR, (minor(dev) + 1));
635 #endif
636 	}
637 
638 	if (cd == &ptc_cdevsw)
639 		return makedev(LINUX_PTC_MAJOR, minor(dev));
640 	if (cd == &pts_cdevsw)
641 		return makedev(LINUX_PTS_MAJOR, minor(dev));
642 
643 	return dev;
644 }
645 
646 #if (NWSDISPLAY > 0)
647 /*
648  * That's not complete, but enough to get an X server running.
649  */
650 #define NR_KEYS 128
651 static const u_short plain_map[NR_KEYS] = {
652 	0x0200,	0x001b,	0x0031,	0x0032,	0x0033,	0x0034,	0x0035,	0x0036,
653 	0x0037,	0x0038,	0x0039,	0x0030,	0x002d,	0x003d,	0x007f,	0x0009,
654 	0x0b71,	0x0b77,	0x0b65,	0x0b72,	0x0b74,	0x0b79,	0x0b75,	0x0b69,
655 	0x0b6f,	0x0b70,	0x005b,	0x005d,	0x0201,	0x0702,	0x0b61,	0x0b73,
656 	0x0b64,	0x0b66,	0x0b67,	0x0b68,	0x0b6a,	0x0b6b,	0x0b6c,	0x003b,
657 	0x0027,	0x0060,	0x0700,	0x005c,	0x0b7a,	0x0b78,	0x0b63,	0x0b76,
658 	0x0b62,	0x0b6e,	0x0b6d,	0x002c,	0x002e,	0x002f,	0x0700,	0x030c,
659 	0x0703,	0x0020,	0x0207,	0x0100,	0x0101,	0x0102,	0x0103,	0x0104,
660 	0x0105,	0x0106,	0x0107,	0x0108,	0x0109,	0x0208,	0x0209,	0x0307,
661 	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
662 	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x003c,	0x010a,
663 	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
664 	0x030e,	0x0702,	0x030d,	0x001c,	0x0701,	0x0205,	0x0114,	0x0603,
665 	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
666 	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
667 	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
668 }, shift_map[NR_KEYS] = {
669 	0x0200,	0x001b,	0x0021,	0x0040,	0x0023,	0x0024,	0x0025,	0x005e,
670 	0x0026,	0x002a,	0x0028,	0x0029,	0x005f,	0x002b,	0x007f,	0x0009,
671 	0x0b51,	0x0b57,	0x0b45,	0x0b52,	0x0b54,	0x0b59,	0x0b55,	0x0b49,
672 	0x0b4f,	0x0b50,	0x007b,	0x007d,	0x0201,	0x0702,	0x0b41,	0x0b53,
673 	0x0b44,	0x0b46,	0x0b47,	0x0b48,	0x0b4a,	0x0b4b,	0x0b4c,	0x003a,
674 	0x0022,	0x007e,	0x0700,	0x007c,	0x0b5a,	0x0b58,	0x0b43,	0x0b56,
675 	0x0b42,	0x0b4e,	0x0b4d,	0x003c,	0x003e,	0x003f,	0x0700,	0x030c,
676 	0x0703,	0x0020,	0x0207,	0x010a,	0x010b,	0x010c,	0x010d,	0x010e,
677 	0x010f,	0x0110,	0x0111,	0x0112,	0x0113,	0x0213,	0x0203,	0x0307,
678 	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
679 	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x003e,	0x010a,
680 	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
681 	0x030e,	0x0702,	0x030d,	0x0200,	0x0701,	0x0205,	0x0114,	0x0603,
682 	0x020b,	0x0601,	0x0602,	0x0117,	0x0600,	0x020a,	0x0115,	0x0116,
683 	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
684 	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
685 }, altgr_map[NR_KEYS] = {
686 	0x0200,	0x0200,	0x0200,	0x0040,	0x0200,	0x0024,	0x0200,	0x0200,
687 	0x007b,	0x005b,	0x005d,	0x007d,	0x005c,	0x0200,	0x0200,	0x0200,
688 	0x0b71,	0x0b77,	0x0918,	0x0b72,	0x0b74,	0x0b79,	0x0b75,	0x0b69,
689 	0x0b6f,	0x0b70,	0x0200,	0x007e,	0x0201,	0x0702,	0x0914,	0x0b73,
690 	0x0917,	0x0919,	0x0b67,	0x0b68,	0x0b6a,	0x0b6b,	0x0b6c,	0x0200,
691 	0x0200,	0x0200,	0x0700,	0x0200,	0x0b7a,	0x0b78,	0x0916,	0x0b76,
692 	0x0915,	0x0b6e,	0x0b6d,	0x0200,	0x0200,	0x0200,	0x0700,	0x030c,
693 	0x0703,	0x0200,	0x0207,	0x050c,	0x050d,	0x050e,	0x050f,	0x0510,
694 	0x0511,	0x0512,	0x0513,	0x0514,	0x0515,	0x0208,	0x0202,	0x0911,
695 	0x0912,	0x0913,	0x030b,	0x090e,	0x090f,	0x0910,	0x030a,	0x090b,
696 	0x090c,	0x090d,	0x090a,	0x0310,	0x0206,	0x0200,	0x007c,	0x0516,
697 	0x0517,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
698 	0x030e,	0x0702,	0x030d,	0x0200,	0x0701,	0x0205,	0x0114,	0x0603,
699 	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
700 	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
701 	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
702 }, ctrl_map[NR_KEYS] = {
703 	0x0200,	0x0200,	0x0200,	0x0000,	0x001b,	0x001c,	0x001d,	0x001e,
704 	0x001f,	0x007f,	0x0200,	0x0200,	0x001f,	0x0200,	0x0008,	0x0200,
705 	0x0011,	0x0017,	0x0005,	0x0012,	0x0014,	0x0019,	0x0015,	0x0009,
706 	0x000f,	0x0010,	0x001b,	0x001d,	0x0201,	0x0702,	0x0001,	0x0013,
707 	0x0004,	0x0006,	0x0007,	0x0008,	0x000a,	0x000b,	0x000c,	0x0200,
708 	0x0007,	0x0000,	0x0700,	0x001c,	0x001a,	0x0018,	0x0003,	0x0016,
709 	0x0002,	0x000e,	0x000d,	0x0200,	0x020e,	0x007f,	0x0700,	0x030c,
710 	0x0703,	0x0000,	0x0207,	0x0100,	0x0101,	0x0102,	0x0103,	0x0104,
711 	0x0105,	0x0106,	0x0107,	0x0108,	0x0109,	0x0208,	0x0204,	0x0307,
712 	0x0308,	0x0309,	0x030b,	0x0304,	0x0305,	0x0306,	0x030a,	0x0301,
713 	0x0302,	0x0303,	0x0300,	0x0310,	0x0206,	0x0200,	0x0200,	0x010a,
714 	0x010b,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
715 	0x030e,	0x0702,	0x030d,	0x001c,	0x0701,	0x0205,	0x0114,	0x0603,
716 	0x0118,	0x0601,	0x0602,	0x0117,	0x0600,	0x0119,	0x0115,	0x0116,
717 	0x011a,	0x010c,	0x010d,	0x011b,	0x011c,	0x0110,	0x0311,	0x011d,
718 	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,	0x0200,
719 };
720 
721 const u_short * const linux_keytabs[] = {
722 	plain_map, shift_map, altgr_map, altgr_map, ctrl_map
723 };
724 #endif
725 
726 static struct biosdisk_info *
727 fd2biosinfo(struct proc *p, struct file *fp)
728 {
729 	struct vnode *vp;
730 	const char *blkname;
731 	char diskname[16];
732 	int i;
733 	struct nativedisk_info *nip;
734 	struct disklist *dl = x86_alldisks;
735 
736 	if (dl == NULL)
737 		return NULL;
738 	if (fp->f_type != DTYPE_VNODE)
739 		return NULL;
740 	vp = (struct vnode *)fp->f_data;
741 
742 	if (vp->v_type != VBLK)
743 		return NULL;
744 
745 	blkname = devsw_blk2name(major(vp->v_rdev));
746 	snprintf(diskname, sizeof diskname, "%s%llu", blkname,
747 	    (unsigned long long)DISKUNIT(vp->v_rdev));
748 
749 	for (i = 0; i < dl->dl_nnativedisks; i++) {
750 		nip = &dl->dl_nativedisks[i];
751 		if (strcmp(diskname, nip->ni_devname))
752 			continue;
753 		if (nip->ni_nmatches != 0)
754 			return &dl->dl_biosdisks[nip->ni_biosmatches[0]];
755 	}
756 
757 	return NULL;
758 }
759 
760 
761 /*
762  * We come here in a last attempt to satisfy a Linux ioctl() call
763  */
764 int
765 linux_machdepioctl(struct lwp *l, const struct linux_sys_ioctl_args *uap, register_t *retval)
766 {
767 	/* {
768 		syscallarg(int) fd;
769 		syscallarg(u_long) com;
770 		syscallarg(void *) data;
771 	} */
772 	struct sys_ioctl_args bia;
773 	u_long com;
774 	int error, error1;
775 #if (NWSDISPLAY > 0)
776 	struct vt_mode lvt;
777 	struct kbentry kbe;
778 #endif
779 	struct linux_hd_geometry hdg;
780 	struct linux_hd_big_geometry hdg_big;
781 	struct biosdisk_info *bip;
782 	file_t *fp;
783 	int fd;
784 	struct disklabel label;
785 	struct partinfo partp;
786 	int (*ioctlf)(struct file *, u_long, void *);
787 	u_long start, biostotal, realtotal;
788 	u_char heads, sectors;
789 	u_int cylinders;
790 	struct ioctl_pt pt;
791 
792 	fd = SCARG(uap, fd);
793 	SCARG(&bia, fd) = fd;
794 	SCARG(&bia, data) = SCARG(uap, data);
795 	com = SCARG(uap, com);
796 
797 	if ((fp = fd_getfile(fd)) == NULL)
798 		return (EBADF);
799 
800 	switch (com) {
801 #if (NWSDISPLAY > 0)
802 	case LINUX_KDGKBMODE:
803 		com = KDGKBMODE;
804 		break;
805 	case LINUX_KDSKBMODE:
806 		com = KDSKBMODE;
807 		if ((unsigned)SCARG(uap, data) == LINUX_K_MEDIUMRAW)
808 			SCARG(&bia, data) = (void *)K_RAW;
809 		break;
810 	case LINUX_KIOCSOUND:
811 		SCARG(&bia, data) =
812 		    (void *)(((unsigned long)SCARG(&bia, data)) & 0xffff);
813 		/* fall through */
814 	case LINUX_KDMKTONE:
815 		com = KDMKTONE;
816 		break;
817 	case LINUX_KDSETMODE:
818 		com = KDSETMODE;
819 		break;
820 	case LINUX_KDGETMODE:
821 		/* KD_* values are equal to the wscons numbers */
822 		com = WSDISPLAYIO_GMODE;
823 		break;
824 	case LINUX_KDENABIO:
825 		com = KDENABIO;
826 		break;
827 	case LINUX_KDDISABIO:
828 		com = KDDISABIO;
829 		break;
830 	case LINUX_KDGETLED:
831 		com = KDGETLED;
832 		break;
833 	case LINUX_KDSETLED:
834 		com = KDSETLED;
835 		break;
836 	case LINUX_VT_OPENQRY:
837 		com = VT_OPENQRY;
838 		break;
839 	case LINUX_VT_GETMODE:
840 		error = fp->f_ops->fo_ioctl(fp, VT_GETMODE, &lvt);
841 		if (error != 0)
842 			goto out;
843 		lvt.relsig = native_to_linux_signo[lvt.relsig];
844 		lvt.acqsig = native_to_linux_signo[lvt.acqsig];
845 		lvt.frsig = native_to_linux_signo[lvt.frsig];
846 		error = copyout(&lvt, SCARG(uap, data), sizeof (lvt));
847 		goto out;
848 	case LINUX_VT_SETMODE:
849 		error = copyin(SCARG(uap, data), &lvt, sizeof (lvt));
850 		if (error != 0)
851 			goto out;
852 		lvt.relsig = linux_to_native_signo[lvt.relsig];
853 		lvt.acqsig = linux_to_native_signo[lvt.acqsig];
854 		lvt.frsig = linux_to_native_signo[lvt.frsig];
855 		error = fp->f_ops->fo_ioctl(fp, VT_SETMODE, &lvt);
856 		goto out;
857 	case LINUX_VT_DISALLOCATE:
858 		/* XXX should use WSDISPLAYIO_DELSCREEN */
859 		error = 0;
860 		goto out;
861 	case LINUX_VT_RELDISP:
862 		com = VT_RELDISP;
863 		break;
864 	case LINUX_VT_ACTIVATE:
865 		com = VT_ACTIVATE;
866 		break;
867 	case LINUX_VT_WAITACTIVE:
868 		com = VT_WAITACTIVE;
869 		break;
870 	case LINUX_VT_GETSTATE:
871 		com = VT_GETSTATE;
872 		break;
873 	case LINUX_KDGKBTYPE:
874 	    {
875 		static const u_int8_t kb101 = KB_101;
876 
877 		/* This is what Linux does. */
878 		error = copyout(&kb101, SCARG(uap, data), 1);
879 		goto out;
880 	    }
881 	case LINUX_KDGKBENT:
882 		/*
883 		 * The Linux KDGKBENT ioctl is different from the
884 		 * SYSV original. So we handle it in machdep code.
885 		 * XXX We should use keyboard mapping information
886 		 * from wsdisplay, but this would be expensive.
887 		 */
888 		if ((error = copyin(SCARG(uap, data), &kbe,
889 				    sizeof(struct kbentry))))
890 			goto out;
891 		if (kbe.kb_table >= sizeof(linux_keytabs) / sizeof(u_short *)
892 		    || kbe.kb_index >= NR_KEYS) {
893 			error = EINVAL;
894 			goto out;
895 		}
896 		kbe.kb_value = linux_keytabs[kbe.kb_table][kbe.kb_index];
897 		error = copyout(&kbe, SCARG(uap, data),
898 				sizeof(struct kbentry));
899 		goto out;
900 #endif
901 	case LINUX_HDIO_GETGEO:
902 	case LINUX_HDIO_GETGEO_BIG:
903 		/*
904 		 * Try to mimic Linux behaviour: return the BIOS geometry
905 		 * if possible (extending its # of cylinders if it's beyond
906 		 * the 1023 limit), fall back to the MI geometry (i.e.
907 		 * the real geometry) if not found, by returning an
908 		 * error. See common/linux_hdio.c
909 		 */
910 		bip = fd2biosinfo(curproc, fp);
911 		ioctlf = fp->f_ops->fo_ioctl;
912 		error = ioctlf(fp, DIOCGDINFO, (void *)&label);
913 		error1 = ioctlf(fp, DIOCGPARTINFO, (void *)&partp);
914 		if (error != 0 && error1 != 0) {
915 			error = error1;
916 			goto out;
917 		}
918 		start = error1 != 0 ? partp.pi_offset : 0;
919 		if (bip != NULL && bip->bi_head != 0 && bip->bi_sec != 0
920 		    && bip->bi_cyl != 0) {
921 			heads = bip->bi_head;
922 			sectors = bip->bi_sec;
923 			cylinders = bip->bi_cyl;
924 			biostotal = heads * sectors * cylinders;
925 			realtotal = label.d_ntracks * label.d_nsectors *
926 			    label.d_ncylinders;
927 			if (realtotal > biostotal)
928 				cylinders = realtotal / (heads * sectors);
929 		} else {
930 			heads = label.d_ntracks;
931 			cylinders = label.d_ncylinders;
932 			sectors = label.d_nsectors;
933 		}
934 		if (com == LINUX_HDIO_GETGEO) {
935 			hdg.start = start;
936 			hdg.heads = heads;
937 			hdg.cylinders = cylinders;
938 			hdg.sectors = sectors;
939 			error = copyout(&hdg, SCARG(uap, data), sizeof hdg);
940 			goto out;
941 		} else {
942 			hdg_big.start = start;
943 			hdg_big.heads = heads;
944 			hdg_big.cylinders = cylinders;
945 			hdg_big.sectors = sectors;
946 			error = copyout(&hdg_big, SCARG(uap, data),
947 			    sizeof hdg_big);
948 			goto out;
949 		}
950 
951 	default:
952 		/*
953 		 * Unknown to us. If it's on a device, just pass it through
954 		 * using PTIOCLINUX, the device itself might be able to
955 		 * make some sense of it.
956 		 * XXX hack: if the function returns EJUSTRETURN,
957 		 * it has stuffed a sysctl return value in pt.data.
958 		 */
959 		ioctlf = fp->f_ops->fo_ioctl;
960 		pt.com = SCARG(uap, com);
961 		pt.data = SCARG(uap, data);
962 		error = ioctlf(fp, PTIOCLINUX, &pt);
963 		if (error == EJUSTRETURN) {
964 			retval[0] = (register_t)pt.data;
965 			error = 0;
966 		}
967 
968 		if (error == ENOTTY) {
969 			DPRINTF(("linux_machdepioctl: invalid ioctl %08lx\n",
970 			    com));
971 		}
972 		goto out;
973 	}
974 	SCARG(&bia, com) = com;
975 	error = sys_ioctl(curlwp, &bia, retval);
976 out:
977 	fd_putfile(fd);
978 	return error;
979 }
980 
981 /*
982  * Set I/O permissions for a process. Just set the maximum level
983  * right away (ignoring the argument), otherwise we would have
984  * to rely on I/O permission maps, which are not implemented.
985  */
986 int
987 linux_sys_iopl(struct lwp *l, const struct linux_sys_iopl_args *uap, register_t *retval)
988 {
989 	/* {
990 		syscallarg(int) level;
991 	} */
992 	struct trapframe *fp = l->l_md.md_regs;
993 
994 	if (kauth_authorize_machdep(l->l_cred, KAUTH_MACHDEP_IOPL,
995 	    NULL, NULL, NULL, NULL) != 0)
996 		return EPERM;
997 	fp->tf_eflags |= PSL_IOPL;
998 	*retval = 0;
999 	return 0;
1000 }
1001 
1002 /*
1003  * See above. If a root process tries to set access to an I/O port,
1004  * just let it have the whole range.
1005  */
1006 int
1007 linux_sys_ioperm(struct lwp *l, const struct linux_sys_ioperm_args *uap, register_t *retval)
1008 {
1009 	/* {
1010 		syscallarg(unsigned int) lo;
1011 		syscallarg(unsigned int) hi;
1012 		syscallarg(int) val;
1013 	} */
1014 	struct trapframe *fp = l->l_md.md_regs;
1015 
1016 	if (kauth_authorize_machdep(l->l_cred, SCARG(uap, val) ?
1017 	    KAUTH_MACHDEP_IOPERM_SET : KAUTH_MACHDEP_IOPERM_GET, NULL, NULL,
1018 	    NULL, NULL) != 0)
1019 		return EPERM;
1020 	if (SCARG(uap, val))
1021 		fp->tf_eflags |= PSL_IOPL;
1022 	*retval = 0;
1023 	return 0;
1024 }
1025 
1026 int
1027 linux_usertrap(struct lwp *l, vaddr_t trapaddr,
1028     void *arg)
1029 {
1030 	return 0;
1031 }
1032 
1033 const char *
1034 linux_get_uname_arch(void)
1035 {
1036 	static char uname_arch[5] = "i386";
1037 
1038 	if (uname_arch[1] == '3')
1039 		uname_arch[1] += cpu_class;
1040 	return uname_arch;
1041 }
1042