/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vm/vm_vmspace.c,v 1.14 2007/08/15 03:15:07 dillon Exp $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>

#include <sys/spinlock2.h>
#include <sys/sysref2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id);
static void vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into
 * the VMSPACE with vmspace_mmap(), and MAP_VPAGETABLE sections can be
 * controlled with vmspace_mcontrol().
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;
	int error;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 */
	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		vkp->refs = 1;
		spin_init(&vkp->spin);
		RB_INIT(&vkp->root);
		curproc->p_vkernel = vkp;
	}

	/*
	 * Create a new VMSPACE
	 *
	 * XXX race if kmalloc blocks
	 */
	if (vkernel_find_vmspace(vkp, uap->id)) {
		error = EEXIST;
		goto done;
	}
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	pmap_pinit2(vmspace_pmap(ve->vmspace));
	RB_INSERT(vmspace_rb_tree, &vkp->root, ve);
	error = 0;
done:
	rel_mplock();
	return (error);
}
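
/*
 * Example: how a virtual kernel might use the calls above (an
 * illustrative sketch only, not part of this file).  It assumes the
 * userland stubs declared in <sys/vmspace.h> and the MAP_VPAGETABLE /
 * MADV_SETMAP definitions from <sys/mman.h>; the helper name and its
 * parameters (base, memfd, ram_size, pte_root) are invented for the
 * example and most error handling is omitted.  Check <sys/vmspace.h>
 * for the exact stub prototypes.
 *
 *	#include <sys/mman.h>
 *	#include <sys/vmspace.h>
 *	#include <err.h>
 *
 *	static char guest_id;		// any unique non-NULL address
 *
 *	static void
 *	guest_vmspace_setup(void *base, int memfd, size_t ram_size,
 *			    off_t pte_root)
 *	{
 *		// Create an empty VM space keyed by &guest_id; type and
 *		// data must currently be 0 / NULL.
 *		if (vmspace_create(&guest_id, 0, NULL) < 0)
 *			err(1, "vmspace_create");
 *
 *		// Back [base, base + ram_size) in the new VM space with
 *		// the vkernel's memory image as a MAP_VPAGETABLE mapping.
 *		vmspace_mmap(&guest_id, base, ram_size,
 *			     PROT_READ | PROT_WRITE | PROT_EXEC,
 *			     MAP_FILE | MAP_SHARED | MAP_FIXED |
 *			     MAP_VPAGETABLE,
 *			     memfd, 0);
 *
 *		// Set the root of the guest's virtual page table for
 *		// the range just mapped.
 *		vmspace_mcontrol(&guest_id, base, ram_size,
 *				 MADV_SETMAP, pte_root);
 *	}
 */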

/*
 * vmspace_destroy (void *id)
 *
 * Destroy a VMSPACE.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	if (ve->refs) {
		error = EBUSY;
		goto done;
	}
	vmspace_entry_delete(ve, vkp);
	error = 0;
done:
	rel_mplock();
	return(error);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	get_mplock();
	if ((vkp = p->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * Signal mailbox interlock
	 */
	if (p->p_flag & P_MAILBOX) {
		p->p_flag &= ~P_MAILBOX;
		error = EINTR;
		goto done;
	}

	switch(uap->cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
		 */
		atomic_add_int(&ve->refs, 1);
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		vklp->user_trapframe = uap->tframe;
		vklp->user_vextframe = uap->vframe;
		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
		if (error == 0)
			error = copyin(&uap->vframe->vx_tls, &curthread->td_tls, sizeof(struct savetls));
		if (error == 0)
			error = cpu_sanitize_frame(uap->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, uap->sysmsg_frame, framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
			atomic_subtract_int(&ve->refs, 1);
		} else {
			vklp->ve = ve;
			pmap_setlwpvm(lp, ve->vmspace);
			set_user_TLS();
			set_vkernel_fp(uap->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
done:
	rel_mplock();
	return(error);
}
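
/*
 * Example: a typical vkernel dispatch loop built on VMSPACE_CTL_RUN (an
 * illustrative sketch only, not part of this file).  The vkernel hands a
 * register context to the real kernel; when the emulated context faults,
 * traps, takes a signal, or issues a system call, vmspace_ctl() returns
 * and the trapframe that was passed in has been updated with the state
 * at the time of the event (see vkernel_trap() below).  guest_id is from
 * the sketch above; guest_init_frame() and guest_handle_exit() are
 * invented helpers standing in for the vkernel's own context setup and
 * event emulation.
 *
 *	struct trapframe tf;
 *	struct vextframe vf;
 *
 *	guest_init_frame(&tf, &vf);	// initial registers and TLS
 *	for (;;) {
 *		// Run the emulated context until it traps back to us.
 *		// An EINTR failure means a signal interrupted the switch.
 *		if (vmspace_ctl(&guest_id, VMSPACE_CTL_RUN, &tf, &vf) < 0 &&
 *		    errno != EINTR)
 *			err(1, "vmspace_ctl");
 *
 *		// tf now holds the guest state at the fault/trap/syscall;
 *		// emulate the event and decide whether to keep running.
 *		if (guest_handle_exit(&tf, &vf))
 *			break;
 *	}
 */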

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.  Most callers use this to create
 * a MAP_VPAGETABLE mapping.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->offset, &uap->sysmsg_resultp);
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * Copied from sys_munmap()
	 */
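	/*
	 * Page-align the request: round the start address down and the
	 * length up.  With 4K pages, for example, addr 0x10123 and
	 * len 0x1000 become addr 0x10000 and size 0x2000, covering both
	 * pages touched by the original range.  The checks below also
	 * reject length wrap-around and ranges outside the user address
	 * space.
	 */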
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
	if (size < uap->len) {		/* wrap */
		error = EINVAL;
		goto done;
	}
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr) {		/* wrap */
		error = EINVAL;
		goto done;
	}
	if (size == 0) {
		error = 0;
		goto done;
	}

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE)) {
		error = EINVAL;
		goto done;
	}
	vm_map_remove(map, addr, addr + size);
	error = 0;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the requested size, a page fault occurred in the VMSPACE
 * which the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = EINVAL;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the requested size, a page fault occurred in the VMSPACE
 * which the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = EINVAL;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
		error = EINVAL;
		goto done;
	}

	if (tmpaddr < (vm_offset_t)uap->addr) {
		error = EINVAL;
		goto done;
	}
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}

	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
				uap->behav, uap->value);
done:
	rel_mplock();
	return (error);
}
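
/*
 * Example: the most common vkernel use of vmspace_mcontrol() (an
 * illustrative sketch only, not part of this file).  After the virtual
 * kernel modifies a guest page-table entry inside a MAP_VPAGETABLE
 * region, it invalidates the real kernel's cached translations for the
 * affected range so the next access refaults through the virtual page
 * table.  MADV_INVAL comes from <sys/mman.h>; the helper name and its
 * arguments are invented, and guest_id is from the sketch above.
 *
 *	static void
 *	guest_pte_updated(void *gva, size_t len)
 *	{
 *		vmspace_mcontrol(&guest_id, gva, len, MADV_INVAL, 0);
 *	}
 */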

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/* a->id is the key, and is the only field that has to be initialized */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	KKASSERT(ve->refs == 0);
	vmspace_entry_delete(ve, vkp);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.
 */
static
void
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp)
{
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
		      VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	sysref_put(&ve->vmspace->vm_sysref);
	kfree(ve, M_VKERNEL);
}


static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;

	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

void
vkernel_exit(struct proc *p)
{
	struct vkernel_proc *vkp;
	struct lwp *lp;
	int freeme = 0;

	vkp = p->p_vkernel;
	/*
	 * Restore the original VM context if we are killed while running
	 * a different one.
	 *
	 * This isn't supposed to happen.  What is supposed to happen is
	 * that the process should enter vkernel_trap() before handling
	 * the signal.
	 */
	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
		vkernel_lwp_exit(lp);
	}

	/*
	 * Dereference the common area
	 */
	p->p_vkernel = NULL;
	KKASSERT(vkp->refs > 0);
	spin_lock_wr(&vkp->spin);
	if (--vkp->refs == 0)
		freeme = 1;
	spin_unlock_wr(&vkp->spin);

	if (freeme) {
		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
			rb_vmspace_delete, vkp);
		kfree(vkp, M_VKERNEL);
	}
}

void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		if ((ve = vklp->ve) != NULL) {
			kprintf("Warning, pid %d killed with "
				"active VC!\n", lp->lwp_proc->p_pid);
			print_backtrace();
			pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
			vklp->ve = NULL;
			KKASSERT(ve->refs > 0);
			atomic_subtract_int(&ve->refs, 1);
		}
		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 */
int
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);
	ve = vklp->ve;
	KKASSERT(ve != NULL);

	/*
	 * Switch the LWP vmspace back to the virtual kernel's VM space.
	 */
	vklp->ve = NULL;
	pmap_setlwpvm(lp, p->p_vmspace);
	KKASSERT(ve->refs > 0);
	atomic_subtract_int(&ve->refs, 1);

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them; we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	return(error);
}