xref: /dflybsd-src/sys/vm/vm_vmspace.c (revision 4d9022e3888d071c1d026d8aa01aecd099e7bd9b)
/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vm/vm_vmspace.c,v 1.14 2007/08/15 03:15:07 dillon Exp $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>

#include <sys/spinlock2.h>
#include <sys/sysref2.h>
#include <sys/mplock2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id);
static void vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
 * with vmspace_mcontrol().
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;
	int error;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 */
	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		vkp->refs = 1;
		spin_init(&vkp->spin);
		RB_INIT(&vkp->root);
		curproc->p_vkernel = vkp;
	}

	/*
	 * Create a new VMSPACE
	 *
	 * XXX race if kmalloc blocks
	 */
	if (vkernel_find_vmspace(vkp, uap->id)) {
		error = EEXIST;
		goto done;
	}
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	pmap_pinit2(vmspace_pmap(ve->vmspace));
	RB_INSERT(vmspace_rb_tree, &vkp->root, ve);
	error = 0;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_destroy (void *id)
 *
 * Destroy a VMSPACE.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	if (ve->refs) {
		error = EBUSY;
		goto done;
	}
	vmspace_entry_delete(ve, vkp);
	error = 0;
done:
	rel_mplock();
	return(error);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	get_mplock();
	if ((vkp = p->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * Signal mailbox interlock
	 */
	if (p->p_flag & P_MAILBOX) {
		p->p_flag &= ~P_MAILBOX;
		error = EINTR;
		goto done;
	}

	switch(uap->cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
		 */
		atomic_add_int(&ve->refs, 1);
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		vklp->user_trapframe = uap->tframe;
		vklp->user_vextframe = uap->vframe;
		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
		if (error == 0)
			error = copyin(&uap->vframe->vx_tls, &curthread->td_tls, sizeof(struct savetls));
		if (error == 0)
			error = cpu_sanitize_frame(uap->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, uap->sysmsg_frame, framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
			atomic_subtract_int(&ve->refs, 1);
		} else {
			vklp->ve = ve;
			pmap_setlwpvm(lp, ve->vmspace);
			set_user_TLS();
			set_vkernel_fp(uap->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
done:
	rel_mplock();
	return(error);
}
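
/*
 * Illustrative only: a rough sketch of how a virtual kernel might drive
 * the syscalls above.  The userland prototypes are declared in
 * <sys/vmspace.h>; the variable names and the file-backed MAP_SHARED
 * mapping below are assumptions for illustration, not taken from this
 * file.
 *
 *	void *id = &guest_ctx;			(any unique non-NULL address)
 *	struct trapframe tf;
 *	struct vextframe vf;
 *
 *	vmspace_create(id, 0, NULL);
 *	vmspace_mmap(id, guest_base, guest_size, PROT_READ|PROT_WRITE,
 *		     MAP_VPAGETABLE|MAP_SHARED, mem_fd, 0);
 *	(fill tf with the guest's initial register state and vf.vx_tls
 *	 with its TLS descriptors)
 *	vmspace_ctl(id, VMSPACE_CTL_RUN, &tf, &vf);
 *	(returns when the guest page-faults, traps, receives a signal, or
 *	 makes a system call; tf then holds the guest's updated context)
 */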

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.  Most callers use this to create
 * a MAP_VPAGETABLE mapping.
 *
 * No requirements.
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	/*
	 * We hold the vmspace token to serialize calls to vkernel_find_vmspace
	 * and the vm token to serialize calls to kern_mmap.
	 */
	lwkt_gettoken(&vm_token);
	lwkt_gettoken(&vmspace_token);
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->offset, &uap->sysmsg_resultp);
done:
	lwkt_reltoken(&vmspace_token);
	lwkt_reltoken(&vm_token);
	return (error);
}

/*
 * vmspace_munmap(id, addr, len)
 *
 * unmap memory within a VMSPACE.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * Copied from sys_munmap()
	 */
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
	if (size < uap->len) {		/* wrap */
		error = EINVAL;
		goto done;
	}
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr) {		/* wrap */
		error = EINVAL;
		goto done;
	}
	if (size == 0) {
		error = 0;
		goto done;
	}

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
		error = EINVAL;
		goto done;
	}
	vm_map_remove(map, addr, addr + size);
	error = 0;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = EINVAL;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = EINVAL;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
		error = EINVAL;
		goto done;
	}

	if (tmpaddr < (vm_offset_t)uap->addr) {
		error = EINVAL;
		goto done;
	}
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}

	start = trunc_page((vm_offset_t) uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
				uap->behav, uap->value);
done:
	rel_mplock();
	return (error);
}

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/* a->id is the comparison key and the only field that has to be initialized */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

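/*
 * RB_SCAN callback used by vkernel_exit() to destroy any vmspace entries
 * still present when the last reference to the vkernel side-structure
 * goes away.  Entries must no longer be referenced at this point.
 */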
static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	KKASSERT(ve->refs == 0);
	vmspace_entry_delete(ve, vkp);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.
 */
static
void
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp)
{
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
		      VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	sysref_put(&ve->vmspace->vm_sysref);
	kfree(ve, M_VKERNEL);
}

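/*
 * Locate the vmspace_entry with the given id in the process's RB tree,
 * or return NULL if it does not exist.  The caller is responsible for
 * serializing access to the tree (the callers above hold the MP lock
 * or the vmspace token).
 */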
static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;

	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

void
vkernel_exit(struct proc *p)
{
	struct vkernel_proc *vkp;
	struct lwp *lp;
	int freeme = 0;

	vkp = p->p_vkernel;
	/*
	 * Restore the original VM context if we are killed while running
	 * a different one.
	 *
	 * This isn't supposed to happen.  What is supposed to happen is
	 * that the process should enter vkernel_trap() before handling
	 * the signal.
	 */
	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
		vkernel_lwp_exit(lp);
	}

	/*
	 * Dereference the common area
	 */
	p->p_vkernel = NULL;
	KKASSERT(vkp->refs > 0);
	spin_lock_wr(&vkp->spin);
	if (--vkp->refs == 0)
		freeme = 1;
	spin_unlock_wr(&vkp->spin);

	if (freeme) {
		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
			rb_vmspace_delete, vkp);
		kfree(vkp, M_VKERNEL);
	}
}

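/*
 * Clean up the per-lwp vkernel state.  If the lwp is still running an
 * emulated vmspace, switch it back to the real process vmspace and drop
 * the vmspace_entry reference before freeing the vkernel_lwp structure.
 */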
void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		if ((ve = vklp->ve) != NULL) {
			kprintf("Warning, pid %d killed with "
				"active VC!\n", lp->lwp_proc->p_pid);
			print_backtrace(-1);
			pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
			vklp->ve = NULL;
			KKASSERT(ve->refs > 0);
			atomic_subtract_int(&ve->refs, 1);
		}
		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 */
void
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);
	ve = vklp->ve;
	KKASSERT(ve != NULL);

	/*
	 * Switch the LWP vmspace back to the virtual kernel's VM space.
	 */
	vklp->ve = NULL;
	pmap_setlwpvm(lp, p->p_vmspace);
	KKASSERT(ve->refs > 0);
	atomic_subtract_int(&ve->refs, 1);

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them, we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	cpu_vkernel_trap(frame, error);
}
