/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vm/vm_vmspace.c,v 1.14 2007/08/15 03:15:07 dillon Exp $
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>

#include <sys/spinlock2.h>
#include <sys/sysref2.h>
#include <sys/mplock2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id);
static void vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
 * with vmspace_mcontrol().  (A hedged usage sketch appears in a comment
 * at the end of this file.)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;
	int error;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 */
	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		vkp->refs = 1;
		spin_init(&vkp->spin);
		RB_INIT(&vkp->root);
		curproc->p_vkernel = vkp;
	}

	/*
	 * Create a new VMSPACE
	 *
	 * XXX race if kmalloc blocks
	 */
	if (vkernel_find_vmspace(vkp, uap->id)) {
		error = EEXIST;
		goto done;
	}
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	pmap_pinit2(vmspace_pmap(ve->vmspace));
	RB_INSERT(vmspace_rb_tree, &vkp->root, ve);
	error = 0;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_destroy (void *id)
 *
 * Destroy a VMSPACE.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	if (ve->refs) {
		error = EBUSY;
		goto done;
	}
	vmspace_entry_delete(ve, vkp);
	error = 0;
done:
	rel_mplock();
	return(error);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	get_mplock();
	if ((vkp = p->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * Signal mailbox interlock
	 */
	if (p->p_flag & P_MAILBOX) {
		p->p_flag &= ~P_MAILBOX;
		error = EINTR;
		goto done;
	}

	switch(uap->cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
		 */
		atomic_add_int(&ve->refs, 1);
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		vklp->user_trapframe = uap->tframe;
		vklp->user_vextframe = uap->vframe;
		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
		if (error == 0)
			error = copyin(&uap->vframe->vx_tls, &curthread->td_tls,
				       sizeof(struct savetls));
		if (error == 0)
			error = cpu_sanitize_frame(uap->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, uap->sysmsg_frame,
			      framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
			atomic_subtract_int(&ve->refs, 1);
		} else {
			vklp->ve = ve;
			pmap_setlwpvm(lp, ve->vmspace);
			set_user_TLS();
			set_vkernel_fp(uap->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
done:
	rel_mplock();
	return(error);
}

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.  Most callers use this to create
 * a MAP_VPAGETABLE mapping.
 *
 * No requirements.
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	/*
	 * We hold the vmspace token to serialize calls to
	 * vkernel_find_vmspace and the vm token to serialize calls to
	 * kern_mmap.
	 */
	lwkt_gettoken(&vm_token);
	lwkt_gettoken(&vmspace_token);
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = kern_mmap(ve->vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->offset, &uap->sysmsg_resultp);
done:
	lwkt_reltoken(&vmspace_token);
	lwkt_reltoken(&vm_token);
	return (error);
}

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * Copied from sys_munmap()
	 */
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
	if (size < uap->len) {			/* wrap */
		error = EINVAL;
		goto done;
	}
	tmpaddr = addr + size;			/* workaround gcc4 opt */
	if (tmpaddr < addr) {			/* wrap */
		error = EINVAL;
		goto done;
	}
	if (size == 0) {
		error = 0;
		goto done;
	}

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
		error = EINVAL;
		goto done;
	}
	vm_map_remove(map, addr, addr + size);
	error = 0;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = EINVAL;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}
	error = EINVAL;
done:
	rel_mplock();
	return (error);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 *
 * MPALMOSTSAFE
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done;
	}
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
		error = EINVAL;
		goto done;
	}

	if (tmpaddr < (vm_offset_t)uap->addr) {
		error = EINVAL;
		goto done;
	}
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done;
	}

	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
			       uap->behav, uap->value);
done:
	rel_mplock();
	return (error);
}

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/* a->id is the address being compared; it is the only key field that must be initialized */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	KKASSERT(ve->refs == 0);
	vmspace_entry_delete(ve, vkp);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.
 */
static
void
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp)
{
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
		      VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	sysref_put(&ve->vmspace->vm_sysref);
	kfree(ve, M_VKERNEL);
}

static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;

	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

void
vkernel_exit(struct proc *p)
{
	struct vkernel_proc *vkp;
	struct lwp *lp;
	int freeme = 0;

	vkp = p->p_vkernel;
	/*
	 * Restore the original VM context if we are killed while running
	 * a different one.
	 *
	 * This isn't supposed to happen.  What is supposed to happen is
	 * that the process should enter vkernel_trap() before handling
	 * the signal.
	 */
	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
		vkernel_lwp_exit(lp);
	}

	/*
	 * Dereference the common area
	 */
	p->p_vkernel = NULL;
	KKASSERT(vkp->refs > 0);
	spin_lock_wr(&vkp->spin);
	if (--vkp->refs == 0)
		freeme = 1;
	spin_unlock_wr(&vkp->spin);

	if (freeme) {
		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
			rb_vmspace_delete, vkp);
		kfree(vkp, M_VKERNEL);
	}
}

void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		if ((ve = vklp->ve) != NULL) {
			kprintf("Warning, pid %d killed with "
				"active VC!\n", lp->lwp_proc->p_pid);
			print_backtrace(-1);
			pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
			vklp->ve = NULL;
			KKASSERT(ve->refs > 0);
			atomic_subtract_int(&ve->refs, 1);
		}
		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 */
void
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);
	ve = vklp->ve;
	KKASSERT(ve != NULL);

	/*
	 * Switch the LWP vmspace back to the virtual kernel's VM space.
	 */
	vklp->ve = NULL;
	pmap_setlwpvm(lp, p->p_vmspace);
	KKASSERT(ve->refs > 0);
	atomic_subtract_int(&ve->refs, 1);

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them, we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	cpu_vkernel_trap(frame, error);
}
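
/*
 * Hedged usage sketch (illustrative only, never compiled into the kernel).
 *
 * The #if 0 block below sketches how a userland virtual kernel might drive
 * the vmspace_*() system calls documented above.  It is a minimal example
 * under stated assumptions, not DragonFly's actual vkernel code:
 * GUEST_BASE, GUEST_SIZE and run_guest_once() are made-up names, the header
 * providing the userland prototypes and the exact userland return
 * conventions are assumed, and the machine-dependent trapframe/vextframe
 * setup is omitted.
 */
#if 0
#include <sys/types.h>
#include <sys/mman.h>
#include <sys/vkernel.h>		/* assumed location of the prototypes */
#include <err.h>
#include <string.h>

#define GUEST_BASE	((void *)0x100000)	/* hypothetical guest base */
#define GUEST_SIZE	(16 * 1024 * 1024)	/* hypothetical guest size */

static void
run_guest_once(void)
{
	struct trapframe tf;
	struct vextframe vf;
	void *id = &tf;		/* any unique non-NULL pointer serves as the id */
	void *mem;

	/* Create an empty VMSPACE; type and data must currently be 0. */
	if (vmspace_create(id, 0, NULL) < 0)
		err(1, "vmspace_create");

	/*
	 * Back part of it with anonymous memory.  A real vkernel would
	 * normally add MAP_VPAGETABLE here and then configure the mapping
	 * with vmspace_mcontrol().
	 */
	mem = vmspace_mmap(id, GUEST_BASE, GUEST_SIZE,
			   PROT_READ | PROT_WRITE | PROT_EXEC,
			   MAP_ANON | MAP_FIXED, -1, 0);
	if (mem == MAP_FAILED)
		err(1, "vmspace_mmap");

	/*
	 * Fill in a register context for the guest (machine dependent,
	 * omitted here) and transfer control.  vmspace_ctl() returns when
	 * the guest faults, traps, or issues a system call, with tf updated
	 * to reflect the guest's state at that point.
	 */
	bzero(&tf, sizeof(tf));
	bzero(&vf, sizeof(vf));
	if (vmspace_ctl(id, VMSPACE_CTL_RUN, &tf, &vf) < 0)
		err(1, "vmspace_ctl");

	/* ... dispatch on the reason for the return, then re-enter ... */

	vmspace_destroy(id);
}
#endif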