/*
 * (MPSAFE)
 *
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/thread.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>

#include <machine/vmparam.h>

#include <sys/sysref2.h>
#include <sys/mplock2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_proc *vkp,
						  void *id);
static void vmspace_entry_delete(struct vmspace_entry *ve,
				 struct vkernel_proc *vkp);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
 * with vmspace_mcontrol().
 *
 * No requirements.
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
	struct vmspace_entry *ve;
	struct vkernel_proc *vkp;
	struct proc *p = curproc;
	int error;

	if (vkernel_enable == 0)
		return (EOPNOTSUPP);

	/*
	 * Create a virtual kernel side-structure for the process if one
	 * does not exist.
	 *
	 * Implement a simple resolution for SMP races.
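	 *
	 * The side-structure is allocated unconditionally, then proc_token
	 * is taken and p->p_vkernel is re-checked.  If another thread won
	 * the race and installed its own structure, the local allocation
	 * is freed and the existing structure is used instead.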
	 */
	if ((vkp = p->p_vkernel) == NULL) {
		vkp = kmalloc(sizeof(*vkp), M_VKERNEL, M_WAITOK|M_ZERO);
		lwkt_gettoken(&proc_token);
		if (p->p_vkernel == NULL) {
			vkp->refs = 1;
			lwkt_token_init(&vkp->token, "vkernel");
			RB_INIT(&vkp->root);
			p->p_vkernel = vkp;
		} else {
			kfree(vkp, M_VKERNEL);
			vkp = p->p_vkernel;
		}
		lwkt_reltoken(&proc_token);
	}

	get_mplock();

	/*
	 * Create a new VMSPACE, disallow conflicting ids
	 */
	ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
	ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	ve->id = uap->id;
	pmap_pinit2(vmspace_pmap(ve->vmspace));

	lwkt_gettoken(&vkp->token);
	if (RB_INSERT(vmspace_rb_tree, &vkp->root, ve)) {
		sysref_put(&ve->vmspace->vm_sysref);
		kfree(ve, M_VKERNEL);
		error = EEXIST;
	} else {
		error = 0;
	}
	lwkt_reltoken(&vkp->token);
	rel_mplock();
	return (error);
}

/*
 * Destroy a VMSPACE given its identifier.
 *
 * No requirements.
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done2;
	}
	if (ve->refs) {
		error = EBUSY;
		goto done2;
	}
	vmspace_entry_delete(ve, vkp);
	error = 0;
done2:
	lwkt_reltoken(&vkp->token);
done3:
	rel_mplock();
	return(error);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *		struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 *
 * No requirements.
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
	struct vkernel_proc *vkp;
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;
	struct lwp *lp;
	struct proc *p;
	int framesz;
	int error;

	lp = curthread->td_lwp;
	p = lp->lwp_proc;

	if ((vkp = p->p_vkernel) == NULL)
		return (EINVAL);

	get_mplock();
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done;
	}

	switch(uap->cmd) {
	case VMSPACE_CTL_RUN:
		/*
		 * Save the caller's register context, swap VM spaces, and
		 * install the passed register context.  Return with
		 * EJUSTRETURN so the syscall code doesn't adjust the context.
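		 *
		 * A reference on the vmspace_entry is held while the
		 * emulated context is installed; it is dropped again below
		 * on failure, or in vkernel_trap() when control returns to
		 * the virtual kernel.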
		 */
		atomic_add_int(&ve->refs, 1);
		framesz = sizeof(struct trapframe);
		if ((vklp = lp->lwp_vkernel) == NULL) {
			vklp = kmalloc(sizeof(*vklp), M_VKERNEL,
				       M_WAITOK|M_ZERO);
			lp->lwp_vkernel = vklp;
		}
		vklp->user_trapframe = uap->tframe;
		vklp->user_vextframe = uap->vframe;
		bcopy(uap->sysmsg_frame, &vklp->save_trapframe, framesz);
		bcopy(&curthread->td_tls, &vklp->save_vextframe.vx_tls,
		      sizeof(vklp->save_vextframe.vx_tls));
		error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
		if (error == 0) {
			error = copyin(&uap->vframe->vx_tls,
				       &curthread->td_tls,
				       sizeof(struct savetls));
		}
		if (error == 0)
			error = cpu_sanitize_frame(uap->sysmsg_frame);
		if (error == 0)
			error = cpu_sanitize_tls(&curthread->td_tls);
		if (error) {
			bcopy(&vklp->save_trapframe, uap->sysmsg_frame,
			      framesz);
			bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
			      sizeof(vklp->save_vextframe.vx_tls));
			set_user_TLS();
			atomic_subtract_int(&ve->refs, 1);
		} else {
			vklp->ve = ve;
			pmap_setlwpvm(lp, ve->vmspace);
			set_user_TLS();
			set_vkernel_fp(uap->sysmsg_frame);
			error = EJUSTRETURN;
		}
		break;
	default:
		error = EOPNOTSUPP;
		break;
	}
done:
	lwkt_reltoken(&vkp->token);
	rel_mplock();
	return(error);
}

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.  Most callers use this to create
 * a MAP_VPAGETABLE mapping.
 *
 * No requirements.
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	/*
	 * We hold the vmspace_token to serialize calls to
	 * vkernel_find_vmspace.
	 */
	lwkt_gettoken(&vmspace_token);
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}

	/*
	 * NOTE: kern_mmap() can block so we need to temporarily
	 *	 ref ve->refs.
	 */
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) != NULL) {
		atomic_add_int(&ve->refs, 1);
		error = kern_mmap(ve->vmspace, uap->addr, uap->len,
				  uap->prot, uap->flags,
				  uap->fd, uap->offset, &uap->sysmsg_resultp);
		atomic_subtract_int(&ve->refs, 1);
	} else {
		error = ENOENT;
	}
	lwkt_reltoken(&vkp->token);
done3:
	lwkt_reltoken(&vmspace_token);
	return (error);
}

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 *
 * No requirements.
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done2;
	}

	/*
	 * NOTE: kern_munmap() can block so we need to temporarily
	 *	 ref ve->refs.
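	 *
	 *	 While the ref is held, sys_vmspace_destroy() returns EBUSY
	 *	 rather than tearing the vmspace down underneath us.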
	 */
	atomic_add_int(&ve->refs, 1);

	/*
	 * Copied from sys_munmap()
	 */
	addr = (vm_offset_t)uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t)round_page(size);
	if (size < uap->len) {			/* wrap */
		error = EINVAL;
		goto done1;
	}
	tmpaddr = addr + size;			/* workaround gcc4 opt */
	if (tmpaddr < addr) {			/* wrap */
		error = EINVAL;
		goto done1;
	}
	if (size == 0) {
		error = 0;
		goto done1;
	}

	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	map = &ve->vmspace->vm_map;
	if (!vm_map_check_protection(map, addr, tmpaddr, VM_PROT_NONE, FALSE)) {
		error = EINVAL;
		goto done1;
	}
	vm_map_remove(map, addr, addr + size);
	error = 0;
done1:
	atomic_subtract_int(&ve->refs, 1);
done2:
	lwkt_reltoken(&vkp->token);
done3:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 * No requirements.
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done2;
	}
	error = EINVAL;
done2:
	lwkt_reltoken(&vkp->token);
done3:
	rel_mplock();
	return (error);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (not implemented yet)
 * No requirements.
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done2;
	}
	error = EINVAL;
done2:
	lwkt_reltoken(&vkp->token);
done3:
	rel_mplock();
	return (error);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 *
 * No requirements.
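 *
 * A virtual kernel typically uses this on regions it previously mapped
 * with vmspace_mmap(), for example to install the page directory backing
 * a MAP_VPAGETABLE mapping (illustrative sketch only, assuming the
 * MADV_SETMAP control operation from <sys/mman.h>):
 *
 *	vmspace_mcontrol(id, addr, len, MADV_SETMAP, pagedir_offset);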
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
	struct vkernel_proc *vkp;
	struct vmspace_entry *ve;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	get_mplock();
	if ((vkp = curproc->p_vkernel) == NULL) {
		error = EINVAL;
		goto done3;
	}
	lwkt_gettoken(&vkp->token);
	if ((ve = vkernel_find_vmspace(vkp, uap->id)) == NULL) {
		error = ENOENT;
		goto done2;
	}

	/*
	 * NOTE: kern_madvise() can block so we need to temporarily
	 *	 ref ve->refs.
	 */
	atomic_add_int(&ve->refs, 1);

	/*
	 * This code is basically copied from sys_mcontrol()
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END) {
		error = EINVAL;
		goto done1;
	}

	if (tmpaddr < (vm_offset_t)uap->addr) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS) {
		error = EINVAL;
		goto done1;
	}

	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	error = vm_map_madvise(&ve->vmspace->vm_map, start, end,
			       uap->behav, uap->value);
done1:
	atomic_subtract_int(&ve->refs, 1);
done2:
	lwkt_reltoken(&vkp->token);
done3:
	rel_mplock();
	return (error);
}

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/*
 * Compare two vmspace_entry structures by their opaque id; a->id is the
 * only field that has to be initialized in a lookup key.
 *
 * The caller must hold vkp->token.
 */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
	if ((char *)a->id < (char *)b->id)
		return(-1);
	else if ((char *)a->id > (char *)b->id)
		return(1);
	return(0);
}

/*
 * The caller must hold vkp->token.
 */
static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
	struct vkernel_proc *vkp = data;

	KKASSERT(ve->refs == 0);
	vmspace_entry_delete(ve, vkp);
	return(0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.
 *
 * The ve must be removed from the RB tree immediately, before any operation
 * that might potentially block.
 *
 * The caller must hold vkp->token.
 */
static
void
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_proc *vkp)
{
	RB_REMOVE(vmspace_rb_tree, &vkp->root, ve);

	pmap_remove_pages(vmspace_pmap(ve->vmspace),
			  VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	vm_map_remove(&ve->vmspace->vm_map,
		      VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
	sysref_put(&ve->vmspace->vm_sysref);
	kfree(ve, M_VKERNEL);
}

/*
 * Locate the ve for (id) and return it, or NULL if not found.  The lookup
 * itself does not bump ve->refs; a caller that might block while using the
 * ve bumps ve->refs itself, which prevents the ve from being destroyed
 * (but it can still be removed from the tree).
 *
 * The caller must hold vkp->token.
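 *
 * Typical caller pattern (sketch, mirroring sys_vmspace_mmap() above):
 *
 *	lwkt_gettoken(&vkp->token);
 *	if ((ve = vkernel_find_vmspace(vkp, id)) != NULL) {
 *		atomic_add_int(&ve->refs, 1);
 *		... operation that may block ...
 *		atomic_subtract_int(&ve->refs, 1);
 *	}
 *	lwkt_reltoken(&vkp->token);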
 */
static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_proc *vkp, void *id)
{
	struct vmspace_entry *ve;
	struct vmspace_entry key;

	key.id = id;
	ve = RB_FIND(vmspace_rb_tree, &vkp->root, &key);
	return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 *
 * No requirements.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
	struct vkernel_proc *vkp;

	vkp = p1->p_vkernel;
	KKASSERT(vkp->refs > 0);
	atomic_add_int(&vkp->refs, 1);
	p2->p_vkernel = vkp;
}

/*
 * No requirements.
 */
void
vkernel_exit(struct proc *p)
{
	struct vkernel_proc *vkp;
	struct lwp *lp;

	vkp = p->p_vkernel;

	/*
	 * Restore the original VM context if we are killed while running
	 * a different one.
	 *
	 * This isn't supposed to happen.  What is supposed to happen is
	 * that the process should enter vkernel_trap() before handling
	 * the signal.
	 */
	RB_FOREACH(lp, lwp_rb_tree, &p->p_lwp_tree) {
		vkernel_lwp_exit(lp);
	}

	/*
	 * Dereference the common area
	 */
	p->p_vkernel = NULL;
	KKASSERT(vkp->refs > 0);

	if (atomic_fetchadd_int(&vkp->refs, -1) == 1) {
		lwkt_gettoken(&vkp->token);
		RB_SCAN(vmspace_rb_tree, &vkp->root, NULL,
			rb_vmspace_delete, vkp);
		lwkt_reltoken(&vkp->token);
		kfree(vkp, M_VKERNEL);
	}
}

/*
 * No requirements.
 */
void
vkernel_lwp_exit(struct lwp *lp)
{
	struct vkernel_lwp *vklp;
	struct vmspace_entry *ve;

	if ((vklp = lp->lwp_vkernel) != NULL) {
		if ((ve = vklp->ve) != NULL) {
			kprintf("Warning, pid %d killed with "
				"active VC!\n", lp->lwp_proc->p_pid);
			pmap_setlwpvm(lp, lp->lwp_proc->p_vmspace);
			vklp->ve = NULL;
			KKASSERT(ve->refs > 0);
			atomic_subtract_int(&ve->refs, 1);
		}
		lp->lwp_vkernel = NULL;
		kfree(vklp, M_VKERNEL);
	}
}

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 *
 * No requirements.
 */
void
vkernel_trap(struct lwp *lp, struct trapframe *frame)
{
	struct proc *p = lp->lwp_proc;
	struct vmspace_entry *ve;
	struct vkernel_lwp *vklp;
	int error;

	/*
	 * Which vmspace entry was running?
	 */
	vklp = lp->lwp_vkernel;
	KKASSERT(vklp);
	ve = vklp->ve;
	KKASSERT(ve != NULL);

	/*
	 * Switch the LWP vmspace back to the virtual kernel's VM space.
	 */
	vklp->ve = NULL;
	pmap_setlwpvm(lp, p->p_vmspace);
	KKASSERT(ve->refs > 0);
	atomic_subtract_int(&ve->refs, 1);
	/* ve is invalid once we kill our ref */

	/*
	 * Copy the emulated process frame to the virtual kernel process.
	 * The emulated process cannot change TLS descriptors so don't
	 * bother saving them, we already have a copy.
	 *
	 * Restore the virtual kernel's saved context so the virtual kernel
	 * process can resume.
	 */
	error = copyout(frame, vklp->user_trapframe, sizeof(*frame));
	bcopy(&vklp->save_trapframe, frame, sizeof(*frame));
	bcopy(&vklp->save_vextframe.vx_tls, &curthread->td_tls,
	      sizeof(vklp->save_vextframe.vx_tls));
	set_user_TLS();
	cpu_vkernel_trap(frame, error);
}