/*
 * Copyright (c) 1988 University of Utah.
 * Copyright (c) 1991, 1993
 *	The Regents of the University of California.  All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *
 *	@(#)vm_mmap.c	8.4 (Berkeley) 1/12/94
 * $FreeBSD: src/sys/vm/vm_mmap.c,v 1.108.2.6 2002/07/02 20:06:19 dillon Exp $
 * $DragonFly: src/sys/vm/vm_mmap.c,v 1.39 2007/04/30 07:18:57 dillon Exp $
 */

/*
 * Mapped file (mmap) interface to VM
 */

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/filedesc.h>
#include <sys/kern_syscall.h>
#include <sys/proc.h>
#include <sys/priv.h>
#include <sys/resource.h>
#include <sys/resourcevar.h>
#include <sys/vnode.h>
#include <sys/fcntl.h>
#include <sys/file.h>
#include <sys/mman.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/vmmeter.h>
#include <sys/sysctl.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <sys/lock.h>
#include <vm/pmap.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_pager.h>
#include <vm/vm_pageout.h>
#include <vm/vm_extern.h>
#include <vm/vm_kern.h>

#include <sys/file2.h>
#include <sys/thread2.h>
#include <sys/mplock2.h>

static int max_proc_mmap;
SYSCTL_INT(_vm, OID_AUTO, max_proc_mmap, CTLFLAG_RW, &max_proc_mmap, 0, "");
int vkernel_enable;
SYSCTL_INT(_vm, OID_AUTO, vkernel_enable, CTLFLAG_RW, &vkernel_enable, 0, "");

/*
 * Set the maximum number of vm_map_entry structures per process.  Roughly
 * speaking vm_map_entry structures are tiny, so allowing them to eat 1/100
 * of our KVM malloc space still results in generous limits.  We want a
 * default that is good enough to prevent the kernel from running out of
 * resources if attacked from a compromised user account, but generous
 * enough that multi-threaded processes are not unduly inconvenienced.
 */

static void vmmapentry_rsrc_init (void *);
SYSINIT(vmmersrc, SI_BOOT1_POST, SI_ORDER_ANY, vmmapentry_rsrc_init, NULL)

static void
vmmapentry_rsrc_init(void *dummy)
{
	max_proc_mmap = KvaSize / sizeof(struct vm_map_entry);
	max_proc_mmap /= 100;
}

/*
 * MPSAFE
 */
int
sys_sbrk(struct sbrk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * sstk_args(int incr)
 *
 * MPSAFE
 */
int
sys_sstk(struct sstk_args *uap)
{
	/* Not yet implemented */
	return (EOPNOTSUPP);
}

/*
 * mmap_args(void *addr, size_t len, int prot, int flags, int fd,
 *	     long pad, off_t pos)
 *
 * Memory Map (mmap) system call.  Note that the file offset
 * and address are allowed to be NOT page aligned, though if
 * the MAP_FIXED flag is set, both must have the same remainder
 * modulo the PAGE_SIZE (POSIX 1003.1b).  If the address is not
 * page-aligned, the actual mapping starts at trunc_page(addr)
 * and the return value is adjusted up by the page offset.
 *
 * Generally speaking, only character devices which are themselves
 * memory-based, such as a video framebuffer, can be mmap'd.  Otherwise
 * there would be no cache coherency between a descriptor and a VM mapping
 * both to the same character device.
 *
 * Block devices can be mmap'd no matter what they represent.  Cache coherency
 * is maintained as long as you do not write directly to the underlying
 * character device.
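 *
 * Purely illustrative sketch of the offset arithmetic (hypothetical
 * values, assuming a 4K PAGE_SIZE): a request with addr = 0x10234 and
 * pos = 0x5234 has the same remainder 0x234 modulo PAGE_SIZE, so it is
 * acceptable even with MAP_FIXED; the kernel maps from
 * trunc_page(0x10234) = 0x10000 using the page-aligned file offset
 * 0x5000, and the caller gets back 0x10000 + 0x234 = 0x10234.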
 */

int
kern_mmap(struct vmspace *vms, caddr_t uaddr, size_t ulen,
	  int uprot, int uflags, int fd, off_t upos, void **res)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	struct file *fp = NULL;
	struct vnode *vp;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot, maxprot;
	void *handle;
	int flags, error;
	int disablexworkaround;
	off_t pos;
	vm_object_t obj;

	KKASSERT(p);

	addr = (vm_offset_t) uaddr;
	size = ulen;
	prot = uprot & VM_PROT_ALL;
	flags = uflags;
	pos = upos;

	/*
	 * Make sure mapping fits into numeric range etc.
	 *
	 * NOTE: We support the full unsigned range for size now.
	 */
	if ((flags & MAP_ANON) && (fd != -1 || pos != 0))
		return (EINVAL);

	if (flags & MAP_STACK) {
		if ((fd != -1) ||
		    ((prot & (PROT_READ | PROT_WRITE)) != (PROT_READ | PROT_WRITE)))
			return (EINVAL);
		flags |= MAP_ANON;
		pos = 0;
	}

	/*
	 * Virtual page tables cannot be used with MAP_STACK.  Apart from
	 * it not making any sense, the aux union is used by both
	 * types.
	 *
	 * Because the virtual page table is stored in the backing object
	 * and might be updated by the kernel, the mapping must be R+W.
	 */
	if (flags & MAP_VPAGETABLE) {
		if (vkernel_enable == 0)
			return (EOPNOTSUPP);
		if (flags & MAP_STACK)
			return (EINVAL);
		if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE))
			return (EINVAL);
	}

	/*
	 * Align the file position to a page boundary,
	 * and save its page offset component.
	 */
	pageoff = (pos & PAGE_MASK);
	pos -= pageoff;

	/* Adjust size for rounding (on both ends). */
	size += pageoff;			/* low end... */
	size = (vm_size_t) round_page(size);	/* hi end */
	if (size < ulen)			/* wrap */
		return(EINVAL);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (flags & (MAP_FIXED | MAP_TRYFIXED)) {
		/*
		 * The specified address must have the same remainder
		 * as the file offset taken modulo PAGE_SIZE, so it
		 * should be aligned after adjustment by pageoff.
		 */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		/*
		 * Address range must be all in user VM space and not wrap.
		 */
		tmpaddr = addr + size;
		if (tmpaddr < addr)
			return (EINVAL);
		if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
			return (EINVAL);
		if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
			return (EINVAL);
	} else {
		/*
		 * Set a reasonable start point for the hint if it was
		 * not specified or if it falls within the heap space.
		 * Hinted mmap()s do not allocate out of the heap space.
		 */
		if (addr == 0 ||
		    (addr >= round_page((vm_offset_t)vms->vm_taddr) &&
		     addr < round_page((vm_offset_t)vms->vm_daddr + maxdsiz)))
			addr = round_page((vm_offset_t)vms->vm_daddr + maxdsiz);
	}

	if (flags & MAP_ANON) {
		/*
		 * Mapping blank space is trivial.
		 */
		handle = NULL;
		maxprot = VM_PROT_ALL;
	} else {
		/*
		 * Mapping file, get fp for validation.  Obtain vnode and make
		 * sure it is of appropriate type.
		 */
		fp = holdfp(p->p_fd, fd, -1);
		if (fp == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			error = EINVAL;
			goto done;
		}
		/*
		 * POSIX shared-memory objects are defined to have
		 * kernel persistence, and are not defined to support
		 * read(2)/write(2) -- or even open(2).  Thus, we can
		 * use MAP_ASYNC to trade on-disk coherence for speed.
		 * The shm_open(3) library routine turns on the FPOSIXSHM
		 * flag to request this behavior.
		 */
		if (fp->f_flag & FPOSIXSHM)
			flags |= MAP_NOSYNC;
		vp = (struct vnode *) fp->f_data;

		/*
		 * Validate the vnode for the operation.
		 */
		switch(vp->v_type) {
		case VREG:
			/*
			 * Get the proper underlying object
			 */
			if ((obj = vp->v_object) == NULL) {
				error = EINVAL;
				goto done;
			}
			KKASSERT((struct vnode *)obj->handle == vp);
			break;
		case VCHR:
			/*
			 * Make sure a device has not been revoked.
			 * Mappability is handled by the device layer.
			 */
			if (vp->v_rdev == NULL) {
				error = EBADF;
				goto done;
			}
			break;
		default:
			/*
			 * Nothing else is mappable.
			 */
			error = EINVAL;
			goto done;
		}

		/*
		 * XXX hack to handle use of /dev/zero to map anon memory (ala
		 * SunOS).
		 */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			handle = NULL;
			maxprot = VM_PROT_ALL;
			flags |= MAP_ANON;
			pos = 0;
		} else {
			/*
			 * cdevs do not provide private mappings of any kind.
			 *
			 * However, for the XIG X server to continue to work,
			 * we should allow the superuser to do it anyway.
			 * We only allow it at securelevel < 1.
			 * (Because the XIG X server writes directly to video
			 * memory via /dev/mem, it should never work at any
			 * other securelevel.)
			 * XXX this will have to go
			 */
			if (securelevel >= 1)
				disablexworkaround = 1;
			else
				disablexworkaround = priv_check(td, PRIV_ROOT);
			if (vp->v_type == VCHR && disablexworkaround &&
			    (flags & (MAP_PRIVATE|MAP_COPY))) {
				error = EINVAL;
				goto done;
			}
			/*
			 * Ensure that file and memory protections are
			 * compatible.  Note that we only worry about
			 * writability if mapping is shared; in this case,
			 * current and max prot are dictated by the open file.
			 * XXX use the vnode instead?  Problem is: what
			 * credentials do we use for determination?  What if
			 * proc does a setuid?
			 */
			maxprot = VM_PROT_EXECUTE;	/* ??? */
			if (fp->f_flag & FREAD) {
				maxprot |= VM_PROT_READ;
			} else if (prot & PROT_READ) {
				error = EACCES;
				goto done;
			}
			/*
			 * If we are sharing potential changes (either via
			 * MAP_SHARED or via the implicit sharing of character
			 * device mappings), and we are trying to get write
			 * permission although we opened it without asking
			 * for it, bail out.  Check for superuser, only if
			 * we're at securelevel < 1, to allow the XIG X server
			 * to continue to work.
			 */
			if ((flags & MAP_SHARED) != 0 ||
			    (vp->v_type == VCHR && disablexworkaround)) {
				if ((fp->f_flag & FWRITE) != 0) {
					struct vattr va;
					if ((error = VOP_GETATTR(vp, &va))) {
						goto done;
					}
					if ((va.va_flags &
					    (IMMUTABLE|APPEND)) == 0) {
						maxprot |= VM_PROT_WRITE;
					} else if (prot & PROT_WRITE) {
						error = EPERM;
						goto done;
					}
				} else if ((prot & PROT_WRITE) != 0) {
					error = EACCES;
					goto done;
				}
			} else {
				maxprot |= VM_PROT_WRITE;
			}
			handle = (void *)vp;
		}
	}

	/*
	 * Do not allow more than a certain number of vm_map_entry structures
	 * per process.  Scale with the number of rforks sharing the map
	 * to make the limit reasonable for threads.
	 */
	if (max_proc_mmap &&
	    vms->vm_map.nentries >= max_proc_mmap * vms->vm_sysref.refcnt) {
		error = ENOMEM;
		goto done;
	}

	error = vm_mmap(&vms->vm_map, &addr, size, prot, maxprot,
			flags, handle, pos);
	if (error == 0)
		*res = (void *)(addr + pageoff);
done:
	if (fp)
		fdrop(fp);
	return (error);
}

/*
 * MPALMOSTSAFE
 */
int
sys_mmap(struct mmap_args *uap)
{
	int error;

	get_mplock();
	error = kern_mmap(curproc->p_vmspace, uap->addr, uap->len,
			  uap->prot, uap->flags,
			  uap->fd, uap->pos, &uap->sysmsg_resultp);
	rel_mplock();

	return (error);
}

/*
 * msync_args(void *addr, size_t len, int flags)
 *
 * MPALMOSTSAFE
 */
int
sys_msync(struct msync_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int flags;
	vm_map_t map;
	int rv;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	flags = uap->flags;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if ((flags & (MS_ASYNC|MS_INVALIDATE)) == (MS_ASYNC|MS_INVALIDATE))
		return (EINVAL);

	get_mplock();
	map = &p->p_vmspace->vm_map;

	/*
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we don't
	 * really keep track of individual mmaps so we approximate by flushing
	 * the range of the map entry containing addr.  This can be incorrect
	 * if the region splits or is coalesced with a neighbor.
	 */
	if (size == 0) {
		vm_map_entry_t entry;

		vm_map_lock_read(map);
		rv = vm_map_lookup_entry(map, addr, &entry);
		if (rv == FALSE) {
			vm_map_unlock_read(map);
			rv = KERN_INVALID_ADDRESS;
			goto done;
		}
		addr = entry->start;
		size = entry->end - entry->start;
		vm_map_unlock_read(map);
	}

	/*
	 * Clean the pages and interpret the return value.
	 */
	rv = vm_map_clean(map, addr, addr + size, (flags & MS_ASYNC) == 0,
			  (flags & MS_INVALIDATE) != 0);
done:
	rel_mplock();

	switch (rv) {
	case KERN_SUCCESS:
		break;
	case KERN_INVALID_ADDRESS:
		return (EINVAL);	/* Sun returns ENOMEM? */
	case KERN_FAILURE:
		return (EIO);
	default:
		return (EINVAL);
	}

	return (0);
}

/*
 * munmap_args(void *addr, size_t len)
 *
 * MPALMOSTSAFE
 */
int
sys_munmap(struct munmap_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_map_t map;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	if (size == 0)
		return (0);

	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	get_mplock();
	map = &p->p_vmspace->vm_map;
	/*
	 * Make sure entire range is allocated.
	 */
	if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE)) {
		rel_mplock();
		return (EINVAL);
	}
	/* returns nothing but KERN_SUCCESS anyway */
	vm_map_remove(map, addr, addr + size);
	rel_mplock();
	return (0);
}

/*
 * mprotect_args(const void *addr, size_t len, int prot)
 *
 * MPALMOSTSAFE
 */
int
sys_mprotect(struct mprotect_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_prot_t prot;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;
	prot = uap->prot & VM_PROT_ALL;
#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;
#endif

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	get_mplock();
	switch (vm_map_protect(&p->p_vmspace->vm_map, addr, addr + size,
			       prot, FALSE)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	rel_mplock();
	return (error);
}

/*
 * minherit_args(void *addr, size_t len, int inherit)
 *
 * MPALMOSTSAFE
 */
int
sys_minherit(struct minherit_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vm_offset_t)uap->addr;
	size = uap->len;
	inherit = uap->inherit;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return(EINVAL);

	get_mplock();

	switch (vm_map_inherit(&p->p_vmspace->vm_map, addr,
			       addr + size, inherit)) {
	case KERN_SUCCESS:
		error = 0;
		break;
	case KERN_PROTECTION_FAILURE:
		error = EACCES;
		break;
	default:
		error = EINVAL;
		break;
	}
	rel_mplock();
	return (error);
}

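/*
 * The syscalls above are normally reached through the libc wrappers.  As a
 * minimal userland sketch only (not kernel code; it simply assumes the usual
 * POSIX mmap/mprotect/munmap interfaces and a hypothetical helper name):
 *
 *	#include <sys/mman.h>
 *	#include <stddef.h>
 *
 *	static void *
 *	readonly_scratch(size_t len)
 *	{
 *		void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
 *			       MAP_ANON | MAP_PRIVATE, -1, 0);
 *		if (p == MAP_FAILED)
 *			return (NULL);
 *		// Illustrative: immediately drop write permission again.
 *		if (mprotect(p, len, PROT_READ) == -1) {
 *			munmap(p, len);
 *			return (NULL);
 *		}
 *		return (p);
 *	}
 *
 * An anonymous mapping passes fd == -1 and a zero offset (enforced in
 * kern_mmap() above), and mprotect()/munmap() operate on the same
 * page-rounded range handled by sys_mprotect()/sys_munmap().
 */
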
/*
 * madvise_args(void *addr, size_t len, int behav)
 *
 * MPALMOSTSAFE
 */
int
sys_madvise(struct madvise_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav >= MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t)uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	get_mplock();
	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, 0);
	rel_mplock();
	return (error);
}

/*
 * mcontrol_args(void *addr, size_t len, int behav, off_t value)
 *
 * MPALMOSTSAFE
 */
int
sys_mcontrol(struct mcontrol_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t start, end;
	vm_offset_t tmpaddr = (vm_offset_t)uap->addr + uap->len;
	int error;

	/*
	 * Check for illegal behavior
	 */
	if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
		return (EINVAL);
	/*
	 * Check for illegal addresses.  Watch out for address wrap... Note
	 * that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (tmpaddr < (vm_offset_t) uap->addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && tmpaddr > VM_MAX_USER_ADDRESS)
		return (EINVAL);
	if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Since this routine is only advisory, we default to conservative
	 * behavior.
	 */
	start = trunc_page((vm_offset_t)uap->addr);
	end = round_page(tmpaddr);

	get_mplock();
	error = vm_map_madvise(&p->p_vmspace->vm_map, start, end,
			       uap->behav, uap->value);
	rel_mplock();
	return (error);
}

/*
 * mincore_args(const void *addr, size_t len, char *vec)
 *
 * MPALMOSTSAFE
 */
int
sys_mincore(struct mincore_args *uap)
{
	struct proc *p = curproc;
	vm_offset_t addr, first_addr;
	vm_offset_t end, cend;
	pmap_t pmap;
	vm_map_t map;
	char *vec;
	int error;
	int vecindex, lastvecindex;
	vm_map_entry_t current;
	vm_map_entry_t entry;
	int mincoreinfo;
	unsigned int timestamp;

	/*
	 * Make sure that the addresses presented are valid for user
	 * mode.
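	 *
	 * As an illustration: the scan below reports one status byte per
	 * page of the page-rounded range, so a request covering N bytes
	 * consumes roughly howmany(N, PAGE_SIZE) entries of vec, each
	 * carrying MINCORE_INCORE and related bits.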
	 */
	first_addr = addr = trunc_page((vm_offset_t) uap->addr);
	end = addr + (vm_size_t)round_page(uap->len);
	if (end < addr)
		return (EINVAL);
	if (VM_MAX_USER_ADDRESS > 0 && end > VM_MAX_USER_ADDRESS)
		return (EINVAL);

	/*
	 * Address of byte vector
	 */
	vec = uap->vec;

	map = &p->p_vmspace->vm_map;
	pmap = vmspace_pmap(p->p_vmspace);

	get_mplock();
	vm_map_lock_read(map);
RestartScan:
	timestamp = map->timestamp;

	if (!vm_map_lookup_entry(map, addr, &entry))
		entry = entry->next;

	/*
	 * Do this on a map entry basis so that if the pages are not
	 * in the current process's address space, we can easily look
	 * up the pages elsewhere.
	 */
	lastvecindex = -1;
	for (current = entry;
	     (current != &map->header) && (current->start < end);
	     current = current->next) {

		/*
		 * ignore submaps (for now) or null objects
		 */
		if (current->maptype != VM_MAPTYPE_NORMAL &&
		    current->maptype != VM_MAPTYPE_VPAGETABLE) {
			continue;
		}
		if (current->object.vm_object == NULL)
			continue;

		/*
		 * limit this scan to the current map entry and the
		 * limits for the mincore call
		 */
		if (addr < current->start)
			addr = current->start;
		cend = current->end;
		if (cend > end)
			cend = end;

		/*
		 * scan this entry one page at a time
		 */
		while (addr < cend) {
			/*
			 * Check pmap first, it is likely faster, also
			 * it can provide info as to whether we are the
			 * one referencing or modifying the page.
			 *
			 * If we have to check the VM object, only mess
			 * around with normal maps.  Do not mess around
			 * with virtual page tables (XXX).
			 */
			mincoreinfo = pmap_mincore(pmap, addr);
			if (mincoreinfo == 0 &&
			    current->maptype == VM_MAPTYPE_NORMAL) {
				vm_pindex_t pindex;
				vm_ooffset_t offset;
				vm_page_t m;

				/*
				 * calculate the page index into the object
				 */
				offset = current->offset + (addr - current->start);
				pindex = OFF_TO_IDX(offset);

				/*
				 * if the page is resident, then gather
				 * information about it.  spl protection is
				 * required to maintain the object
				 * association.  And XXX what if the page is
				 * busy?  What's the deal with that?
				 */
				crit_enter();
				m = vm_page_lookup(current->object.vm_object,
						   pindex);
				if (m && m->valid) {
					mincoreinfo = MINCORE_INCORE;
					if (m->dirty ||
					    pmap_is_modified(m))
						mincoreinfo |= MINCORE_MODIFIED_OTHER;
					if ((m->flags & PG_REFERENCED) ||
					    pmap_ts_referenced(m)) {
						vm_page_flag_set(m, PG_REFERENCED);
						mincoreinfo |= MINCORE_REFERENCED_OTHER;
					}
				}
				crit_exit();
			}

			/*
			 * subyte may page fault.  In case it needs to modify
			 * the map, we release the lock.
			 */
			vm_map_unlock_read(map);

			/*
			 * calculate index into user supplied byte vector
			 */
			vecindex = OFF_TO_IDX(addr - first_addr);

			/*
			 * If we have skipped map entries, we need to make
			 * sure that the byte vector is zeroed for those
			 * skipped entries.
			 */
			while ((lastvecindex + 1) < vecindex) {
				++lastvecindex;
				error = subyte(vec + lastvecindex, 0);
				if (error) {
					error = EFAULT;
					goto done;
				}
			}

			/*
			 * Pass the page information to the user
			 */
			error = subyte(vec + vecindex, mincoreinfo);
			if (error) {
				error = EFAULT;
				goto done;
			}

			/*
			 * If the map has changed, due to the subyte, the
			 * previous output may be invalid.
			 */
			vm_map_lock_read(map);
			if (timestamp != map->timestamp)
				goto RestartScan;

			lastvecindex = vecindex;
			addr += PAGE_SIZE;
		}
	}

	/*
	 * subyte may page fault.  In case it needs to modify
	 * the map, we release the lock.
	 */
	vm_map_unlock_read(map);

	/*
	 * Zero the last entries in the byte vector.
	 */
	vecindex = OFF_TO_IDX(end - first_addr);
	while ((lastvecindex + 1) < vecindex) {
		++lastvecindex;
		error = subyte(vec + lastvecindex, 0);
		if (error) {
			error = EFAULT;
			goto done;
		}
	}

	/*
	 * If the map has changed, due to the subyte, the previous
	 * output may be invalid.
	 */
	vm_map_lock_read(map);
	if (timestamp != map->timestamp)
		goto RestartScan;
	vm_map_unlock_read(map);

	error = 0;
done:
	rel_mplock();
	return (error);
}

/*
 * mlock_args(const void *addr, size_t len)
 *
 * MPALMOSTSAFE
 */
int
sys_mlock(struct mlock_args *uap)
{
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);
	if (size < uap->len)		/* wrap */
		return(EINVAL);
	tmpaddr = addr + size;		/* workaround gcc4 opt */
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

	if (atop(size) + vmstats.v_wire_count > vm_page_max_wired)
		return (EAGAIN);

	get_mplock();
#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur) {
		rel_mplock();
		return (ENOMEM);
	}
#else
	error = priv_check_cred(td->td_ucred, PRIV_ROOT, 0);
	if (error) {
		rel_mplock();
		return (error);
	}
#endif
	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, FALSE);
	rel_mplock();
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * mlockall_args(int how)
 *
 * Dummy routine, doesn't actually do anything.
 *
 * MPSAFE
 */
int
sys_mlockall(struct mlockall_args *uap)
{
	return (ENOSYS);
}

/*
 * munlockall_args(void)
 *
 * Dummy routine, doesn't actually do anything.
 *
 * MPSAFE
 */
int
sys_munlockall(struct munlockall_args *uap)
{
	return (ENOSYS);
}

/*
 * munlock_args(const void *addr, size_t len)
 *
 * MPALMOSTSAFE
 */
int
sys_munlock(struct munlock_args *uap)
{
	struct thread *td = curthread;
	struct proc *p = td->td_proc;
	vm_offset_t addr;
	vm_offset_t tmpaddr;
	vm_size_t size, pageoff;
	int error;

	addr = (vm_offset_t) uap->addr;
	size = uap->len;

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vm_size_t) round_page(size);

	tmpaddr = addr + size;
	if (tmpaddr < addr)		/* wrap */
		return (EINVAL);

#ifndef pmap_wired_count
	error = priv_check(td, PRIV_ROOT);
	if (error)
		return (error);
#endif

	get_mplock();
	error = vm_map_unwire(&p->p_vmspace->vm_map, addr, addr + size, TRUE);
	rel_mplock();
	return (error == KERN_SUCCESS ? 0 : ENOMEM);
}

/*
 * Internal version of mmap.
 * Currently used by mmap, exec, and sys5 shared memory.
 * Handle is either a vnode pointer or NULL for MAP_ANON.
 */
int
vm_mmap(vm_map_t map, vm_offset_t *addr, vm_size_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, vm_ooffset_t foff)
{
	boolean_t fitit;
	vm_object_t object;
	vm_offset_t eaddr;
	vm_size_t esize;
	struct vnode *vp;
	struct thread *td = curthread;
	struct proc *p;
	int rv = KERN_SUCCESS;
	off_t objsize;
	int docow;

	if (size == 0)
		return (0);

	objsize = round_page(size);
	if (objsize < size)
		return (EINVAL);
	size = objsize;

	/*
	 * XXX messy code, fixme
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize them out.
	 */
	if ((p = curproc) != NULL && map == &p->p_vmspace->vm_map) {
		esize = map->size + size;	/* workaround gcc4 opt */
		if (esize < map->size ||
		    esize > p->p_rlimit[RLIMIT_VMEM].rlim_cur) {
			return(ENOMEM);
		}
	}

	/*
	 * We currently can only deal with page aligned file offsets.
	 * The check is here rather than in the syscall because the
	 * kernel calls this function internally for other mmapping
	 * operations (such as in exec) and non-aligned offsets will
	 * cause pmap inconsistencies...so we want to be sure to
	 * disallow this in all cases.
	 *
	 * NOTE: Overflow checks require discrete statements or GCC4
	 *	 will optimize them out.
	 */
	if (foff & PAGE_MASK)
		return (EINVAL);

	if ((flags & (MAP_FIXED | MAP_TRYFIXED)) == 0) {
		fitit = TRUE;
		*addr = round_page(*addr);
	} else {
		if (*addr != trunc_page(*addr))
			return (EINVAL);
		eaddr = *addr + size;
		if (eaddr < *addr)
			return (EINVAL);
		fitit = FALSE;
		if ((flags & MAP_TRYFIXED) == 0)
			vm_map_remove(map, *addr, *addr + size);
	}

	/*
	 * Lookup/allocate object.
	 */
	if (flags & MAP_ANON) {
		/*
		 * Unnamed anonymous regions always start at 0.
		 */
		if (handle) {
			/*
			 * Default memory object
			 */
			object = default_pager_alloc(handle, objsize,
						     prot, foff);
			if (object == NULL)
				return(ENOMEM);
			docow = MAP_PREFAULT_PARTIAL;
		} else {
			/*
			 * Implicit single instance of a default memory
			 * object, so we don't need a VM object yet.
			 */
			foff = 0;
			object = NULL;
			docow = 0;
		}
		vp = NULL;
	} else {
		vp = (struct vnode *)handle;
		if (vp->v_type == VCHR) {
			/*
			 * Device mappings (device size unknown?).
			 * Force them to be shared.
			 */
			handle = (void *)(intptr_t)vp->v_rdev;
			object = dev_pager_alloc(handle, objsize, prot, foff);
			if (object == NULL)
				return(EINVAL);
			docow = MAP_PREFAULT_PARTIAL;
			flags &= ~(MAP_PRIVATE|MAP_COPY);
			flags |= MAP_SHARED;
		} else {
			/*
			 * Regular file mapping (typically).  The attribute
			 * check is for the link count test only.  Mmapable
			 * vnodes must already have a VM object assigned.
			 */
			struct vattr vat;
			int error;

			error = VOP_GETATTR(vp, &vat);
			if (error)
				return (error);
			docow = MAP_PREFAULT_PARTIAL;
			object = vnode_pager_reference(vp);
			if (object == NULL && vp->v_type == VREG) {
				kprintf("Warning: cannot mmap vnode %p, no "
					"object\n", vp);
				return(EINVAL);
			}

			/*
			 * If it is a regular file without any references
			 * we do not need to sync it.
			 */
			if (vp->v_type == VREG && vat.va_nlink == 0) {
				flags |= MAP_NOSYNC;
			}
		}
	}

	/*
	 * Deal with the adjusted flags
	 */
	if ((flags & (MAP_ANON|MAP_SHARED)) == 0)
		docow |= MAP_COPY_ON_WRITE;
	if (flags & MAP_NOSYNC)
		docow |= MAP_DISABLE_SYNCER;
	if (flags & MAP_NOCORE)
		docow |= MAP_DISABLE_COREDUMP;

#if defined(VM_PROT_READ_IS_EXEC)
	if (prot & VM_PROT_READ)
		prot |= VM_PROT_EXECUTE;

	if (maxprot & VM_PROT_READ)
		maxprot |= VM_PROT_EXECUTE;
#endif

	/*
	 * This may place the area in its own page directory if (size) is
	 * large enough, otherwise it typically returns its argument.
	 */
	if (fitit) {
		*addr = pmap_addr_hint(object, *addr, size);
	}

	/*
	 * Stack mappings need special attention.
	 *
	 * Mappings that use virtual page tables will default to storing
	 * the page table at offset 0.
	 */
	if (flags & MAP_STACK) {
		rv = vm_map_stack(map, *addr, size, flags,
				  prot, maxprot, docow);
	} else if (flags & MAP_VPAGETABLE) {
		rv = vm_map_find(map, object, foff, addr, size, PAGE_SIZE,
				 fitit, VM_MAPTYPE_VPAGETABLE,
				 prot, maxprot, docow);
	} else {
		rv = vm_map_find(map, object, foff, addr, size, PAGE_SIZE,
				 fitit, VM_MAPTYPE_NORMAL,
				 prot, maxprot, docow);
	}

	if (rv != KERN_SUCCESS) {
		/*
		 * Lose the object reference.  Will destroy the
		 * object if it's an unnamed anonymous mapping
		 * or named anonymous without other references.
		 */
		vm_object_deallocate(object);
		goto out;
	}

	/*
	 * Shared memory is also shared with children.
	 */
	if (flags & (MAP_SHARED|MAP_INHERIT)) {
		rv = vm_map_inherit(map, *addr, *addr + size, VM_INHERIT_SHARE);
		if (rv != KERN_SUCCESS) {
			vm_map_remove(map, *addr, *addr + size);
			goto out;
		}
	}

	/*
	 * Set the access time on the vnode
	 */
	if (vp != NULL)
		vn_mark_atime(vp, td);
out:
	switch (rv) {
	case KERN_SUCCESS:
		return (0);
	case KERN_INVALID_ADDRESS:
	case KERN_NO_SPACE:
		return (ENOMEM);
	case KERN_PROTECTION_FAILURE:
		return (EACCES);
	default:
		return (EINVAL);
	}
}