/*	$NetBSD: uvm_mmap.c,v 1.148 2014/01/25 17:30:45 christos Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.148 2014/01/25 17:30:45 christos Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"
#include "veriexec.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/stat.h>

#if NVERIEXEC > 0
#include <sys/verified_exec.h>
#endif /* NVERIEXEC > 0 */

#if defined(PAX_ASLR) || defined(PAX_MPROTECT)
#include <sys/pax.h>
#endif /* PAX_ASLR || PAX_MPROTECT */

#include <miscfs/specfs/specdev.h>

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

#ifndef COMPAT_ZERODEV
#define COMPAT_ZERODEV(dev)	(0)
#endif

/*
 * range_test: check that the request [addr, addr + size) lies within the
 * user address space and does not wrap.  Returns 0 or an errno; mmap
 * callers (ismmap == true) get the more specific EFBIG/EOVERFLOW codes.
 */
static int
range_test(vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
	vaddr_t eaddr = addr + size;
	int res = 0;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;

#ifdef MD_MMAP_RANGE_TEST
	res = MD_MMAP_RANGE_TEST(addr, eaddr);
#endif

	return res;
}
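
/*
 * Illustrative example (assuming a 32-bit vaddr_t and 4KB pages): a request
 * with addr = 0xfffff000 and size = 0x2000 computes eaddr = 0x1000, i.e.
 * the end wraps below the start, so an mmap caller gets EOVERFLOW while a
 * munmap()/mprotect()-style caller gets EINVAL.
 */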

/*
 * unimplemented VM system calls:
 */

/*
 * sys_sbrk: sbrk system call.
 */

/* ARGSUSED */
int
sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
{
	/* {
		syscallarg(intptr_t) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_sstk: sstk system call.
 */

/* ARGSUSED */
int
sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
{
	/* {
		syscallarg(int) incr;
	} */

	return (ENOSYS);
}

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0, npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		     (entry->next == &map->header ||
		      entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					subyte(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* upper layer */
		uobj = entry->object.uvm_obj;	/* lower layer */

		if (amap != NULL)
			amap_lock(amap);
		if (uobj != NULL)
			mutex_enter(uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the upper layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the lower layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) subyte(vec, pgi);
		}
		if (uobj != NULL)
			mutex_exit(uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return (error);
}
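
/*
 * Illustrative userland usage (a sketch only, assuming a caller with a
 * mapped buffer "buf"): mincore(2) takes one status byte per page, and a
 * byte is set to 1 when the page is resident in either the anon (upper)
 * or object (lower) layer of the mapping:
 *
 *	long pgsz = sysconf(_SC_PAGESIZE);
 *	char vec[4];
 *	if (mincore(buf, 4 * pgsz, vec) == 0 && vec[0] != 0)
 *		printf("first page of buf is resident\n");
 */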

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      mod PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */

int
sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t size, pageoff;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t defaddr;
	struct file *fp = NULL;
	struct vnode *vp;
	void *handle;
	int error;
#ifdef PAX_ASLR
	vaddr_t orig_addr;
#endif /* PAX_ASLR */

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

#ifdef PAX_ASLR
	orig_addr = addr;
#endif /* PAX_ASLR */

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if (flags & MAP_COPY) {
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
#if defined(COMPAT_10) && defined(__i386__)
		/*
		 * Ancient kernels did not obey PROT_EXEC (on i386 at least),
		 * and ld.so did not turn it on.  We take care of this on
		 * amd64 in compat32.
		 */
		prot |= PROT_EXEC;
#endif
	}
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos  -= pageoff;
	size += pageoff;			/* add offset */
	size = (vsize_t)round_page(size);	/* round up */

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {

		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);

		error = range_test(addr, size, true);
		if (error)
			return error;
	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {

		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size);

		if (addr == 0 ||
		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	if ((flags & MAP_ANON) == 0) {
		if ((fp = fd_getfile(fd)) == NULL)
			return (EBADF);
		if (fp->f_type != DTYPE_VNODE) {
			fd_putfile(fd);
			return (ENODEV);		/* only mmap vnodes! */
		}
		vp = fp->f_data;		/* convert to vnode */
		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK) {
			fd_putfile(fd);
			return (ENODEV);  /* only REG/CHR/BLK support mmap */
		}
		if (vp->v_type != VCHR && pos < 0) {
			fd_putfile(fd);
			return (EINVAL);
		}
		if (vp->v_type != VCHR && (off_t)(pos + size) < pos) {
			fd_putfile(fd);
			return (EOVERFLOW);		/* no offset wrapping */
		}

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR
		    && (vp->v_rdev == zerodev || COMPAT_ZERODEV(vp->v_rdev))) {
			flags |= MAP_ANON;
			fd_putfile(fd);
			fp = NULL;
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 *
		 * XXX: how does MAP_ANON fit in the picture?
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to "
			    "%s (pid %d command %s)\n", vp->v_type == VCHR ?
			    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
			    p->p_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}

		/*
		 * now check protection
		 */

		maxprot = VM_PROT_EXECUTE;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= VM_PROT_READ;
		else if (prot & PROT_READ) {
			fd_putfile(fd);
			return (EACCES);
		}

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * if the file is writable, only add PROT_WRITE to
			 * maxprot if the file is not immutable, append-only.
			 * otherwise, if we have asked for PROT_WRITE, return
			 * EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				vn_lock(vp, LK_SHARED | LK_RETRY);
				error = VOP_GETATTR(vp, &va, l->l_cred);
				VOP_UNLOCK(vp);
				if (error) {
					fd_putfile(fd);
					return (error);
				}
				if ((va.va_flags &
				    (SF_SNAPSHOT|IMMUTABLE|APPEND)) == 0)
					maxprot |= VM_PROT_WRITE;
				else if (prot & PROT_WRITE) {
					fd_putfile(fd);
					return (EPERM);
				}
			}
			else if (prot & PROT_WRITE) {
				fd_putfile(fd);
				return (EACCES);
			}
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= VM_PROT_WRITE;
		}
		handle = vp;

	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return (EINVAL);

 is_anon:		/* label for SunOS style /dev/zero */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

#if NVERIEXEC > 0
	if (handle != NULL) {
		/*
		 * Check if the file can be executed indirectly.
		 *
		 * XXX: This gives false warnings about "Incorrect access type"
		 * XXX: if the mapping is not executable. Harmless, but will be
		 * XXX: fixed as part of other changes.
		 */
		if (veriexec_verify(l, handle, "(mmap)", VERIEXEC_INDIRECT,
		    NULL)) {
			/*
			 * Don't allow executable mappings if we can't
			 * indirectly execute the file.
			 */
			if (prot & VM_PROT_EXECUTE) {
				if (fp != NULL)
					fd_putfile(fd);
				return (EPERM);
			}

			/*
			 * Strip the executable bit from 'maxprot' to make sure
			 * it can't be made executable later.
			 */
			maxprot &= ~VM_PROT_EXECUTE;
		}
	}
#endif /* NVERIEXEC > 0 */

#ifdef PAX_MPROTECT
	pax_mprotect(l, &prot, &maxprot);
#endif /* PAX_MPROTECT */

#ifdef PAX_ASLR
	pax_aslr(l, &addr, orig_addr, flags);
#endif /* PAX_ASLR */

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

	if (fp != NULL)
		fd_putfile(fd);

	return (error);
}

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	int error, rv, flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return (EINVAL);
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;
	int error;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	if (size == 0)
		return (0);

	error = range_test(addr, size, false);
	if (error)
		return error;

	map = &p->p_vmspace->vm_map;

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	vm_map_lock(map);
#if 0
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
	    false);
	return error;
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary and adjust the size accordingly.
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit);
	return error;
}

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		error = uvm_map_willneed(&p->p_vmspace->vm_map,
		    addr, addr + size);
		break;

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return (EINVAL);

	default:
		return (EINVAL);
	}

	return error;
}

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */

	pageoff = (addr & PAGE_MASK);
	addr -= pageoff;
	size += pageoff;
	size = (vsize_t)round_page(size);

	error = range_test(addr, size, false);
	if (error)
		return error;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - handle is a vnode pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */

int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, void *handle, voff_t foff, vsize_t locklimit)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	vaddr_t align = 0;
	int error;
	int advice = UVM_ADV_NORMAL;
	uvm_flag_t uvmflag = 0;
	bool needwritemap;

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		(void) uvm_unmap(map, *addr, *addr + size);
	}

	/*
	 * Try to see if any requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that the
	 * alignment is at least a page-sized quantity.  If the request
	 * was for a fixed mapping, make sure the supplied address adheres
	 * to the requested alignment.
	 */
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return(EINVAL);
		align = 1L << align;
		if (align < PAGE_SIZE)
			return(EINVAL);
		if (align >= vm_map_max(map))
			return(ENOMEM);
		if (flags & MAP_FIXED) {
			if ((*addr & (align-1)) != 0)
				return(EINVAL);
			align = 0;
		}
	}

	/*
	 * check resource limits
	 */

	if (!VM_MAP_IS_KERNEL(map) &&
	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
		return ENOMEM;

	/*
	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(handle == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		uobj = NULL;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(handle != NULL);
		vp = (struct vnode *)handle;

		/*
		 * Don't allow mmap for EXEC if the file system
		 * is mounted NOEXEC.
		 */
		if ((prot & PROT_EXEC) != 0 &&
		    (vp->v_mount->mnt_flag & MNT_NOEXEC) != 0)
			return (EACCES);

		if (vp->v_type != VCHR) {
			error = VOP_MMAP(vp, prot, curlwp->l_cred);
			if (error) {
				return error;
			}
			vref(vp);
			uobj = &vp->v_uobj;

			/*
			 * If the vnode is being mapped with PROT_EXEC,
			 * then mark it as text.
			 */
			if (prot & PROT_EXEC) {
				vn_markexec(vp);
			}
		} else {
			int i = maxprot;

			/*
			 * XXX Some devices don't like to be mapped with
			 * XXX PROT_EXEC or PROT_WRITE, but we don't really
			 * XXX have a better way of handling this, right now
			 */
			do {
				uobj = udv_attach((void *) &vp->v_rdev,
				    (flags & MAP_SHARED) ? i :
				    (i & ~VM_PROT_WRITE), foff, size);
				i--;
			} while ((uobj == NULL) && (i > 0));
			if (uobj == NULL)
				return EINVAL;
			advice = UVM_ADV_RANDOM;
		}
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}

		/*
		 * Set vnode flags to indicate the new kinds of mapping.
		 * We take the vnode lock in exclusive mode here to serialize
		 * with direct I/O.
		 *
		 * Safe to check for these flag values without a lock, as
		 * long as a reference to the vnode is held.
		 */
		needwritemap = (vp->v_iflag & VI_WRMAP) == 0 &&
		    (flags & MAP_SHARED) != 0 &&
		    (maxprot & VM_PROT_WRITE) != 0;
		if ((vp->v_vflag & VV_MAPPED) == 0 || needwritemap) {
			vn_lock(vp, LK_EXCLUSIVE | LK_RETRY);
			vp->v_vflag |= VV_MAPPED;
			if (needwritemap) {
				mutex_enter(vp->v_interlock);
				vp->v_iflag |= VI_WRMAP;
				mutex_exit(vp->v_interlock);
			}
			VOP_UNLOCK(vp);
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
	    advice, uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return (0);
	}
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		vm_map_lock(map);
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
		    false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return (0);
	}
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	if (p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN)
		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
	else
		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
}