/*	$OpenBSD: uvm_mmap.c,v 1.90 2012/04/22 05:43:14 guenther Exp $	*/
/*	$NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed by the Charles D. Cranor,
 *      Washington University, University of California, Berkeley and
 *      its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c    8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/mount.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/vnode.h>
#include <sys/conf.h>
#include <sys/stat.h>
#include <sys/specdev.h>

#include <machine/exec.h>	/* for __LDPGSZ */

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>
#include <uvm/uvm_vnode.h>

/*
 * Page align addr and size, returning EINVAL on wraparound.
 */
#define ALIGN_ADDR(addr, size, pageoff)	do {				\
	pageoff = (addr & PAGE_MASK);					\
	if (pageoff != 0) {						\
		if (size > SIZE_MAX - pageoff)				\
			return (EINVAL);	/* wraparound */	\
		addr -= pageoff;					\
		size += pageoff;					\
	}								\
	if (size != 0) {						\
		size = (vsize_t)round_page(size);			\
		if (size == 0)						\
			return (EINVAL);	/* wraparound */	\
	}								\
} while (0)
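/*
 * Worked example of ALIGN_ADDR() (illustrative only, assuming 4 KB pages,
 * i.e. PAGE_MASK == 0xfff): for addr == 0x1234 and size == 0x100,
 * pageoff becomes 0x234, addr is truncated to 0x1000, size grows to
 * 0x334 and is then rounded up to 0x1000, so the aligned request covers
 * exactly the one page containing the original [0x1234, 0x1334) range.
 */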

/*
 * sys_mquery: provide mapping hints to applications that do fixed mappings
 *
 * flags: 0 or MAP_FIXED (MAP_FIXED - means that we insist on this addr and
 *	don't care about PMAP_PREFER or such)
 * addr: hint where we'd like to place the mapping.
 * size: size of the mapping
 * fd: fd of the file we want to map
 * off: offset within the file
 */

int
sys_mquery(struct proc *p, void *v, register_t *retval)
{
	struct sys_mquery_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */ *uap = v;
	struct file *fp;
	struct uvm_object *uobj;
	voff_t uoff;
	int error;
	vaddr_t vaddr;
	int flags = 0;
	vsize_t size;
	vm_prot_t prot;
	int fd;

	vaddr = (vaddr_t) SCARG(uap, addr);
	prot = SCARG(uap, prot);
	size = (vsize_t) SCARG(uap, len);
	fd = SCARG(uap, fd);

	if ((prot & VM_PROT_ALL) != prot)
		return (EINVAL);

	if (SCARG(uap, flags) & MAP_FIXED)
		flags |= UVM_FLAG_FIXED;

	if (fd >= 0) {
		if ((error = getvnode(p->p_fd, fd, &fp)) != 0)
			return (error);
		uobj = &((struct vnode *)fp->f_data)->v_uvm.u_obj;
		uoff = SCARG(uap, pos);
	} else {
		fp = NULL;
		uobj = NULL;
		uoff = UVM_UNKNOWN_OFFSET;
	}

	if (vaddr == 0)
		vaddr = uvm_map_hint(p->p_vmspace, prot);

	error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff,
	    flags);
	if (error == 0)
		*retval = (register_t)(vaddr);

	if (fp != NULL)
		FRELE(fp, p);
	return (error);
}
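
/*
 * Illustrative userland use of mquery(2) (a sketch, not part of this file):
 * probe for a free region near a hint without MAP_FIXED, then commit to the
 * returned address with a MAP_FIXED mmap().  See the mquery(2) manual page
 * for the authoritative prototype and error conventions.
 *
 *	void *want = (void *)0x40000000, *where;
 *	where = mquery(want, len, PROT_READ, 0, fd, off);
 *	if (where != MAP_FAILED)
 *		where = mmap(where, len, PROT_READ, MAP_FIXED | MAP_SHARED,
 *		    fd, off);
 */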

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct proc *p, void *v, register_t *retval)
{
	struct sys_mincore_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */ *uap = v;
	vm_page_t m;
	char *vec, *pgi, *pgs;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	vm_map_entry_t entry, next;
	vaddr_t start, end, lim;
	vm_map_t map;
	vsize_t len, npgs;
	int error = 0;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return (EINVAL);
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return (EINVAL);

	npgs = len >> PAGE_SHIFT;

	/*
	 * < art> Anyone trying to mincore more than 4GB of address space is
	 *     clearly insane.
	 */
	if (npgs >= (0xffffffff >> PAGE_SHIFT))
		return (E2BIG);
	pgs = malloc(sizeof(*pgs) * npgs, M_TEMP, M_WAITOK | M_CANFAIL);
	if (pgs == NULL)
		return (ENOMEM);
	pgi = pgs;

	/*
	 * Lock down vec, so our returned status isn't outdated by
	 * storing the status byte for a page.
	 */
	if ((error = uvm_vslock(p, vec, npgs, VM_PROT_WRITE)) != 0) {
		free(pgs, M_TEMP);
		return (error);
	}

	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == FALSE) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	    entry != NULL && entry->start < end;
	    entry = RB_NEXT(uvm_map_addr, &map->addr, entry)) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		next = RB_NEXT(uvm_map_addr, &map->addr, entry);
		if (entry->end < end &&
		    (next == NULL ||
		    next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */
		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (entry->object.uvm_obj->pgops->pgo_fault != NULL) {
				for (/* nothing */; start < lim;
				    start += PAGE_SIZE, pgi++)
					*pgi = 1;
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* top layer */
		uobj = entry->object.uvm_obj;	/* bottom layer */

		if (uobj != NULL)
			simple_lock(&uobj->vmobjlock);

		for (/* nothing */; start < lim; start += PAGE_SIZE, pgi++) {
			*pgi = 0;
			if (amap != NULL) {
				/* Check the top layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {
					/*
					 * Anon has the page for this entry
					 * offset.
					 */
					*pgi = 1;
				}
			}

			if (uobj != NULL && *pgi == 0) {
				/* Check the bottom layer. */
				m = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (m != NULL) {
					/*
					 * Object has the page for this entry
					 * offset.
					 */
					*pgi = 1;
				}
			}
		}

		if (uobj != NULL)
			simple_unlock(&uobj->vmobjlock);
	}

out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p, SCARG(uap, vec), npgs);
	/* now the map is unlocked we can copyout without fear. */
	if (error == 0)
		copyout(pgs, vec, npgs * sizeof(char));
	free(pgs, M_TEMP);
	return (error);
}
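
/*
 * Illustrative userland view of mincore(2) (a sketch, not part of this
 * file): the kernel fills one status byte per page of the range, non-zero
 * meaning the page is resident (in core).
 *
 *	char vec[npages];
 *	if (mincore(base, npages * pagesize, vec) == 0 && vec[i])
 *		page i of the range is resident
 */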

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      modulo PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */

int
sys_mmap(struct proc *p, void *v, register_t *retval)
{
	struct sys_mmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */ *uap = v;
	vaddr_t addr;
	struct vattr va;
	off_t pos;
	vsize_t size, pageoff;
	vm_prot_t prot, maxprot;
	int flags, fd;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct filedesc *fdp = p->p_fd;
	struct file *fp = NULL;
	struct vnode *vp;
	caddr_t handle;
	int error;

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t) SCARG(uap, addr);
	size = (vsize_t) SCARG(uap, len);
	prot = SCARG(uap, prot);
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

	/*
	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
	 * validate the flags.
	 */
	if ((prot & VM_PROT_ALL) != prot)
		return (EINVAL);
	if ((flags & MAP_FLAGMASK) != flags)
		return (EINVAL);
	if (flags & MAP_COPY)
		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return (EINVAL);
	if (size == 0)
		return (EINVAL);

	/*
	 * align file position and save offset.  adjust size.
	 */
	ALIGN_ADDR(pos, size, pageoff);

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */

	if (flags & MAP_FIXED) {

		/* adjust address by the same amount as we did the offset */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return (EINVAL);	/* not page aligned */

		if (addr > SIZE_MAX - size)
			return (EINVAL);	/* no wrapping! */
		if (VM_MAXUSER_ADDRESS > 0 &&
		    (addr + size) > VM_MAXUSER_ADDRESS)
			return (EINVAL);
		if (vm_min_address > 0 && addr < vm_min_address)
			return (EINVAL);

	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */
	if ((flags & MAP_ANON) == 0) {

		if ((fp = fd_getfile(fdp, fd)) == NULL)
			return (EBADF);

		FREF(fp);

		if (fp->f_type != DTYPE_VNODE) {
			error = ENODEV;		/* only mmap vnodes! */
			goto out;
		}
		vp = (struct vnode *)fp->f_data;	/* convert to vnode */

		if (vp->v_type != VREG && vp->v_type != VCHR &&
		    vp->v_type != VBLK) {
			error = ENODEV;	/* only REG/CHR/BLK support mmap */
			goto out;
		}

		if (vp->v_type == VREG && (pos + size) < pos) {
			error = EINVAL;		/* no offset wrapping */
			goto out;
		}

		/* special case: catch SunOS style /dev/zero */
		if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) {
			flags |= MAP_ANON;
			FRELE(fp, p);
			fp = NULL;
			goto is_anon;
		}

		/*
		 * Old programs may not select a specific sharing type, so
		 * default to an appropriate one.
		 *
		 * XXX: how does MAP_ANON fit in the picture?
		 */
		if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) {
#if defined(DEBUG)
			printf("WARNING: defaulted mmap() share type to "
			    "%s (pid %d comm %s)\n", vp->v_type == VCHR ?
			    "MAP_SHARED" : "MAP_PRIVATE", p->p_pid,
			    p->p_comm);
#endif
			if (vp->v_type == VCHR)
				flags |= MAP_SHARED;	/* for a device */
			else
				flags |= MAP_PRIVATE;	/* for a file */
		}

		/*
		 * MAP_PRIVATE device mappings don't make sense (and aren't
		 * supported anyway).  However, some programs rely on this,
		 * so just change it to MAP_SHARED.
		 */
		if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) {
			flags = (flags & ~MAP_PRIVATE) | MAP_SHARED;
		}
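
		/*
		 * Summary of the protection checks below (descriptive
		 * comment only): maxprot always allows EXEC; READ is
		 * allowed only if the descriptor was opened for reading;
		 * WRITE is always allowed for MAP_PRIVATE mappings
		 * (copy-on-write never modifies the file), while for
		 * MAP_SHARED it additionally requires a writable
		 * descriptor and a file that is neither immutable nor
		 * append-only.
		 */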

		/*
		 * now check protection
		 */

		maxprot = VM_PROT_EXECUTE;

		/* check read access */
		if (fp->f_flag & FREAD)
			maxprot |= VM_PROT_READ;
		else if (prot & PROT_READ) {
			error = EACCES;
			goto out;
		}

		/* check write access, shared case first */
		if (flags & MAP_SHARED) {
			/*
			 * if the file is writable, only add PROT_WRITE to
			 * maxprot if the file is not immutable, append-only.
			 * otherwise, if we have asked for PROT_WRITE, return
			 * EPERM.
			 */
			if (fp->f_flag & FWRITE) {
				if ((error =
				    VOP_GETATTR(vp, &va, p->p_ucred, p)))
					goto out;
				if ((va.va_flags & (IMMUTABLE|APPEND)) == 0)
					maxprot |= VM_PROT_WRITE;
				else if (prot & PROT_WRITE) {
					error = EPERM;
					goto out;
				}
			} else if (prot & PROT_WRITE) {
				error = EACCES;
				goto out;
			}
		} else {
			/* MAP_PRIVATE mappings can always be written to */
			maxprot |= VM_PROT_WRITE;
		}

		/*
		 * set handle to vnode
		 */

		handle = (caddr_t)vp;

	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1) {
			error = EINVAL;
			goto out;
		}

is_anon:	/* label for SunOS style /dev/zero */
		handle = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

	if ((flags & MAP_ANON) != 0 ||
	    ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) {
		if (size >
		    (p->p_rlimit[RLIMIT_DATA].rlim_cur - ptoa(p->p_vmspace->vm_dused))) {
			error = ENOMEM;
			goto out;
		}
	}

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, handle, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur, p);

	if (error == 0)
		/* remember to add offset */
		*retval = (register_t)(addr + pageoff);

out:
	if (fp)
		FRELE(fp, p);
	return (error);
}
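
/*
 * Worked example of the unaligned-offset handling above (illustrative
 * only, assuming 4 KB pages): mmap(NULL, 0x100, prot, MAP_SHARED, fd,
 * 0x1234) has ALIGN_ADDR() truncate pos to 0x1000 and grow size to 0x334,
 * which round_page() turns into 0x1000; uvm_mmap() then maps one page of
 * the file starting at offset 0x1000 and the caller is returned the
 * mapping's base address plus the saved pageoff of 0x234.
 */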

/*
 * sys_msync: the msync system call (a front-end for flush)
 */

int
sys_msync(struct proc *p, void *v, register_t *retval)
{
	struct sys_msync_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_map_t map;
	int flags, uvmflags;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return (EINVAL);
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

	/*
	 * get map
	 */

	map = &p->p_vmspace->vm_map;

	/*
	 * translate MS_ flags into PGO_ flags
	 */
	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;
	else
		uvmflags |= PGO_SYNCIO;	/* XXXCDC: force sync for now! */

	return (uvm_map_clean(map, addr, addr+size, uvmflags));
}

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct proc *p, void *v, register_t *retval)
{
	struct sys_munmap_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_map_t map;
	vaddr_t vm_min_address = VM_MIN_ADDRESS;
	struct uvm_map_deadq dead_entries;

	/*
	 * get syscall args...
	 */

	addr = (vaddr_t) SCARG(uap, addr);
	size = (vsize_t) SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);

	/*
	 * Check for illegal addresses.  Watch out for address wrap...
	 * Note that VM_*_ADDRESS are not constants due to casts (argh).
	 */
	if (addr > SIZE_MAX - size)
		return (EINVAL);
	if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS)
		return (EINVAL);
	if (vm_min_address > 0 && addr < vm_min_address)
		return (EINVAL);
	map = &p->p_vmspace->vm_map;


	vm_map_lock(map);	/* lock map so we can checkprot */

	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */

	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return (EINVAL);
	}

	/*
	 * doit!
	 */
	TAILQ_INIT(&dead_entries);
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, FALSE, TRUE);

	vm_map_unlock(map);	/* and unlock */

	uvm_unmap_detach(&dead_entries, 0);

	return (0);
}

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct proc *p, void *v, register_t *retval)
{
	struct sys_mprotect_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_prot_t prot;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot);

	if ((prot & VM_PROT_ALL) != prot)
		return (EINVAL);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

	return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size,
	    prot, FALSE));
}

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct proc *p, void *v, register_t *retval)
{
	struct sys_minherit_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) inherit;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	vm_inherit_t inherit;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

	return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size,
	    inherit));
}
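
/*
 * Illustrative userland use of minherit(2) (a sketch, not part of this
 * file): the inherit value chooses what fork(2) does with the range in
 * the child, e.g. MAP_INHERIT_NONE leaves it unmapped there, which can
 * be useful for buffers holding secrets.
 *
 *	if (minherit(buf, len, MAP_INHERIT_NONE) == -1)
 *		err(1, "minherit");
 */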

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct proc *p, void *v, register_t *retval)
{
	struct sys_madvise_args /* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr,
		    addr + size, advice);
		break;

	case MADV_WILLNEED:
		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		/*
		 * XXX IMPLEMENT ME.
		 * Should invent a "weak" mode for uvm_fault()
		 * which would only do the PGO_LOCKED pgo_get().
		 */
		return (0);

	case MADV_DONTNEED:
		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */
		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:
		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */
		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:
		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */
		return (EINVAL);

	default:
		return (EINVAL);
	}

	return (error);
}

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct proc *p, void *v, register_t *retval)
{
	struct sys_mlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */
	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return (EAGAIN);

#ifdef pmap_wired_count
	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return (EAGAIN);
#else
	if ((error = suser(p, 0)) != 0)
		return (error);
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE,
	    0);
	return (error == 0 ? 0 : ENOMEM);
}

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct proc *p, void *v, register_t *retval)
{
	struct sys_munlock_args /* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */ *uap = v;
	vaddr_t addr;
	vsize_t size, pageoff;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	/*
	 * align the address to a page boundary, and adjust the size accordingly
	 */
	ALIGN_ADDR(addr, size, pageoff);
	if (addr > SIZE_MAX - size)
		return (EINVAL);		/* disallow wrap-around. */

#ifndef pmap_wired_count
	if ((error = suser(p, 0)) != 0)
		return (error);
#endif

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE,
	    0);
	return (error == 0 ? 0 : ENOMEM);
}
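
/*
 * Descriptive note on the checks above: on pmaps that define
 * pmap_wired_count, mlock(2)/munlock(2) are available to unprivileged
 * processes and the amount of wired memory is bounded by RLIMIT_MEMLOCK;
 * on pmaps without it, superuser privilege is required instead (the
 * suser() checks).
 */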

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct proc *p, void *v, register_t *retval)
{
	struct sys_mlockall_args /* {
		syscallarg(int) flags;
	} */ *uap = v;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 ||
	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return (EINVAL);

#ifndef pmap_wired_count
	if ((error = suser(p, 0)) != 0)
		return (error);
#endif

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	if (error != 0 && error != ENOMEM)
		return (EAGAIN);
	return (error);
}

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct proc *p, void *v, register_t *retval)
{

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return (0);
}
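
/*
 * Note (descriptive only): mlockall(MCL_CURRENT) wires what is currently
 * mapped, while MCL_FUTURE sets VM_MAP_WIREFUTURE on the map so that
 * uvm_mmap() below wires each new mapping as it is established (see the
 * POSIX 1003.1b block near the end of uvm_mmap()).
 */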

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap, exec, and sysv shm
 * - handle is a vnode pointer or NULL for MAP_ANON (XXX: not true,
 *	sysv shm uses "named anonymous memory")
 * - caller must page-align the file offset
 */

int
uvm_mmap(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, caddr_t handle, voff_t foff,
    vsize_t locklimit, struct proc *p)
{
	struct uvm_object *uobj;
	struct vnode *vp;
	int error;
	int advice = UVM_ADV_NORMAL;
	uvm_flag_t uvmflag = 0;
	vsize_t align = 0;	/* userland page size */

	/*
	 * check params
	 */

	if (size == 0)
		return(0);
	if (foff & PAGE_MASK)
		return(EINVAL);
	if ((prot & maxprot) != prot)
		return(EINVAL);

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment and zap old mappings.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);	/* round */
	} else {
		if (*addr & PAGE_MASK)
			return(EINVAL);
		uvmflag |= UVM_FLAG_FIXED;
		uvm_unmap(map, *addr, *addr + size);	/* zap! */
	}

	/*
	 * handle anon vs. non-anon mappings.  for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ)
			align = __LDPGSZ;
		foff = UVM_UNKNOWN_OFFSET;
		uobj = NULL;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {

		vp = (struct vnode *) handle;	/* get vnode */
		if (vp->v_type != VCHR) {
			uobj = uvn_attach((void *) vp, (flags & MAP_SHARED) ?
			    maxprot : (maxprot & ~VM_PROT_WRITE));

#ifndef UBC
			/*
			 * XXXCDC: hack from old code
			 * don't allow vnodes which have been mapped
			 * shared-writeable to persist [forces them to be
			 * flushed out when last reference goes].
			 * XXXCDC: interesting side effect: avoids a bug.
			 * note that in WRITE [ufs_readwrite.c] that we
			 * allocate buffer, uncache, and then do the write.
			 * the problem with this is that if the uncache causes
			 * VM data to be flushed to the same area of the file
			 * we are writing to... in that case we've got the
			 * buffer locked and our process goes to sleep forever.
			 *
			 * XXXCDC: checking maxprot protects us from the
			 * "persistbug" program but this is not a long term
			 * solution.
			 *
			 * XXXCDC: we don't bother calling uncache with the vp
			 * VOP_LOCKed since we know that we are already
			 * holding a valid reference to the uvn (from the
			 * uvn_attach above), and thus it is impossible for
			 * the uncache to kill the uvn and trigger I/O.
			 */
			if (flags & MAP_SHARED) {
				if ((prot & VM_PROT_WRITE) ||
				    (maxprot & VM_PROT_WRITE)) {
					uvm_vnp_uncache(vp);
				}
			}
#else
			/* XXX for now, attach doesn't gain a ref */
			vref(vp);
#endif
		} else {
			uobj = udv_attach((void *) &vp->v_rdev,
			    (flags & MAP_SHARED) ? maxprot :
			    (maxprot & ~VM_PROT_WRITE), foff, size);
			/*
			 * XXX Some devices don't like to be mapped with
			 * XXX PROT_EXEC, but we don't really have a
			 * XXX better way of handling this, right now
			 */
			if (uobj == NULL && (prot & PROT_EXEC) == 0) {
				maxprot &= ~VM_PROT_EXECUTE;
				uobj = udv_attach((void *) &vp->v_rdev,
				    (flags & MAP_SHARED) ? maxprot :
				    (maxprot & ~VM_PROT_WRITE), foff, size);
			}
			advice = UVM_ADV_RANDOM;
		}

		if (uobj == NULL)
			return((vp->v_type == VREG) ? ENOMEM : EINVAL);

		if ((flags & MAP_SHARED) == 0)
			uvmflag |= UVM_FLAG_COPYONW;
	}

	/*
	 * set up mapping flags
	 */

	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
	    advice, uvmflag);

	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);

	if (error == 0) {
		/*
		 * POSIX 1003.1b -- if our address space was configured
		 * to lock all future mappings, wire the one we just made.
		 */
		if (prot == VM_PROT_NONE) {
			/*
			 * No more work to do in this case.
			 */
			return (0);
		}

		vm_map_lock(map);

		if (map->flags & VM_MAP_WIREFUTURE) {
			if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax
#ifdef pmap_wired_count
			    || (locklimit != 0 && (size +
			    ptoa(pmap_wired_count(vm_map_pmap(map)))) >
			    locklimit)
#endif
			) {
				error = ENOMEM;
				vm_map_unlock(map);
				/* unmap the region! */
				uvm_unmap(map, *addr, *addr + size);
				goto bad;
			}
			/*
			 * uvm_map_pageable() always returns the map
			 * unlocked.
			 */
			error = uvm_map_pageable(map, *addr, *addr + size,
			    FALSE, UVM_LK_ENTER);
			if (error != 0) {
				/* unmap the region! */
				uvm_unmap(map, *addr, *addr + size);
				goto bad;
			}
			return (0);
		}

		vm_map_unlock(map);

		return (0);
	}

	/*
	 * errors: first detach from the uobj, if any.
	 */

	if (uobj)
		uobj->pgops->pgo_detach(uobj);

bad:
	return (error);
}