1 /* $OpenBSD: uvm_mmap.c,v 1.193 2024/12/14 12:07:38 mvs Exp $ */ 2 /* $NetBSD: uvm_mmap.c,v 1.49 2001/02/18 21:19:08 chs Exp $ */ 3 4 /* 5 * Copyright (c) 1997 Charles D. Cranor and Washington University. 6 * Copyright (c) 1991, 1993 The Regents of the University of California. 7 * Copyright (c) 1988 University of Utah. 8 * 9 * All rights reserved. 10 * 11 * This code is derived from software contributed to Berkeley by 12 * the Systems Programming Group of the University of Utah Computer 13 * Science Department. 14 * 15 * Redistribution and use in source and binary forms, with or without 16 * modification, are permitted provided that the following conditions 17 * are met: 18 * 1. Redistributions of source code must retain the above copyright 19 * notice, this list of conditions and the following disclaimer. 20 * 2. Redistributions in binary form must reproduce the above copyright 21 * notice, this list of conditions and the following disclaimer in the 22 * documentation and/or other materials provided with the distribution. 23 * 3. Neither the name of the University nor the names of its contributors 24 * may be used to endorse or promote products derived from this software 25 * without specific prior written permission. 26 * 27 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND 28 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 29 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 30 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE 31 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 32 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS 33 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) 34 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 35 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 36 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF 37 * SUCH DAMAGE. 38 * 39 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$ 40 * @(#)vm_mmap.c 8.5 (Berkeley) 5/19/94 41 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp 42 */ 43 44 /* 45 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap 46 * function. 47 */ 48 #include <sys/param.h> 49 #include <sys/systm.h> 50 #include <sys/fcntl.h> 51 #include <sys/file.h> 52 #include <sys/filedesc.h> 53 #include <sys/resourcevar.h> 54 #include <sys/mman.h> 55 #include <sys/mount.h> 56 #include <sys/proc.h> 57 #include <sys/malloc.h> 58 #include <sys/vnode.h> 59 #include <sys/conf.h> 60 #include <sys/signalvar.h> 61 #include <sys/syslog.h> 62 #include <sys/stat.h> 63 #include <sys/specdev.h> 64 #include <sys/stdint.h> 65 #include <sys/pledge.h> 66 #include <sys/unistd.h> /* for KBIND* */ 67 #include <sys/user.h> 68 69 #include <machine/exec.h> /* for __LDPGSZ */ 70 71 #include <sys/syscall.h> 72 #include <sys/syscallargs.h> 73 74 #include <uvm/uvm.h> 75 #include <uvm/uvm_device.h> 76 #include <uvm/uvm_vnode.h> 77 78 /* 79 * Locks used to protect data: 80 * a atomic 81 */ 82 83 int uvm_mmapanon(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int, 84 vsize_t, struct proc *); 85 int uvm_mmapfile(vm_map_t, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t, int, 86 struct vnode *, voff_t, vsize_t, struct proc *); 87 88 89 /* 90 * Page align addr and size, returning EINVAL on wraparound. 91 */ 92 #define ALIGN_ADDR(addr, size, pageoff) do { \ 93 pageoff = (addr & PAGE_MASK); \ 94 if (pageoff != 0) { \ 95 if (size > SIZE_MAX - pageoff) \ 96 return EINVAL; /* wraparound */ \ 97 addr -= pageoff; \ 98 size += pageoff; \ 99 } \ 100 if (size != 0) { \ 101 size = (vsize_t)round_page(size); \ 102 if (size == 0) \ 103 return EINVAL; /* wraparound */ \ 104 } \ 105 } while (0) 106 107 /* 108 * sys_mquery: provide mapping hints to applications that do fixed mappings 109 * 110 * flags: 0 or MAP_FIXED (MAP_FIXED - means that we insist on this addr and 111 * don't care about PMAP_PREFER or such) 112 * addr: hint where we'd like to place the mapping. 113 * size: size of the mapping 114 * fd: fd of the file we want to map 115 * off: offset within the file 116 */ 117 int 118 sys_mquery(struct proc *p, void *v, register_t *retval) 119 { 120 struct sys_mquery_args /* { 121 syscallarg(void *) addr; 122 syscallarg(size_t) len; 123 syscallarg(int) prot; 124 syscallarg(int) flags; 125 syscallarg(int) fd; 126 syscallarg(off_t) pos; 127 } */ *uap = v; 128 struct file *fp; 129 voff_t uoff; 130 int error; 131 vaddr_t vaddr; 132 int flags = 0; 133 vsize_t size; 134 vm_prot_t prot; 135 int fd; 136 137 vaddr = (vaddr_t) SCARG(uap, addr); 138 prot = SCARG(uap, prot); 139 size = (vsize_t) SCARG(uap, len); 140 fd = SCARG(uap, fd); 141 142 if ((prot & PROT_MASK) != prot) 143 return EINVAL; 144 145 if (SCARG(uap, flags) & MAP_FIXED) 146 flags |= UVM_FLAG_FIXED; 147 148 if (fd >= 0) { 149 if ((error = getvnode(p, fd, &fp)) != 0) 150 return error; 151 uoff = SCARG(uap, pos); 152 } else { 153 fp = NULL; 154 uoff = UVM_UNKNOWN_OFFSET; 155 } 156 157 if (vaddr == 0) 158 vaddr = uvm_map_hint(p->p_vmspace, prot, VM_MIN_ADDRESS, 159 VM_MAXUSER_ADDRESS); 160 161 error = uvm_map_mquery(&p->p_vmspace->vm_map, &vaddr, size, uoff, 162 flags); 163 if (error == 0) 164 *retval = (register_t)(vaddr); 165 166 if (fp != NULL) 167 FRELE(fp, p); 168 return error; 169 } 170 171 int uvm_wxabort; /* [a] */ 172 173 /* 174 * W^X violations are only allowed on permitted filesystems. 175 */ 176 static inline int 177 uvm_wxcheck(struct proc *p, char *call) 178 { 179 struct process *pr = p->p_p; 180 int wxallowed = (pr->ps_textvp->v_mount && 181 (pr->ps_textvp->v_mount->mnt_flag & MNT_WXALLOWED)); 182 183 if (wxallowed && (pr->ps_flags & PS_WXNEEDED)) 184 return 0; 185 186 if (atomic_load_int(&uvm_wxabort)) { 187 KERNEL_LOCK(); 188 /* Report W^X failures */ 189 if (pr->ps_wxcounter++ == 0) 190 log(LOG_NOTICE, "%s(%d): %s W^X violation\n", 191 pr->ps_comm, pr->ps_pid, call); 192 /* Send uncatchable SIGABRT for coredump */ 193 sigexit(p, SIGABRT); 194 KERNEL_UNLOCK(); 195 } 196 197 return ENOTSUP; 198 } 199 200 /* 201 * sys_mmap: mmap system call. 202 * 203 * => file offset and address may not be page aligned 204 * - if MAP_FIXED, offset and address must have remainder mod PAGE_SIZE 205 * - if address isn't page aligned the mapping starts at trunc_page(addr) 206 * and the return value is adjusted up by the page offset. 207 */ 208 int 209 sys_mmap(struct proc *p, void *v, register_t *retval) 210 { 211 struct sys_mmap_args /* { 212 syscallarg(void *) addr; 213 syscallarg(size_t) len; 214 syscallarg(int) prot; 215 syscallarg(int) flags; 216 syscallarg(int) fd; 217 syscallarg(off_t) pos; 218 } */ *uap = v; 219 vaddr_t addr; 220 struct vattr va; 221 off_t pos; 222 vsize_t limit, pageoff, size; 223 vm_prot_t prot, maxprot; 224 int flags, fd; 225 vaddr_t vm_min_address = VM_MIN_ADDRESS; 226 struct filedesc *fdp = p->p_fd; 227 struct file *fp = NULL; 228 struct vnode *vp; 229 int error; 230 231 /* first, extract syscall args from the uap. */ 232 addr = (vaddr_t) SCARG(uap, addr); 233 size = (vsize_t) SCARG(uap, len); 234 prot = SCARG(uap, prot); 235 flags = SCARG(uap, flags); 236 fd = SCARG(uap, fd); 237 pos = SCARG(uap, pos); 238 239 /* 240 * Validate the flags. 241 */ 242 if ((prot & PROT_MASK) != prot) 243 return EINVAL; 244 if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) && 245 (error = uvm_wxcheck(p, "mmap"))) 246 return error; 247 248 if ((flags & MAP_FLAGMASK) != flags) 249 return EINVAL; 250 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE)) 251 return EINVAL; 252 if ((flags & (MAP_FIXED|__MAP_NOREPLACE)) == __MAP_NOREPLACE) 253 return EINVAL; 254 if (flags & MAP_STACK) { 255 if ((flags & (MAP_ANON|MAP_PRIVATE)) != (MAP_ANON|MAP_PRIVATE)) 256 return EINVAL; 257 if (flags & ~(MAP_STACK|MAP_FIXED|MAP_ANON|MAP_PRIVATE)) 258 return EINVAL; 259 if (pos != 0) 260 return EINVAL; 261 if ((prot & (PROT_READ|PROT_WRITE)) != (PROT_READ|PROT_WRITE)) 262 return EINVAL; 263 } 264 if (size == 0) 265 return EINVAL; 266 267 error = pledge_protexec(p, prot); 268 if (error) 269 return error; 270 271 /* align file position and save offset. adjust size. */ 272 ALIGN_ADDR(pos, size, pageoff); 273 274 /* now check (MAP_FIXED) or get (!MAP_FIXED) the "addr" */ 275 if (flags & MAP_FIXED) { 276 /* adjust address by the same amount as we did the offset */ 277 addr -= pageoff; 278 if (addr & PAGE_MASK) 279 return EINVAL; /* not page aligned */ 280 281 if (addr > SIZE_MAX - size) 282 return EINVAL; /* no wrapping! */ 283 if (VM_MAXUSER_ADDRESS > 0 && 284 (addr + size) > VM_MAXUSER_ADDRESS) 285 return EINVAL; 286 if (vm_min_address > 0 && addr < vm_min_address) 287 return EINVAL; 288 } 289 290 /* check for file mappings (i.e. not anonymous) and verify file. */ 291 if ((flags & MAP_ANON) == 0) { 292 KERNEL_LOCK(); 293 if ((fp = fd_getfile(fdp, fd)) == NULL) { 294 error = EBADF; 295 goto out; 296 } 297 298 if (fp->f_type != DTYPE_VNODE) { 299 error = ENODEV; /* only mmap vnodes! */ 300 goto out; 301 } 302 vp = (struct vnode *)fp->f_data; /* convert to vnode */ 303 304 if (vp->v_type != VREG && vp->v_type != VCHR && 305 vp->v_type != VBLK) { 306 error = ENODEV; /* only REG/CHR/BLK support mmap */ 307 goto out; 308 } 309 310 if (vp->v_type == VREG && (pos + size) < pos) { 311 error = EINVAL; /* no offset wrapping */ 312 goto out; 313 } 314 315 /* special case: catch SunOS style /dev/zero */ 316 if (vp->v_type == VCHR && iszerodev(vp->v_rdev)) { 317 flags |= MAP_ANON; 318 FRELE(fp, p); 319 fp = NULL; 320 KERNEL_UNLOCK(); 321 goto is_anon; 322 } 323 324 /* 325 * Old programs may not select a specific sharing type, so 326 * default to an appropriate one. 327 */ 328 if ((flags & (MAP_SHARED|MAP_PRIVATE)) == 0) { 329 #if defined(DEBUG) 330 printf("WARNING: defaulted mmap() share type to" 331 " %s (pid %d comm %s)\n", 332 vp->v_type == VCHR ? "MAP_SHARED" : "MAP_PRIVATE", 333 p->p_p->ps_pid, p->p_p->ps_comm); 334 #endif 335 if (vp->v_type == VCHR) 336 flags |= MAP_SHARED; /* for a device */ 337 else 338 flags |= MAP_PRIVATE; /* for a file */ 339 } 340 341 /* 342 * MAP_PRIVATE device mappings don't make sense (and aren't 343 * supported anyway). However, some programs rely on this, 344 * so just change it to MAP_SHARED. 345 */ 346 if (vp->v_type == VCHR && (flags & MAP_PRIVATE) != 0) { 347 flags = (flags & ~MAP_PRIVATE) | MAP_SHARED; 348 } 349 350 /* now check protection */ 351 maxprot = PROT_EXEC; 352 353 /* check read access */ 354 if (fp->f_flag & FREAD) 355 maxprot |= PROT_READ; 356 else if (prot & PROT_READ) { 357 error = EACCES; 358 goto out; 359 } 360 361 /* check write access, shared case first */ 362 if (flags & MAP_SHARED) { 363 /* 364 * if the file is writable, only add PROT_WRITE to 365 * maxprot if the file is not immutable, append-only. 366 * otherwise, if we have asked for PROT_WRITE, return 367 * EPERM. 368 */ 369 if (fp->f_flag & FWRITE) { 370 error = VOP_GETATTR(vp, &va, p->p_ucred, p); 371 if (error) 372 goto out; 373 if ((va.va_flags & (IMMUTABLE|APPEND)) == 0) 374 maxprot |= PROT_WRITE; 375 else if (prot & PROT_WRITE) { 376 error = EPERM; 377 goto out; 378 } 379 } else if (prot & PROT_WRITE) { 380 error = EACCES; 381 goto out; 382 } 383 } else { 384 /* MAP_PRIVATE mappings can always write to */ 385 maxprot |= PROT_WRITE; 386 } 387 if ((flags & __MAP_NOFAULT) != 0 || 388 ((flags & MAP_PRIVATE) != 0 && (prot & PROT_WRITE) != 0)) { 389 limit = lim_cur(RLIMIT_DATA); 390 if (limit < size || 391 limit - size < ptoa(p->p_vmspace->vm_dused)) { 392 error = ENOMEM; 393 goto out; 394 } 395 } 396 error = uvm_mmapfile(&p->p_vmspace->vm_map, &addr, size, prot, 397 maxprot, flags, vp, pos, lim_cur(RLIMIT_MEMLOCK), p); 398 FRELE(fp, p); 399 KERNEL_UNLOCK(); 400 } else { /* MAP_ANON case */ 401 if (fd != -1) 402 return EINVAL; 403 404 is_anon: /* label for SunOS style /dev/zero */ 405 406 /* __MAP_NOFAULT only makes sense with a backing object */ 407 if ((flags & __MAP_NOFAULT) != 0) 408 return EINVAL; 409 410 if (prot != PROT_NONE || (flags & MAP_SHARED)) { 411 limit = lim_cur(RLIMIT_DATA); 412 if (limit < size || 413 limit - size < ptoa(p->p_vmspace->vm_dused)) { 414 return ENOMEM; 415 } 416 } 417 418 /* 419 * We've been treating (MAP_SHARED|MAP_PRIVATE) == 0 as 420 * MAP_PRIVATE, so make that clear. 421 */ 422 if ((flags & MAP_SHARED) == 0) 423 flags |= MAP_PRIVATE; 424 425 maxprot = PROT_MASK; 426 error = uvm_mmapanon(&p->p_vmspace->vm_map, &addr, size, prot, 427 maxprot, flags, lim_cur(RLIMIT_MEMLOCK), p); 428 } 429 430 if (error == 0) 431 /* remember to add offset */ 432 *retval = (register_t)(addr + pageoff); 433 434 return error; 435 436 out: 437 KERNEL_UNLOCK(); 438 if (fp) 439 FRELE(fp, p); 440 return error; 441 } 442 443 /* 444 * sys_msync: the msync system call (a front-end for flush) 445 */ 446 447 int 448 sys_msync(struct proc *p, void *v, register_t *retval) 449 { 450 struct sys_msync_args /* { 451 syscallarg(void *) addr; 452 syscallarg(size_t) len; 453 syscallarg(int) flags; 454 } */ *uap = v; 455 vaddr_t addr; 456 vsize_t size, pageoff; 457 int flags, uvmflags; 458 459 /* extract syscall args from the uap */ 460 addr = (vaddr_t)SCARG(uap, addr); 461 size = (vsize_t)SCARG(uap, len); 462 flags = SCARG(uap, flags); 463 464 /* sanity check flags */ 465 if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 || 466 (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 || 467 (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC)) 468 return EINVAL; 469 if ((flags & (MS_ASYNC | MS_SYNC)) == 0) 470 flags |= MS_SYNC; 471 472 /* align the address to a page boundary, and adjust the size accordingly */ 473 ALIGN_ADDR(addr, size, pageoff); 474 if (addr > SIZE_MAX - size) 475 return EINVAL; /* disallow wrap-around. */ 476 477 /* translate MS_ flags into PGO_ flags */ 478 uvmflags = PGO_CLEANIT; 479 if (flags & MS_INVALIDATE) 480 uvmflags |= PGO_FREE; 481 if (flags & MS_SYNC) 482 uvmflags |= PGO_SYNCIO; 483 else 484 uvmflags |= PGO_SYNCIO; /* XXXCDC: force sync for now! */ 485 486 return uvm_map_clean(&p->p_vmspace->vm_map, addr, addr+size, uvmflags); 487 } 488 489 /* 490 * sys_munmap: unmap a users memory 491 */ 492 int 493 sys_munmap(struct proc *p, void *v, register_t *retval) 494 { 495 struct sys_munmap_args /* { 496 syscallarg(void *) addr; 497 syscallarg(size_t) len; 498 } */ *uap = v; 499 vaddr_t addr; 500 vsize_t size, pageoff; 501 vm_map_t map; 502 vaddr_t vm_min_address = VM_MIN_ADDRESS; 503 struct uvm_map_deadq dead_entries; 504 505 /* get syscall args... */ 506 addr = (vaddr_t) SCARG(uap, addr); 507 size = (vsize_t) SCARG(uap, len); 508 509 /* align address to a page boundary, and adjust size accordingly */ 510 ALIGN_ADDR(addr, size, pageoff); 511 512 /* 513 * Check for illegal addresses. Watch out for address wrap... 514 * Note that VM_*_ADDRESS are not constants due to casts (argh). 515 */ 516 if (addr > SIZE_MAX - size) 517 return EINVAL; 518 if (VM_MAXUSER_ADDRESS > 0 && addr + size > VM_MAXUSER_ADDRESS) 519 return EINVAL; 520 if (vm_min_address > 0 && addr < vm_min_address) 521 return EINVAL; 522 map = &p->p_vmspace->vm_map; 523 524 525 vm_map_lock(map); /* lock map so we can checkprot */ 526 527 /* 528 * interesting system call semantic: make sure entire range is 529 * allocated before allowing an unmap. 530 */ 531 if (!uvm_map_checkprot(map, addr, addr + size, PROT_NONE)) { 532 vm_map_unlock(map); 533 return EINVAL; 534 } 535 536 TAILQ_INIT(&dead_entries); 537 if (uvm_unmap_remove(map, addr, addr + size, &dead_entries, 538 FALSE, TRUE, TRUE) != 0) { 539 vm_map_unlock(map); 540 return EPERM; /* immutable entries found */ 541 } 542 vm_map_unlock(map); /* and unlock */ 543 544 uvm_unmap_detach(&dead_entries, 0); 545 546 return 0; 547 } 548 549 /* 550 * sys_mprotect: the mprotect system call 551 */ 552 int 553 sys_mprotect(struct proc *p, void *v, register_t *retval) 554 { 555 struct sys_mprotect_args /* { 556 syscallarg(void *) addr; 557 syscallarg(size_t) len; 558 syscallarg(int) prot; 559 } */ *uap = v; 560 vaddr_t addr; 561 vsize_t size, pageoff; 562 vm_prot_t prot; 563 int error; 564 565 /* 566 * extract syscall args from uap 567 */ 568 569 addr = (vaddr_t)SCARG(uap, addr); 570 size = (vsize_t)SCARG(uap, len); 571 prot = SCARG(uap, prot); 572 573 if ((prot & PROT_MASK) != prot) 574 return EINVAL; 575 if ((prot & (PROT_WRITE | PROT_EXEC)) == (PROT_WRITE | PROT_EXEC) && 576 (error = uvm_wxcheck(p, "mprotect"))) 577 return error; 578 579 error = pledge_protexec(p, prot); 580 if (error) 581 return error; 582 583 /* 584 * align the address to a page boundary, and adjust the size accordingly 585 */ 586 ALIGN_ADDR(addr, size, pageoff); 587 if (addr > SIZE_MAX - size) 588 return EINVAL; /* disallow wrap-around. */ 589 590 return (uvm_map_protect(&p->p_vmspace->vm_map, addr, addr+size, 591 prot, 0, FALSE, TRUE)); 592 } 593 594 /* 595 * sys_pinsyscalls. The caller is required to normalize base,len 596 * to the minimum .text region, and adjust pintable offsets relative 597 * to that base. 598 */ 599 int 600 sys_pinsyscalls(struct proc *p, void *v, register_t *retval) 601 { 602 struct sys_pinsyscalls_args /* { 603 syscallarg(void *) base; 604 syscallarg(size_t) len; 605 syscallarg(u_int *) pins; 606 syscallarg(int) npins; 607 } */ *uap = v; 608 struct process *pr = p->p_p; 609 struct vm_map *map = &p->p_vmspace->vm_map; 610 int npins, error = 0, i; 611 vaddr_t base; 612 size_t len; 613 u_int *pins; 614 615 if (pr->ps_libcpin.pn_start || 616 (pr->ps_vmspace->vm_map.flags & VM_MAP_PINSYSCALL_ONCE)) 617 return (EPERM); 618 base = (vaddr_t)SCARG(uap, base); 619 len = (vsize_t)SCARG(uap, len); 620 if (base > SIZE_MAX - len) 621 return (EINVAL); /* disallow wrap-around. */ 622 if (base < map->min_offset || base+len > map->max_offset) 623 return (EINVAL); 624 625 /* XXX MP unlock */ 626 627 npins = SCARG(uap, npins); 628 if (npins < 1 || npins > SYS_MAXSYSCALL) 629 return (E2BIG); 630 pins = malloc(npins * sizeof(u_int), M_PINSYSCALL, M_WAITOK|M_ZERO); 631 if (pins == NULL) 632 return (ENOMEM); 633 error = copyin(SCARG(uap, pins), pins, npins * sizeof(u_int)); 634 if (error) 635 goto err; 636 637 /* Range-check pintable offsets */ 638 for (i = 0; i < npins; i++) { 639 if (pins[i] == (u_int)-1 || pins[i] == 0) 640 continue; 641 if (pins[i] > SCARG(uap, len)) { 642 error = ERANGE; 643 break; 644 } 645 } 646 if (error) { 647 err: 648 free(pins, M_PINSYSCALL, npins * sizeof(u_int)); 649 return (error); 650 } 651 pr->ps_libcpin.pn_start = base; 652 pr->ps_libcpin.pn_end = base + len; 653 pr->ps_libcpin.pn_pins = pins; 654 pr->ps_libcpin.pn_npins = npins; 655 656 #ifdef PMAP_CHECK_COPYIN 657 /* Assume (and insist) on libc.so text being execute-only */ 658 if (PMAP_CHECK_COPYIN) 659 uvm_map_check_copyin_add(map, base, base+len); 660 #endif 661 return (0); 662 } 663 664 /* 665 * sys_mimmutable: the mimmutable system call 666 */ 667 int 668 sys_mimmutable(struct proc *p, void *v, register_t *retval) 669 { 670 struct sys_mimmutable_args /* { 671 immutablearg(void *) addr; 672 immutablearg(size_t) len; 673 } */ *uap = v; 674 vaddr_t addr; 675 vsize_t size, pageoff; 676 677 addr = (vaddr_t)SCARG(uap, addr); 678 size = (vsize_t)SCARG(uap, len); 679 680 /* 681 * align the address to a page boundary, and adjust the size accordingly 682 */ 683 ALIGN_ADDR(addr, size, pageoff); 684 if (addr > SIZE_MAX - size) 685 return EINVAL; /* disallow wrap-around. */ 686 687 return uvm_map_immutable(&p->p_vmspace->vm_map, addr, addr+size, 1); 688 } 689 690 /* 691 * sys_minherit: the minherit system call 692 */ 693 int 694 sys_minherit(struct proc *p, void *v, register_t *retval) 695 { 696 struct sys_minherit_args /* { 697 syscallarg(void *) addr; 698 syscallarg(size_t) len; 699 syscallarg(int) inherit; 700 } */ *uap = v; 701 vaddr_t addr; 702 vsize_t size, pageoff; 703 vm_inherit_t inherit; 704 705 addr = (vaddr_t)SCARG(uap, addr); 706 size = (vsize_t)SCARG(uap, len); 707 inherit = SCARG(uap, inherit); 708 709 /* 710 * align the address to a page boundary, and adjust the size accordingly 711 */ 712 ALIGN_ADDR(addr, size, pageoff); 713 if (addr > SIZE_MAX - size) 714 return EINVAL; /* disallow wrap-around. */ 715 716 return (uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr+size, 717 inherit)); 718 } 719 720 /* 721 * sys_madvise: give advice about memory usage. 722 */ 723 int 724 sys_madvise(struct proc *p, void *v, register_t *retval) 725 { 726 struct sys_madvise_args /* { 727 syscallarg(void *) addr; 728 syscallarg(size_t) len; 729 syscallarg(int) behav; 730 } */ *uap = v; 731 vaddr_t addr; 732 vsize_t size, pageoff; 733 int advice, error; 734 735 addr = (vaddr_t)SCARG(uap, addr); 736 size = (vsize_t)SCARG(uap, len); 737 advice = SCARG(uap, behav); 738 739 /* 740 * align the address to a page boundary, and adjust the size accordingly 741 */ 742 ALIGN_ADDR(addr, size, pageoff); 743 if (addr > SIZE_MAX - size) 744 return EINVAL; /* disallow wrap-around. */ 745 746 switch (advice) { 747 case MADV_NORMAL: 748 case MADV_RANDOM: 749 case MADV_SEQUENTIAL: 750 error = uvm_map_advice(&p->p_vmspace->vm_map, addr, 751 addr + size, advice); 752 break; 753 754 case MADV_WILLNEED: 755 /* 756 * Activate all these pages, pre-faulting them in if 757 * necessary. 758 */ 759 /* 760 * XXX IMPLEMENT ME. 761 * Should invent a "weak" mode for uvm_fault() 762 * which would only do the PGO_LOCKED pgo_get(). 763 */ 764 return 0; 765 766 case MADV_DONTNEED: 767 /* 768 * Deactivate all these pages. We don't need them 769 * any more. We don't, however, toss the data in 770 * the pages. 771 */ 772 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size, 773 PGO_DEACTIVATE); 774 break; 775 776 case MADV_FREE: 777 /* 778 * These pages contain no valid data, and may be 779 * garbage-collected. Toss all resources, including 780 * any swap space in use. 781 */ 782 error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size, 783 PGO_FREE); 784 break; 785 786 case MADV_SPACEAVAIL: 787 /* 788 * XXXMRG What is this? I think it's: 789 * 790 * Ensure that we have allocated backing-store 791 * for these pages. 792 * 793 * This is going to require changes to the page daemon, 794 * as it will free swap space allocated to pages in core. 795 * There's also what to do for device/file/anonymous memory. 796 */ 797 return EINVAL; 798 799 default: 800 return EINVAL; 801 } 802 803 return error; 804 } 805 806 /* 807 * sys_mlock: memory lock 808 */ 809 810 int 811 sys_mlock(struct proc *p, void *v, register_t *retval) 812 { 813 struct sys_mlock_args /* { 814 syscallarg(const void *) addr; 815 syscallarg(size_t) len; 816 } */ *uap = v; 817 vaddr_t addr; 818 vsize_t size, pageoff; 819 int error; 820 821 /* extract syscall args from uap */ 822 addr = (vaddr_t)SCARG(uap, addr); 823 size = (vsize_t)SCARG(uap, len); 824 825 /* align address to a page boundary and adjust size accordingly */ 826 ALIGN_ADDR(addr, size, pageoff); 827 if (addr > SIZE_MAX - size) 828 return EINVAL; /* disallow wrap-around. */ 829 830 if (atop(size) + uvmexp.wired > uvmexp.wiredmax) 831 return EAGAIN; 832 833 #ifdef pmap_wired_count 834 if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) > 835 lim_cur(RLIMIT_MEMLOCK)) 836 return EAGAIN; 837 #else 838 if ((error = suser(p)) != 0) 839 return error; 840 #endif 841 842 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, FALSE, 843 0); 844 return error == 0 ? 0 : ENOMEM; 845 } 846 847 /* 848 * sys_munlock: unlock wired pages 849 */ 850 851 int 852 sys_munlock(struct proc *p, void *v, register_t *retval) 853 { 854 struct sys_munlock_args /* { 855 syscallarg(const void *) addr; 856 syscallarg(size_t) len; 857 } */ *uap = v; 858 vaddr_t addr; 859 vsize_t size, pageoff; 860 int error; 861 862 /* extract syscall args from uap */ 863 addr = (vaddr_t)SCARG(uap, addr); 864 size = (vsize_t)SCARG(uap, len); 865 866 /* align address to a page boundary, and adjust size accordingly */ 867 ALIGN_ADDR(addr, size, pageoff); 868 if (addr > SIZE_MAX - size) 869 return EINVAL; /* disallow wrap-around. */ 870 871 #ifndef pmap_wired_count 872 if ((error = suser(p)) != 0) 873 return error; 874 #endif 875 876 error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, TRUE, 877 0); 878 return error == 0 ? 0 : ENOMEM; 879 } 880 881 /* 882 * sys_mlockall: lock all pages mapped into an address space. 883 */ 884 int 885 sys_mlockall(struct proc *p, void *v, register_t *retval) 886 { 887 struct sys_mlockall_args /* { 888 syscallarg(int) flags; 889 } */ *uap = v; 890 int error, flags; 891 892 flags = SCARG(uap, flags); 893 894 if (flags == 0 || 895 (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0) 896 return EINVAL; 897 898 #ifndef pmap_wired_count 899 if ((error = suser(p)) != 0) 900 return error; 901 #endif 902 903 error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags, 904 lim_cur(RLIMIT_MEMLOCK)); 905 if (error != 0 && error != ENOMEM) 906 return EAGAIN; 907 return error; 908 } 909 910 /* 911 * sys_munlockall: unlock all pages mapped into an address space. 912 */ 913 int 914 sys_munlockall(struct proc *p, void *v, register_t *retval) 915 { 916 917 (void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0); 918 return 0; 919 } 920 921 /* 922 * common code for mmapanon and mmapfile to lock a mmaping 923 */ 924 int 925 uvm_mmaplock(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot, 926 vsize_t locklimit) 927 { 928 int error; 929 930 /* 931 * POSIX 1003.1b -- if our address space was configured 932 * to lock all future mappings, wire the one we just made. 933 */ 934 if (prot == PROT_NONE) { 935 /* 936 * No more work to do in this case. 937 */ 938 return 0; 939 } 940 941 vm_map_lock(map); 942 if (map->flags & VM_MAP_WIREFUTURE) { 943 KERNEL_LOCK(); 944 if ((atop(size) + uvmexp.wired) > uvmexp.wiredmax 945 #ifdef pmap_wired_count 946 || (locklimit != 0 && (size + 947 ptoa(pmap_wired_count(vm_map_pmap(map)))) > 948 locklimit) 949 #endif 950 ) { 951 error = ENOMEM; 952 vm_map_unlock(map); 953 /* unmap the region! */ 954 uvm_unmap(map, *addr, *addr + size); 955 KERNEL_UNLOCK(); 956 return error; 957 } 958 /* 959 * uvm_map_pageable() always returns the map 960 * unlocked. 961 */ 962 error = uvm_map_pageable(map, *addr, *addr + size, 963 FALSE, UVM_LK_ENTER); 964 if (error != 0) { 965 /* unmap the region! */ 966 uvm_unmap(map, *addr, *addr + size); 967 KERNEL_UNLOCK(); 968 return error; 969 } 970 KERNEL_UNLOCK(); 971 return 0; 972 } 973 vm_map_unlock(map); 974 return 0; 975 } 976 977 /* 978 * uvm_mmapanon: internal version of mmap for anons 979 * 980 * - used by sys_mmap 981 */ 982 int 983 uvm_mmapanon(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot, 984 vm_prot_t maxprot, int flags, vsize_t locklimit, struct proc *p) 985 { 986 int error; 987 int advice = MADV_NORMAL; 988 unsigned int uvmflag = 0; 989 vsize_t align = 0; /* userland page size */ 990 991 /* 992 * for non-fixed mappings, round off the suggested address. 993 * for fixed mappings, check alignment and zap old mappings. 994 */ 995 if ((flags & MAP_FIXED) == 0) { 996 *addr = round_page(*addr); /* round */ 997 } else { 998 if (*addr & PAGE_MASK) 999 return EINVAL; 1000 1001 uvmflag |= UVM_FLAG_FIXED; 1002 if ((flags & __MAP_NOREPLACE) == 0) 1003 uvmflag |= UVM_FLAG_UNMAP; 1004 } 1005 1006 if ((flags & MAP_FIXED) == 0 && size >= __LDPGSZ) 1007 align = __LDPGSZ; 1008 if ((flags & MAP_SHARED) == 0) 1009 /* XXX: defer amap create */ 1010 uvmflag |= UVM_FLAG_COPYONW; 1011 else 1012 /* shared: create amap now */ 1013 uvmflag |= UVM_FLAG_OVERLAY; 1014 if (flags & MAP_STACK) 1015 uvmflag |= UVM_FLAG_STACK; 1016 if (flags & MAP_CONCEAL) 1017 uvmflag |= UVM_FLAG_CONCEAL; 1018 1019 /* set up mapping flags */ 1020 uvmflag = UVM_MAPFLAG(prot, maxprot, 1021 (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY, 1022 advice, uvmflag); 1023 1024 error = uvm_mapanon(map, addr, size, align, uvmflag); 1025 1026 if (error == 0) 1027 error = uvm_mmaplock(map, addr, size, prot, locklimit); 1028 return error; 1029 } 1030 1031 /* 1032 * uvm_mmapfile: internal version of mmap for non-anons 1033 * 1034 * - used by sys_mmap 1035 * - caller must page-align the file offset 1036 */ 1037 int 1038 uvm_mmapfile(vm_map_t map, vaddr_t *addr, vsize_t size, vm_prot_t prot, 1039 vm_prot_t maxprot, int flags, struct vnode *vp, voff_t foff, 1040 vsize_t locklimit, struct proc *p) 1041 { 1042 struct uvm_object *uobj; 1043 int error; 1044 int advice = MADV_NORMAL; 1045 unsigned int uvmflag = 0; 1046 vsize_t align = 0; /* userland page size */ 1047 1048 /* 1049 * for non-fixed mappings, round off the suggested address. 1050 * for fixed mappings, check alignment and zap old mappings. 1051 */ 1052 if ((flags & MAP_FIXED) == 0) { 1053 *addr = round_page(*addr); /* round */ 1054 } else { 1055 if (*addr & PAGE_MASK) 1056 return EINVAL; 1057 1058 uvmflag |= UVM_FLAG_FIXED; 1059 if ((flags & __MAP_NOREPLACE) == 0) 1060 uvmflag |= UVM_FLAG_UNMAP; 1061 } 1062 1063 /* 1064 * attach to underlying vm object. 1065 */ 1066 if (vp->v_type != VCHR) { 1067 uobj = uvn_attach(vp, (flags & MAP_SHARED) ? 1068 maxprot : (maxprot & ~PROT_WRITE)); 1069 1070 /* 1071 * XXXCDC: hack from old code 1072 * don't allow vnodes which have been mapped 1073 * shared-writeable to persist [forces them to be 1074 * flushed out when last reference goes]. 1075 * XXXCDC: interesting side effect: avoids a bug. 1076 * note that in WRITE [ufs_readwrite.c] that we 1077 * allocate buffer, uncache, and then do the write. 1078 * the problem with this is that if the uncache causes 1079 * VM data to be flushed to the same area of the file 1080 * we are writing to... in that case we've got the 1081 * buffer locked and our process goes to sleep forever. 1082 * 1083 * XXXCDC: checking maxprot protects us from the 1084 * "persistbug" program but this is not a long term 1085 * solution. 1086 * 1087 * XXXCDC: we don't bother calling uncache with the vp 1088 * VOP_LOCKed since we know that we are already 1089 * holding a valid reference to the uvn (from the 1090 * uvn_attach above), and thus it is impossible for 1091 * the uncache to kill the uvn and trigger I/O. 1092 */ 1093 if (flags & MAP_SHARED) { 1094 if ((prot & PROT_WRITE) || 1095 (maxprot & PROT_WRITE)) { 1096 uvm_vnp_uncache(vp); 1097 } 1098 } 1099 } else { 1100 uobj = udv_attach(vp->v_rdev, 1101 (flags & MAP_SHARED) ? maxprot : 1102 (maxprot & ~PROT_WRITE), foff, size); 1103 /* 1104 * XXX Some devices don't like to be mapped with 1105 * XXX PROT_EXEC, but we don't really have a 1106 * XXX better way of handling this, right now 1107 */ 1108 if (uobj == NULL && (prot & PROT_EXEC) == 0) { 1109 maxprot &= ~PROT_EXEC; 1110 uobj = udv_attach(vp->v_rdev, 1111 (flags & MAP_SHARED) ? maxprot : 1112 (maxprot & ~PROT_WRITE), foff, size); 1113 } 1114 advice = MADV_RANDOM; 1115 } 1116 1117 if (uobj == NULL) 1118 return vp->v_type == VREG ? ENOMEM : EINVAL; 1119 1120 if ((flags & MAP_SHARED) == 0) 1121 uvmflag |= UVM_FLAG_COPYONW; 1122 if (flags & __MAP_NOFAULT) 1123 uvmflag |= (UVM_FLAG_NOFAULT | UVM_FLAG_OVERLAY); 1124 if (flags & MAP_STACK) 1125 uvmflag |= UVM_FLAG_STACK; 1126 if (flags & MAP_CONCEAL) 1127 uvmflag |= UVM_FLAG_CONCEAL; 1128 1129 /* set up mapping flags */ 1130 uvmflag = UVM_MAPFLAG(prot, maxprot, 1131 (flags & MAP_SHARED) ? MAP_INHERIT_SHARE : MAP_INHERIT_COPY, 1132 advice, uvmflag); 1133 1134 error = uvm_map(map, addr, size, uobj, foff, align, uvmflag); 1135 1136 if (error == 0) 1137 return uvm_mmaplock(map, addr, size, prot, locklimit); 1138 1139 /* errors: first detach from the uobj, if any. */ 1140 if (uobj) 1141 uobj->pgops->pgo_detach(uobj); 1142 1143 return error; 1144 } 1145 1146 int 1147 sys_kbind(struct proc *p, void *v, register_t *retval) 1148 { 1149 struct sys_kbind_args /* { 1150 syscallarg(const struct __kbind *) param; 1151 syscallarg(size_t) psize; 1152 syscallarg(uint64_t) proc_cookie; 1153 } */ *uap = v; 1154 const struct __kbind *paramp; 1155 union { 1156 struct __kbind uk[KBIND_BLOCK_MAX]; 1157 char upad[KBIND_BLOCK_MAX * sizeof(*paramp) + KBIND_DATA_MAX]; 1158 } param; 1159 struct uvm_map_deadq dead_entries; 1160 struct process *pr = p->p_p; 1161 const char *data; 1162 vaddr_t baseva, last_baseva, endva, pageoffset, kva; 1163 size_t psize, s; 1164 u_long pc; 1165 int count, i, extra; 1166 int error, sigill = 0; 1167 1168 /* 1169 * extract syscall args from uap 1170 */ 1171 paramp = SCARG(uap, param); 1172 psize = SCARG(uap, psize); 1173 1174 /* 1175 * If paramp is NULL and we're uninitialized, disable the syscall 1176 * for the process. Raise SIGILL if paramp is NULL and we're 1177 * already initialized. 1178 * 1179 * If paramp is non-NULL and we're uninitialized, do initialization. 1180 * Otherwise, do security checks and raise SIGILL on failure. 1181 */ 1182 pc = PROC_PC(p); 1183 mtx_enter(&pr->ps_mtx); 1184 if (paramp == NULL) { 1185 /* ld.so disables kbind() when lazy binding is disabled */ 1186 if (pr->ps_kbind_addr == 0) 1187 pr->ps_kbind_addr = BOGO_PC; 1188 /* pre-7.3 static binaries disable kbind */ 1189 /* XXX delete check in 2026 */ 1190 else if (pr->ps_kbind_addr != BOGO_PC) 1191 sigill = 1; 1192 } else if (pr->ps_kbind_addr == 0) { 1193 pr->ps_kbind_addr = pc; 1194 pr->ps_kbind_cookie = SCARG(uap, proc_cookie); 1195 } else if (pc != pr->ps_kbind_addr || pc == BOGO_PC || 1196 pr->ps_kbind_cookie != SCARG(uap, proc_cookie)) { 1197 sigill = 1; 1198 } 1199 mtx_leave(&pr->ps_mtx); 1200 1201 /* Raise SIGILL if something is off. */ 1202 if (sigill) { 1203 KERNEL_LOCK(); 1204 sigexit(p, SIGILL); 1205 /* NOTREACHED */ 1206 KERNEL_UNLOCK(); 1207 } 1208 1209 /* We're done if we were disabling the syscall. */ 1210 if (paramp == NULL) 1211 return 0; 1212 1213 if (psize < sizeof(struct __kbind) || psize > sizeof(param)) 1214 return EINVAL; 1215 if ((error = copyin(paramp, ¶m, psize))) 1216 return error; 1217 1218 /* 1219 * The param argument points to an array of __kbind structures 1220 * followed by the corresponding new data areas for them. Verify 1221 * that the sizes in the __kbind structures add up to the total 1222 * size and find the start of the new area. 1223 */ 1224 paramp = ¶m.uk[0]; 1225 s = psize; 1226 for (count = 0; s > 0 && count < KBIND_BLOCK_MAX; count++) { 1227 if (s < sizeof(*paramp)) 1228 return EINVAL; 1229 s -= sizeof(*paramp); 1230 1231 baseva = (vaddr_t)paramp[count].kb_addr; 1232 endva = baseva + paramp[count].kb_size - 1; 1233 if (paramp[count].kb_addr == NULL || 1234 paramp[count].kb_size == 0 || 1235 paramp[count].kb_size > KBIND_DATA_MAX || 1236 baseva >= VM_MAXUSER_ADDRESS || 1237 endva >= VM_MAXUSER_ADDRESS || 1238 s < paramp[count].kb_size) 1239 return EINVAL; 1240 1241 s -= paramp[count].kb_size; 1242 } 1243 if (s > 0) 1244 return EINVAL; 1245 data = (const char *)¶mp[count]; 1246 1247 /* all looks good, so do the bindings */ 1248 last_baseva = VM_MAXUSER_ADDRESS; 1249 kva = 0; 1250 TAILQ_INIT(&dead_entries); 1251 for (i = 0; i < count; i++) { 1252 baseva = (vaddr_t)paramp[i].kb_addr; 1253 s = paramp[i].kb_size; 1254 pageoffset = baseva & PAGE_MASK; 1255 baseva = trunc_page(baseva); 1256 1257 /* hppa at least runs PLT entries over page edge */ 1258 extra = (pageoffset + s) & PAGE_MASK; 1259 if (extra > pageoffset) 1260 extra = 0; 1261 else 1262 s -= extra; 1263 redo: 1264 /* make sure the desired page is mapped into kernel_map */ 1265 if (baseva != last_baseva) { 1266 if (kva != 0) { 1267 vm_map_lock(kernel_map); 1268 uvm_unmap_remove(kernel_map, kva, 1269 kva+PAGE_SIZE, &dead_entries, 1270 FALSE, TRUE, FALSE); /* XXX */ 1271 vm_map_unlock(kernel_map); 1272 kva = 0; 1273 } 1274 if ((error = uvm_map_extract(&p->p_vmspace->vm_map, 1275 baseva, PAGE_SIZE, &kva, UVM_EXTRACT_FIXPROT))) 1276 break; 1277 last_baseva = baseva; 1278 } 1279 1280 /* do the update */ 1281 if ((error = kcopy(data, (char *)kva + pageoffset, s))) 1282 break; 1283 data += s; 1284 1285 if (extra > 0) { 1286 baseva += PAGE_SIZE; 1287 s = extra; 1288 pageoffset = 0; 1289 extra = 0; 1290 goto redo; 1291 } 1292 } 1293 1294 if (kva != 0) { 1295 vm_map_lock(kernel_map); 1296 uvm_unmap_remove(kernel_map, kva, kva+PAGE_SIZE, 1297 &dead_entries, FALSE, TRUE, FALSE); /* XXX */ 1298 vm_map_unlock(kernel_map); 1299 } 1300 uvm_unmap_detach(&dead_entries, AMAP_REFALL); 1301 1302 return error; 1303 } 1304