/*	$NetBSD: uvm_mmap.c,v 1.175 2020/02/23 15:46:43 ad Exp $	*/

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * Copyright (c) 1991, 1993 The Regents of the University of California.
 * Copyright (c) 1988 University of Utah.
 *
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
 *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
 * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
 */

/*
 * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
 * function.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.175 2020/02/23 15:46:43 ad Exp $");

#include "opt_compat_netbsd.h"
#include "opt_pax.h"

#include <sys/types.h>
#include <sys/file.h>
#include <sys/filedesc.h>
#include <sys/resourcevar.h>
#include <sys/mman.h>
#include <sys/pax.h>

#include <sys/syscallargs.h>

#include <uvm/uvm.h>
#include <uvm/uvm_device.h>

static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
    int, int, struct uvm_object *, voff_t, vsize_t);

static int
range_test(const struct vm_map *map, vaddr_t addr, vsize_t size, bool ismmap)
{
	vaddr_t vm_min_address = vm_map_min(map);
	vaddr_t vm_max_address = vm_map_max(map);
	vaddr_t eaddr = addr + size;
	int res = 0;

	if (addr < vm_min_address)
		return EINVAL;
	if (eaddr > vm_max_address)
		return ismmap ? EFBIG : EINVAL;
	if (addr > eaddr) /* no wrapping! */
		return ismmap ? EOVERFLOW : EINVAL;

#ifdef MD_MMAP_RANGE_TEST
	res = MD_MMAP_RANGE_TEST(addr, eaddr);
#endif

	return res;
}

/*
 * align the address to a page boundary, and adjust the size accordingly
 */
static int
round_and_check(const struct vm_map *map, vaddr_t *addr, vsize_t *size)
{
	const vsize_t pageoff = (vsize_t)(*addr & PAGE_MASK);

	*addr -= pageoff;

	if (*size != 0) {
		*size += pageoff;
		*size = (vsize_t)round_page(*size);
	} else if (*addr + *size < *addr) {
		return ENOMEM;
	}

	return range_test(map, *addr, *size, false);
}
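
/*
 * A minimal illustration of the rounding above (not part of this file;
 * hypothetical userland sketch with an assumed 4 KiB page size): an
 * address of 0x1234 and a length of 0x10 are turned into a page-aligned
 * addr of 0x1000 and a rounded size of 0x1000, which range_test() then
 * checks against the map bounds and for wrap-around.
 */
#if 0
#include <stdint.h>
#include <stdio.h>

#define EX_PAGE_SIZE	0x1000UL		/* assumed page size */
#define EX_PAGE_MASK	(EX_PAGE_SIZE - 1)

int
main(void)
{
	uintptr_t addr = 0x1234;
	size_t size = 0x10;
	size_t pageoff = addr & EX_PAGE_MASK;

	addr -= pageoff;				/* truncate address */
	size = (size + pageoff + EX_PAGE_MASK) & ~EX_PAGE_MASK; /* round up */
	printf("addr %#lx size %#zx\n", (unsigned long)addr, size);
	return 0;
}
#endif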

/*
 * sys_mincore: determine if pages are in core or not.
 */

/* ARGSUSED */
int
sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(char *) vec;
	} */
	struct proc *p = l->l_proc;
	struct vm_page *pg;
	char *vec, pgi;
	struct uvm_object *uobj;
	struct vm_amap *amap;
	struct vm_anon *anon;
	struct vm_map_entry *entry;
	vaddr_t start, end, lim;
	struct vm_map *map;
	vsize_t len;
	int error = 0;
	size_t npgs;

	map = &p->p_vmspace->vm_map;

	start = (vaddr_t)SCARG(uap, addr);
	len = SCARG(uap, len);
	vec = SCARG(uap, vec);

	if (start & PAGE_MASK)
		return EINVAL;
	len = round_page(len);
	end = start + len;
	if (end <= start)
		return EINVAL;

	/*
	 * Lock down vec, so that our returned status can't be made
	 * stale by the page faults taken while storing each status byte.
	 */

	npgs = len >> PAGE_SHIFT;
	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
	if (error) {
		return error;
	}
	vm_map_lock_read(map);

	if (uvm_map_lookup_entry(map, start, &entry) == false) {
		error = ENOMEM;
		goto out;
	}

	for (/* nothing */;
	     entry != &map->header && entry->start < end;
	     entry = entry->next) {
		KASSERT(!UVM_ET_ISSUBMAP(entry));
		KASSERT(start >= entry->start);

		/* Make sure there are no holes. */
		if (entry->end < end &&
		     (entry->next == &map->header ||
		      entry->next->start > entry->end)) {
			error = ENOMEM;
			goto out;
		}

		lim = end < entry->end ? end : entry->end;

		/*
		 * Special case for objects with no "real" pages.  Those
		 * are always considered resident (mapped devices).
		 */

		if (UVM_ET_ISOBJ(entry)) {
			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
				for (/* nothing */; start < lim;
				     start += PAGE_SIZE, vec++)
					ustore_char(vec, 1);
				continue;
			}
		}

		amap = entry->aref.ar_amap;	/* upper layer */
		uobj = entry->object.uvm_obj;	/* lower layer */

		if (amap != NULL)
			amap_lock(amap, RW_READER);
		if (uobj != NULL)
			rw_enter(uobj->vmobjlock, RW_READER);

		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
			pgi = 0;
			if (amap != NULL) {
				/* Check the upper layer first. */
				anon = amap_lookup(&entry->aref,
				    start - entry->start);
				/* Don't need to lock anon here. */
				if (anon != NULL && anon->an_page != NULL) {

					/*
					 * Anon has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			if (uobj != NULL && pgi == 0) {
				/* Check the lower layer. */
				pg = uvm_pagelookup(uobj,
				    entry->offset + (start - entry->start));
				if (pg != NULL) {

					/*
					 * Object has the page for this entry
					 * offset.
					 */

					pgi = 1;
				}
			}
			(void) ustore_char(vec, pgi);
		}
		if (uobj != NULL)
			rw_exit(uobj->vmobjlock);
		if (amap != NULL)
			amap_unlock(amap);
	}

 out:
	vm_map_unlock_read(map);
	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
	return error;
}
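
/*
 * Userland usage sketch for the above (illustrative, not part of this
 * file; assumes the standard mincore(2) prototype): touch one page of
 * an anonymous mapping and read back the residency vector.
 */
#if 0
#include <sys/mman.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	size_t len = 4 * (size_t)pgsz;
	char vec[4];
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		return 1;
	p[0] = 1;			/* fault in page 0 */
	if (mincore(p, len, vec) == -1)
		return 1;
	for (size_t i = 0; i < 4; i++)
		printf("page %zu: %s\n", i,
		    (vec[i] & 1) ? "in core" : "not in core");
	munmap(p, len);
	return 0;
}
#endif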

/*
 * sys_mmap: mmap system call.
 *
 * => file offset and address may not be page aligned
 *    - if MAP_FIXED, offset and address must have the same remainder
 *      mod PAGE_SIZE
 *    - if address isn't page aligned the mapping starts at trunc_page(addr)
 *      and the return value is adjusted up by the page offset.
 */

int
sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
		syscallarg(int) flags;
		syscallarg(int) fd;
		syscallarg(long) pad;
		syscallarg(off_t) pos;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	off_t pos;
	vsize_t size, pageoff, newsize;
	vm_prot_t prot, maxprot, extraprot;
	int flags, fd, advice;
	vaddr_t defaddr;
	struct file *fp = NULL;
	struct uvm_object *uobj;
	int error;
#ifdef PAX_ASLR
	vaddr_t orig_addr;
#endif /* PAX_ASLR */

	/*
	 * first, extract syscall args from the uap.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;
	extraprot = PROT_MPROTECT_EXTRACT(SCARG(uap, prot));
	flags = SCARG(uap, flags);
	fd = SCARG(uap, fd);
	pos = SCARG(uap, pos);

#ifdef PAX_ASLR
	orig_addr = addr;
#endif /* PAX_ASLR */

	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
		return EINVAL;

	/*
	 * align file position and save offset.  adjust size.
	 */

	pageoff = (pos & PAGE_MASK);
	pos    -= pageoff;
	newsize = size + pageoff;		/* add offset */
	newsize = (vsize_t)round_page(newsize);	/* round up */

	if (newsize < size)
		return ENOMEM;
	size = newsize;

	/*
	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
	 */
	if (flags & MAP_FIXED) {
		/* ensure address and file offset are aligned properly */
		addr -= pageoff;
		if (addr & PAGE_MASK)
			return EINVAL;

		error = range_test(&p->p_vmspace->vm_map, addr, size, true);
		if (error) {
			return error;
		}
	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
		/*
		 * not fixed: make sure we skip over the largest
		 * possible heap for non-topdown mapping arrangements.
		 * we will refine our guess later (e.g. to account for
		 * VAC, etc)
		 */

		defaddr = p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, size,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

		if (addr == 0 || !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
			addr = MAX(addr, defaddr);
		else
			addr = MIN(addr, defaddr);
	}

	/*
	 * check for file mappings (i.e. not anonymous) and verify file.
	 */

	advice = UVM_ADV_NORMAL;
	if ((flags & MAP_ANON) == 0) {
		if ((fp = fd_getfile(fd)) == NULL)
			return EBADF;

		if (fp->f_ops->fo_mmap == NULL) {
			error = ENODEV;
			goto out;
		}
		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
		    &advice, &uobj, &maxprot);
		if (error) {
			goto out;
		}
		if (uobj == NULL) {
			flags |= MAP_ANON;
			fd_putfile(fd);
			fp = NULL;
			goto is_anon;
		}
	} else {		/* MAP_ANON case */
		/*
		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
		 */
		if (fd != -1)
			return EINVAL;

 is_anon:		/* label for SunOS style /dev/zero */
		uobj = NULL;
		maxprot = VM_PROT_ALL;
		pos = 0;
	}

	maxprot = PAX_MPROTECT_MAXPROTECT(l, prot, extraprot, maxprot);
	if (((prot | extraprot) & maxprot) != (prot | extraprot)) {
		error = EACCES;
		goto out;
	}
	if ((error = PAX_MPROTECT_VALIDATE(l, prot)))
		goto out;

	pax_aslr_mmap(l, &addr, orig_addr, flags);

	/*
	 * now let kernel internal function uvm_mmap do the work.
	 */

	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);

	/* remember to add offset */
	*retval = (register_t)(addr + pageoff);

 out:
	if (fp != NULL)
		fd_putfile(fd);

	return error;
}
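
/*
 * Userland sketch of the unaligned-offset handling above (illustrative,
 * not part of this file; "example.dat" is a hypothetical file at least
 * one page long): mapping at file offset 100 returns a pointer already
 * advanced by (100 & PAGE_MASK) past the page-aligned mapping start,
 * exactly as *retval = addr + pageoff computes.
 */
#if 0
#include <sys/mman.h>
#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

int
main(void)
{
	long pgsz = sysconf(_SC_PAGESIZE);
	off_t off = 100;			/* deliberately unaligned */
	int fd = open("example.dat", O_RDONLY);
	char *p;

	if (fd == -1)
		return 1;
	p = mmap(NULL, 1, PROT_READ, MAP_PRIVATE, fd, off);
	if (p == MAP_FAILED)
		return 1;
	printf("byte at offset %lld: %d\n", (long long)off, p[0]);
	/* The mapping itself began at the page boundary below p. */
	munmap(p - (off % pgsz), (size_t)(off % pgsz) + 1);
	close(fd);
	return 0;
}
#endif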

/*
 * sys___msync13: the msync system call (a front-end for flush)
 */

int
sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;
	struct vm_map *map;
	int error, flags, uvmflags;
	bool rv;

	/*
	 * extract syscall args from the uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	flags = SCARG(uap, flags);

	/* sanity check flags */
	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
		return EINVAL;
	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
		flags |= MS_SYNC;

	/*
	 * get map
	 */
	map = &p->p_vmspace->vm_map;

	if (round_and_check(map, &addr, &size))
		return ENOMEM;

	/*
	 * XXXCDC: do we really need this semantic?
	 *
	 * XXX Gak!  If size is zero we are supposed to sync "all modified
	 * pages with the region containing addr".  Unfortunately, we
	 * don't really keep track of individual mmaps so we approximate
	 * by flushing the range of the map entry containing addr.
	 * This can be incorrect if the region splits or is coalesced
	 * with a neighbor.
	 */

	if (size == 0) {
		struct vm_map_entry *entry;

		vm_map_lock_read(map);
		rv = uvm_map_lookup_entry(map, addr, &entry);
		if (rv == true) {
			addr = entry->start;
			size = entry->end - entry->start;
		}
		vm_map_unlock_read(map);
		if (rv == false)
			return EINVAL;
	}

	/*
	 * translate MS_ flags into PGO_ flags
	 */

	uvmflags = PGO_CLEANIT;
	if (flags & MS_INVALIDATE)
		uvmflags |= PGO_FREE;
	if (flags & MS_SYNC)
		uvmflags |= PGO_SYNCIO;

	error = uvm_map_clean(map, addr, addr+size, uvmflags);
	return error;
}
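
/*
 * Userland sketch of the flag translation above (illustrative, not part
 * of this file; "example.dat" is a hypothetical file at least one page
 * long): MS_SYNC becomes a synchronous flush (PGO_SYNCIO), while adding
 * MS_INVALIDATE would also free the cached pages (PGO_FREE).
 */
#if 0
#include <sys/mman.h>
#include <err.h>
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

int
main(void)
{
	int fd = open("example.dat", O_RDWR);
	char *p;

	if (fd == -1)
		err(1, "open");
	p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
	if (p == MAP_FAILED)
		err(1, "mmap");
	memset(p, 'x', 4096);
	if (msync(p, 4096, MS_SYNC) == -1)	/* wait for the write-back */
		err(1, "msync");
	munmap(p, 4096);
	close(fd);
	return 0;
}
#endif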

/*
 * sys_munmap: unmap a user's memory
 */

int
sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;
	struct vm_map *map;
	struct vm_map_entry *dead_entries;

	/*
	 * get syscall args.
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	map = &p->p_vmspace->vm_map;

	if (round_and_check(map, &addr, &size))
		return EINVAL;

	if (size == 0)
		return 0;

	vm_map_lock(map);
#if 0
	/*
	 * interesting system call semantic: make sure entire range is
	 * allocated before allowing an unmap.
	 */
	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
		vm_map_unlock(map);
		return EINVAL;
	}
#endif
	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
	vm_map_unlock(map);
	if (dead_entries != NULL)
		uvm_unmap_detach(dead_entries, 0);
	return 0;
}
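
/*
 * A consequence of the disabled check above, shown as a userland sketch
 * (illustrative, not part of this file): because the whole-range
 * allocation test is compiled out, munmap(2) succeeds even when part or
 * all of the range is already unmapped.
 */
#if 0
#include <sys/mman.h>
#include <assert.h>

int
main(void)
{
	size_t len = 4096;
	void *p = mmap(NULL, len, PROT_READ, MAP_ANON | MAP_PRIVATE, -1, 0);

	assert(p != MAP_FAILED);
	assert(munmap(p, len) == 0);
	assert(munmap(p, len) == 0);	/* a hole is not an error */
	return 0;
}
#endif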

/*
 * sys_mprotect: the mprotect system call
 */

int
sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) prot;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;
	vm_prot_t prot;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	prot = SCARG(uap, prot) & VM_PROT_ALL;

	if (round_and_check(&p->p_vmspace->vm_map, &addr, &size))
		return EINVAL;

	error = uvm_map_protect_user(l, addr, addr + size, prot);
	return error;
}
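
/*
 * Userland usage sketch (illustrative, not part of this file): drop
 * write permission from an anonymous mapping; a later store would then
 * deliver SIGSEGV.
 */
#if 0
#include <sys/mman.h>
#include <assert.h>

int
main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	assert(p != MAP_FAILED);
	p[0] = 1;
	assert(mprotect(p, len, PROT_READ) == 0);	/* now read-only */
	/* p[0] = 2; would fault here */
	munmap(p, len);
	return 0;
}
#endif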

/*
 * sys_minherit: the minherit system call
 */

int
sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(int) len;
		syscallarg(int) inherit;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;
	vm_inherit_t inherit;
	int error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	inherit = SCARG(uap, inherit);

	if (round_and_check(&p->p_vmspace->vm_map, &addr, &size))
		return EINVAL;

	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
	    inherit);
	return error;
}
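
/*
 * Userland usage sketch (illustrative, not part of this file): mark an
 * otherwise private region as shared with future children, so a store
 * in the child is visible to the parent after fork(2).
 */
#if 0
#include <sys/mman.h>
#include <sys/wait.h>
#include <assert.h>
#include <unistd.h>

int
main(void)
{
	size_t len = 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	assert(p != MAP_FAILED);
	assert(minherit(p, len, MAP_INHERIT_SHARE) == 0);
	if (fork() == 0) {
		p[0] = 42;		/* visible to the parent */
		_exit(0);
	}
	wait(NULL);
	assert(p[0] == 42);
	return 0;
}
#endif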

/*
 * sys_madvise: give advice about memory usage.
 */

/* ARGSUSED */
int
sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
   register_t *retval)
{
	/* {
		syscallarg(void *) addr;
		syscallarg(size_t) len;
		syscallarg(int) behav;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;
	int advice, error;

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);
	advice = SCARG(uap, behav);

	if (round_and_check(&p->p_vmspace->vm_map, &addr, &size))
		return EINVAL;

	switch (advice) {
	case MADV_NORMAL:
	case MADV_RANDOM:
	case MADV_SEQUENTIAL:
		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
		    advice);
		break;

	case MADV_WILLNEED:

		/*
		 * Activate all these pages, pre-faulting them in if
		 * necessary.
		 */
		error = uvm_map_willneed(&p->p_vmspace->vm_map,
		    addr, addr + size);
		break;

	case MADV_DONTNEED:

		/*
		 * Deactivate all these pages.  We don't need them
		 * any more.  We don't, however, toss the data in
		 * the pages.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_DEACTIVATE);
		break;

	case MADV_FREE:

		/*
		 * These pages contain no valid data, and may be
		 * garbage-collected.  Toss all resources, including
		 * any swap space in use.
		 */

		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
		    PGO_FREE);
		break;

	case MADV_SPACEAVAIL:

		/*
		 * XXXMRG What is this?  I think it's:
		 *
		 *	Ensure that we have allocated backing-store
		 *	for these pages.
		 *
		 * This is going to require changes to the page daemon,
		 * as it will free swap space allocated to pages in core.
		 * There's also what to do for device/file/anonymous memory.
		 */

		return EINVAL;

	default:
		return EINVAL;
	}

	return error;
}
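
/*
 * Userland usage sketch (illustrative, not part of this file):
 * MADV_SEQUENTIAL tunes read-ahead for a linear scan; MADV_FREE then
 * tells the kernel the contents are disposable, matching the PGO_FREE
 * path above.
 */
#if 0
#include <sys/mman.h>
#include <assert.h>

int
main(void)
{
	size_t len = 16 * 4096;
	char *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	assert(p != MAP_FAILED);
	assert(madvise(p, len, MADV_SEQUENTIAL) == 0);
	/* ... scan the buffer ... */
	assert(madvise(p, len, MADV_FREE) == 0);
	munmap(p, len);
	return 0;
}
#endif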

/*
 * sys_mlock: memory lock
 */

int
sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;
	int error;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	if (round_and_check(&p->p_vmspace->vm_map, &addr, &size))
		return ENOMEM;

	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
		return EAGAIN;

	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
		return EAGAIN;

	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
	    0);
	if (error == EFAULT)
		error = ENOMEM;
	return error;
}
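
/*
 * Userland usage sketch (illustrative, not part of this file): wire one
 * page, e.g. for key material; mlock(2) fails with EAGAIN when the
 * request would exceed RLIMIT_MEMLOCK or the global wired-page maximum,
 * per the checks above.
 */
#if 0
#include <sys/mman.h>
#include <err.h>
#include <unistd.h>

int
main(void)
{
	size_t len = (size_t)sysconf(_SC_PAGESIZE);
	void *p = mmap(NULL, len, PROT_READ | PROT_WRITE,
	    MAP_ANON | MAP_PRIVATE, -1, 0);

	if (p == MAP_FAILED)
		err(1, "mmap");
	if (mlock(p, len) == -1)
		err(1, "mlock");
	/* ... work on memory that can't be paged out ... */
	munlock(p, len);
	munmap(p, len);
	return 0;
}
#endif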

/*
 * sys_munlock: unlock wired pages
 */

int
sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(const void *) addr;
		syscallarg(size_t) len;
	} */
	struct proc *p = l->l_proc;
	vaddr_t addr;
	vsize_t size;

	/*
	 * extract syscall args from uap
	 */

	addr = (vaddr_t)SCARG(uap, addr);
	size = (vsize_t)SCARG(uap, len);

	if (round_and_check(&p->p_vmspace->vm_map, &addr, &size))
		return ENOMEM;

	if (uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true, 0))
		return ENOMEM;

	return 0;
}

/*
 * sys_mlockall: lock all pages mapped into an address space.
 */

int
sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
    register_t *retval)
{
	/* {
		syscallarg(int) flags;
	} */
	struct proc *p = l->l_proc;
	int error, flags;

	flags = SCARG(uap, flags);

	if (flags == 0 || (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
		return EINVAL;

	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}
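
/*
 * Userland usage sketch (illustrative, not part of this file): wire
 * everything mapped now (MCL_CURRENT) and everything mapped later
 * (MCL_FUTURE, which is what makes the VM_MAP_WIREFUTURE test in
 * uvm_mmap() below fire).
 */
#if 0
#include <sys/mman.h>
#include <err.h>

int
main(void)
{
	if (mlockall(MCL_CURRENT | MCL_FUTURE) == -1)
		err(1, "mlockall");
	/* ... latency-sensitive work ... */
	munlockall();
	return 0;
}
#endif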

/*
 * sys_munlockall: unlock all pages mapped into an address space.
 */

int
sys_munlockall(struct lwp *l, const void *v, register_t *retval)
{
	struct proc *p = l->l_proc;

	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
	return 0;
}

/*
 * uvm_mmap: internal version of mmap
 *
 * - used by sys_mmap and various framebuffers
 * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
 * - caller must page-align the file offset
 */

int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
    voff_t foff, vsize_t locklimit)
{
	vaddr_t align = 0;
	int error;
	uvm_flag_t uvmflag = 0;

	/*
	 * check params
	 */

	if (size == 0)
		return 0;
	if (foff & PAGE_MASK)
		return EINVAL;
	if ((prot & maxprot) != prot)
		return EINVAL;

	/*
	 * for non-fixed mappings, round off the suggested address.
	 * for fixed mappings, check alignment.
	 */

	if ((flags & MAP_FIXED) == 0) {
		*addr = round_page(*addr);
	} else {
		if (*addr & PAGE_MASK)
			return EINVAL;
		uvmflag |= UVM_FLAG_FIXED | UVM_FLAG_UNMAP;
	}

	/*
	 * Try to see if any requested alignment can even be attempted.
	 * Make sure we can express the alignment (asking for a >= 4GB
	 * alignment on an ILP32 architecture makes no sense) and that
	 * the alignment is at least a page-sized quantity.  If the
	 * request was for a fixed mapping, make sure the supplied
	 * address adheres to the requested alignment.
	 */
	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
	if (align) {
		if (align >= sizeof(vaddr_t) * NBBY)
			return EINVAL;
		align = 1UL << align;
		if (align < PAGE_SIZE)
			return EINVAL;
		if (align >= vm_map_max(map))
			return ENOMEM;
		if (flags & MAP_FIXED) {
			if ((*addr & (align-1)) != 0)
				return EINVAL;
			align = 0;
		}
	}

	/*
	 * check resource limits
	 */

	if (!VM_MAP_IS_KERNEL(map) &&
	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
		return ENOMEM;

	/*
	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
	 * to underlying vm object.
	 */

	if (flags & MAP_ANON) {
		KASSERT(uobj == NULL);
		foff = UVM_UNKNOWN_OFFSET;
		if ((flags & MAP_SHARED) == 0)
			/* XXX: defer amap create */
			uvmflag |= UVM_FLAG_COPYONW;
		else
			/* shared: create amap now */
			uvmflag |= UVM_FLAG_OVERLAY;

	} else {
		KASSERT(uobj != NULL);
		if ((flags & MAP_SHARED) == 0) {
			uvmflag |= UVM_FLAG_COPYONW;
		}
	}

	uvmflag = UVM_MAPFLAG(prot, maxprot,
	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, advice,
	    uvmflag);
	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
	if (error) {
		if (uobj)
			uobj->pgops->pgo_detach(uobj);
		return error;
	}

	/*
	 * POSIX 1003.1b -- if our address space was configured
	 * to lock all future mappings, wire the one we just made.
	 *
	 * Also handle the MAP_WIRED flag here.
	 */

	if (prot == VM_PROT_NONE) {

		/*
		 * No more work to do in this case.
		 */

		return 0;
	}
	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
		vm_map_lock(map);
		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
		    (locklimit != 0 &&
		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
		     locklimit)) {
			vm_map_unlock(map);
			uvm_unmap(map, *addr, *addr + size);
			return ENOMEM;
		}

		/*
		 * uvm_map_pageable() always returns the map unlocked.
		 */

		error = uvm_map_pageable(map, *addr, *addr + size,
		    false, UVM_LK_ENTER);
		if (error) {
			uvm_unmap(map, *addr, *addr + size);
			return error;
		}
		return 0;
	}
	return 0;
}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
{

	if (topdown)
		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
	else
		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
}

int
uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
    off_t off)
{
	struct uvm_object *uobj;
	int error, flags, prot;

	flags = MAP_SHARED;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

	uobj = udv_attach(dev, prot, off, len);
	if (uobj == NULL)
		return EINVAL;

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
	    (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM, uobj, off,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}

int
uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
{
	int error, flags, prot;

	flags = MAP_PRIVATE | MAP_ANON;
	prot = VM_PROT_READ | VM_PROT_WRITE;
	if (*addrp)
		flags |= MAP_FIXED;
	else
		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
		    (vaddr_t)p->p_vmspace->vm_daddr, len,
		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);

	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
	    (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL, NULL, 0,
	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
	return error;
}
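
/*
 * Kernel-side usage sketch (illustrative only; a hypothetical caller,
 * not part of this file): a driver handing the current process an
 * anonymous buffer via uvm_mmap_anon(), as a device mmap helper might.
 */
#if 0
	void *va = NULL;	/* NULL: let UVM pick a default address */
	int error;

	error = uvm_mmap_anon(curproc, &va, 4 * PAGE_SIZE);
	if (error)
		return error;
	/* "va" is now a user-space address valid in curproc's map. */
#endif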