1 /*	$NetBSD: uvm_mmap.c,v 1.154 2015/11/26 13:15:34 martin Exp $	*/
2 
3 /*
4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
5  * Copyright (c) 1991, 1993 The Regents of the University of California.
6  * Copyright (c) 1988 University of Utah.
7  *
8  * All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * the Systems Programming Group of the University of Utah Computer
12  * Science Department.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
39  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
40  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
41  */
42 
43 /*
44  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
45  * function.
46  */
47 
48 #include <sys/cdefs.h>
49 __KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.154 2015/11/26 13:15:34 martin Exp $");
50 
51 #include "opt_compat_netbsd.h"
52 #include "opt_pax.h"
53 
54 #include <sys/types.h>
55 #include <sys/file.h>
56 #include <sys/filedesc.h>
57 #include <sys/resourcevar.h>
58 #include <sys/mman.h>
59 
60 #if defined(PAX_ASLR) || defined(PAX_MPROTECT)
61 #include <sys/pax.h>
62 #endif /* PAX_ASLR || PAX_MPROTECT */
63 
64 #include <sys/syscallargs.h>
65 
66 #include <uvm/uvm.h>
67 #include <uvm/uvm_device.h>
68 
69 static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
70 		    int, int, struct uvm_object *, voff_t, vsize_t);
71 
72 static int
73 range_test(vaddr_t addr, vsize_t size, bool ismmap)
74 {
75 	vaddr_t vm_min_address = VM_MIN_ADDRESS;
76 	vaddr_t vm_max_address = VM_MAXUSER_ADDRESS;
77 	vaddr_t eaddr = addr + size;
78 	int res = 0;
79 
80 	if (addr < vm_min_address)
81 		return EINVAL;
82 	if (eaddr > vm_max_address)
83 		return ismmap ? EFBIG : EINVAL;
84 	if (addr > eaddr) /* no wrapping! */
85 		return ismmap ? EOVERFLOW : EINVAL;
86 
87 #ifdef MD_MMAP_RANGE_TEST
88 	res = MD_MMAP_RANGE_TEST(addr, eaddr);
89 #endif
90 
91 	return res;
92 }
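/*
 * Illustrative examples (not part of the original source):
 *
 *	range_test((vaddr_t)-PAGE_SIZE, 2 * PAGE_SIZE, true)
 *
 * returns EOVERFLOW, because addr + size wraps past zero, while a request
 * whose end merely runs past VM_MAXUSER_ADDRESS returns EFBIG.  The same
 * conditions yield plain EINVAL when ismmap is false (msync, munmap, ...).
 */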
93 
94 /*
95  * unimplemented VM system calls:
96  */
97 
98 /*
99  * sys_sbrk: sbrk system call.
100  */
101 
102 /* ARGSUSED */
103 int
104 sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
105 {
106 	/* {
107 		syscallarg(intptr_t) incr;
108 	} */
109 
110 	return (ENOSYS);
111 }
112 
113 /*
114  * sys_sstk: sstk system call.
115  */
116 
117 /* ARGSUSED */
118 int
119 sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
120 {
121 	/* {
122 		syscallarg(int) incr;
123 	} */
124 
125 	return (ENOSYS);
126 }
127 
128 /*
129  * sys_mincore: determine if pages are in core or not.
130  */
131 
132 /* ARGSUSED */
133 int
134 sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
135     register_t *retval)
136 {
137 	/* {
138 		syscallarg(void *) addr;
139 		syscallarg(size_t) len;
140 		syscallarg(char *) vec;
141 	} */
142 	struct proc *p = l->l_proc;
143 	struct vm_page *pg;
144 	char *vec, pgi;
145 	struct uvm_object *uobj;
146 	struct vm_amap *amap;
147 	struct vm_anon *anon;
148 	struct vm_map_entry *entry;
149 	vaddr_t start, end, lim;
150 	struct vm_map *map;
151 	vsize_t len;
152 	int error = 0, npgs;
153 
154 	map = &p->p_vmspace->vm_map;
155 
156 	start = (vaddr_t)SCARG(uap, addr);
157 	len = SCARG(uap, len);
158 	vec = SCARG(uap, vec);
159 
160 	if (start & PAGE_MASK)
161 		return (EINVAL);
162 	len = round_page(len);
163 	end = start + len;
164 	if (end <= start)
165 		return (EINVAL);
166 
167 	/*
168 	 * Lock down vec, so that storing the status byte for a page cannot
169 	 * fault and thereby invalidate status we have already returned.
170 	 */
171 
172 	npgs = len >> PAGE_SHIFT;
173 	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
174 	if (error) {
175 		return error;
176 	}
177 	vm_map_lock_read(map);
178 
179 	if (uvm_map_lookup_entry(map, start, &entry) == false) {
180 		error = ENOMEM;
181 		goto out;
182 	}
183 
184 	for (/* nothing */;
185 	     entry != &map->header && entry->start < end;
186 	     entry = entry->next) {
187 		KASSERT(!UVM_ET_ISSUBMAP(entry));
188 		KASSERT(start >= entry->start);
189 
190 		/* Make sure there are no holes. */
191 		if (entry->end < end &&
192 		     (entry->next == &map->header ||
193 		      entry->next->start > entry->end)) {
194 			error = ENOMEM;
195 			goto out;
196 		}
197 
198 		lim = end < entry->end ? end : entry->end;
199 
200 		/*
201 		 * Special case for objects with no "real" pages.  Those
202 		 * are always considered resident (mapped devices).
203 		 */
204 
205 		if (UVM_ET_ISOBJ(entry)) {
206 			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
207 			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
208 				for (/* nothing */; start < lim;
209 				     start += PAGE_SIZE, vec++)
210 					subyte(vec, 1);
211 				continue;
212 			}
213 		}
214 
215 		amap = entry->aref.ar_amap;	/* upper layer */
216 		uobj = entry->object.uvm_obj;	/* lower layer */
217 
218 		if (amap != NULL)
219 			amap_lock(amap);
220 		if (uobj != NULL)
221 			mutex_enter(uobj->vmobjlock);
222 
223 		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
224 			pgi = 0;
225 			if (amap != NULL) {
226 				/* Check the upper layer first. */
227 				anon = amap_lookup(&entry->aref,
228 				    start - entry->start);
229 				/* Don't need to lock anon here. */
230 				if (anon != NULL && anon->an_page != NULL) {
231 
232 					/*
233 					 * Anon has the page for this entry
234 					 * offset.
235 					 */
236 
237 					pgi = 1;
238 				}
239 			}
240 			if (uobj != NULL && pgi == 0) {
241 				/* Check the lower layer. */
242 				pg = uvm_pagelookup(uobj,
243 				    entry->offset + (start - entry->start));
244 				if (pg != NULL) {
245 
246 					/*
247 					 * Object has the page for this entry
248 					 * offset.
249 					 */
250 
251 					pgi = 1;
252 				}
253 			}
254 			(void) subyte(vec, pgi);
255 		}
256 		if (uobj != NULL)
257 			mutex_exit(uobj->vmobjlock);
258 		if (amap != NULL)
259 			amap_unlock(amap);
260 	}
261 
262  out:
263 	vm_map_unlock_read(map);
264 	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
265 	return (error);
266 }
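/*
 * Illustrative userland usage (not part of the original source; addr,
 * npages and pgsz are assumed to be set up by the caller): query residency,
 * one status byte per page.
 *
 *	char *vec = malloc(npages);
 *	if (mincore(addr, npages * pgsz, vec) == 0)
 *		for (size_t i = 0; i < npages; i++)
 *			printf("page %zu %s\n", i, vec[i] ? "resident" : "absent");
 *
 * addr must be page aligned (EINVAL above) and the whole range must be
 * mapped with no holes (ENOMEM above); device mappings always report 1.
 */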
267 
268 /*
269  * sys_mmap: mmap system call.
270  *
271  * => file offset and address need not be page aligned
272  *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
273  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
274  *      and the return value is adjusted up by the page offset.
275  */
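/*
 * Illustrative example (not part of the original source), assuming a 4 KiB
 * page size and an open file descriptor fd:
 *
 *	void *p = mmap(NULL, 100, PROT_READ, MAP_FILE | MAP_PRIVATE, fd, 0x1003);
 *
 * maps from file offset trunc_page(0x1003) == 0x1000 onward, and the value
 * returned to the caller is the page-aligned mapping address plus the page
 * offset 0x3, so ((char *)p)[0] is the byte at file offset 0x1003.
 */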
276 
277 int
278 sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
279 {
280 	/* {
281 		syscallarg(void *) addr;
282 		syscallarg(size_t) len;
283 		syscallarg(int) prot;
284 		syscallarg(int) flags;
285 		syscallarg(int) fd;
286 		syscallarg(long) pad;
287 		syscallarg(off_t) pos;
288 	} */
289 	struct proc *p = l->l_proc;
290 	vaddr_t addr;
291 	off_t pos;
292 	vsize_t size, pageoff, newsize;
293 	vm_prot_t prot, maxprot;
294 	int flags, fd, advice;
295 	vaddr_t defaddr;
296 	struct file *fp = NULL;
297 	struct uvm_object *uobj;
298 	int error;
299 #ifdef PAX_ASLR
300 	vaddr_t orig_addr;
301 #endif /* PAX_ASLR */
302 
303 	/*
304 	 * first, extract syscall args from the uap.
305 	 */
306 
307 	addr = (vaddr_t)SCARG(uap, addr);
308 	size = (vsize_t)SCARG(uap, len);
309 	prot = SCARG(uap, prot) & VM_PROT_ALL;
310 	flags = SCARG(uap, flags);
311 	fd = SCARG(uap, fd);
312 	pos = SCARG(uap, pos);
313 
314 #ifdef PAX_ASLR
315 	orig_addr = addr;
316 #endif /* PAX_ASLR */
317 
318 	/*
319 	 * Fixup the old deprecated MAP_COPY into MAP_PRIVATE, and
320 	 * validate the flags.
321 	 */
322 	if (flags & MAP_COPY) {
323 		flags = (flags & ~MAP_COPY) | MAP_PRIVATE;
324 #if defined(COMPAT_10) && defined(__i386__)
325 		/*
326 		 * Ancient kernels (on i386 at least) did not obey PROT_EXEC,
327 		 * and ld.so did not turn it on.  We take care of this on
328 		 * amd64 in compat32.
329 		 */
330 		prot |= PROT_EXEC;
331 #endif
332 	}
333 	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
334 		return (EINVAL);
335 
336 	/*
337 	 * align file position and save offset.  adjust size.
338 	 */
339 
340 	pageoff = (pos & PAGE_MASK);
341 	pos    -= pageoff;
342 	newsize = size + pageoff;		/* add offset */
343 	newsize = (vsize_t)round_page(newsize);	/* round up */
344 
345 	if (newsize < size)
346 		return (ENOMEM);
347 	size = newsize;
348 
349 	/*
350 	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
351 	 */
352 	if (flags & MAP_FIXED) {
353 
354 		/* ensure address and file offset are aligned properly */
355 		addr -= pageoff;
356 		if (addr & PAGE_MASK)
357 			return (EINVAL);
358 
359 		error = range_test(addr, size, true);
360 		if (error) {
361 			return error;
362 		}
363 
364 	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
365 
366 		/*
367 		 * not fixed: make sure we skip over the largest
368 		 * possible heap for non-topdown mapping arrangements.
369 		 * we will refine our guess later (e.g. to account for
370 		 * VAC, etc)
371 		 */
372 
373 		defaddr = p->p_emul->e_vm_default_addr(p,
374 		    (vaddr_t)p->p_vmspace->vm_daddr, size,
375 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
376 
377 		if (addr == 0 ||
378 		    !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
379 			addr = MAX(addr, defaddr);
380 		else
381 			addr = MIN(addr, defaddr);
382 	}
383 
384 	/*
385 	 * check for file mappings (i.e. not anonymous) and verify file.
386 	 */
387 
388 	advice = UVM_ADV_NORMAL;
389 	if ((flags & MAP_ANON) == 0) {
390 		if ((fp = fd_getfile(fd)) == NULL)
391 			return (EBADF);
392 
393 		if (fp->f_ops->fo_mmap == NULL) {
394 			error = ENODEV;
395 			goto out;
396 		}
397 		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
398 					      &advice, &uobj, &maxprot);
399 		if (error) {
400 			goto out;
401 		}
402 		if (uobj == NULL) {
403 			flags |= MAP_ANON;
404 			fd_putfile(fd);
405 			fp = NULL;
406 			goto is_anon;
407 		}
408 	} else {		/* MAP_ANON case */
409 		/*
410 		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
411 		 */
412 		if (fd != -1)
413 			return (EINVAL);
414 
415  is_anon:		/* label for SunOS style /dev/zero */
416 		uobj = NULL;
417 		maxprot = VM_PROT_ALL;
418 		pos = 0;
419 	}
420 
421 #ifdef PAX_MPROTECT
422 	pax_mprotect(l, &prot, &maxprot);
423 #endif /* PAX_MPROTECT */
424 
425 #ifdef PAX_ASLR
426 	pax_aslr_mmap(l, &addr, orig_addr, flags);
427 #endif /* PAX_ASLR */
428 
429 	/*
430 	 * now let kernel internal function uvm_mmap do the work.
431 	 */
432 
433 	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
434 	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
435 
436 	/* remember to add offset */
437 	*retval = (register_t)(addr + pageoff);
438 
439  out:
440 	if (fp != NULL)
441 		fd_putfile(fd);
442 
443 	return (error);
444 }
445 
446 /*
447  * sys___msync13: the msync system call (a front-end for flush)
448  */
449 
450 int
451 sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
452     register_t *retval)
453 {
454 	/* {
455 		syscallarg(void *) addr;
456 		syscallarg(size_t) len;
457 		syscallarg(int) flags;
458 	} */
459 	struct proc *p = l->l_proc;
460 	vaddr_t addr;
461 	vsize_t size, pageoff;
462 	struct vm_map *map;
463 	int error, rv, flags, uvmflags;
464 
465 	/*
466 	 * extract syscall args from the uap
467 	 */
468 
469 	addr = (vaddr_t)SCARG(uap, addr);
470 	size = (vsize_t)SCARG(uap, len);
471 	flags = SCARG(uap, flags);
472 
473 	/* sanity check flags */
474 	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
475 	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
476 	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
477 		return (EINVAL);
478 	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
479 		flags |= MS_SYNC;
480 
481 	/*
482 	 * align the address to a page boundary and adjust the size accordingly.
483 	 */
484 
485 	pageoff = (addr & PAGE_MASK);
486 	addr -= pageoff;
487 	size += pageoff;
488 	size = (vsize_t)round_page(size);
489 
490 	error = range_test(addr, size, false);
491 	if (error)
492 		return error;
493 
494 	/*
495 	 * get map
496 	 */
497 
498 	map = &p->p_vmspace->vm_map;
499 
500 	/*
501 	 * XXXCDC: do we really need this semantic?
502 	 *
503 	 * XXX Gak!  If size is zero we are supposed to sync "all modified
504 	 * pages with the region containing addr".  Unfortunately, we
505 	 * don't really keep track of individual mmaps so we approximate
506 	 * by flushing the range of the map entry containing addr.
507 	 * This can be incorrect if the region splits or is coalesced
508 	 * with a neighbor.
509 	 */
510 
511 	if (size == 0) {
512 		struct vm_map_entry *entry;
513 
514 		vm_map_lock_read(map);
515 		rv = uvm_map_lookup_entry(map, addr, &entry);
516 		if (rv == true) {
517 			addr = entry->start;
518 			size = entry->end - entry->start;
519 		}
520 		vm_map_unlock_read(map);
521 		if (rv == false)
522 			return (EINVAL);
523 	}
524 
525 	/*
526 	 * translate MS_ flags into PGO_ flags
527 	 */
528 
529 	uvmflags = PGO_CLEANIT;
530 	if (flags & MS_INVALIDATE)
531 		uvmflags |= PGO_FREE;
532 	if (flags & MS_SYNC)
533 		uvmflags |= PGO_SYNCIO;
534 
535 	error = uvm_map_clean(map, addr, addr+size, uvmflags);
536 	return error;
537 }
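/*
 * Illustrative userland usage (not part of the original source):
 *
 *	msync(addr, len, MS_SYNC | MS_INVALIDATE);
 *
 * becomes PGO_CLEANIT | PGO_SYNCIO | PGO_FREE above: synchronously write
 * back any modified pages in the range and then free them.
 */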
538 
539 /*
540  * sys_munmap: unmap a user's memory
541  */
542 
543 int
544 sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
545 {
546 	/* {
547 		syscallarg(void *) addr;
548 		syscallarg(size_t) len;
549 	} */
550 	struct proc *p = l->l_proc;
551 	vaddr_t addr;
552 	vsize_t size, pageoff;
553 	struct vm_map *map;
554 	struct vm_map_entry *dead_entries;
555 	int error;
556 
557 	/*
558 	 * get syscall args.
559 	 */
560 
561 	addr = (vaddr_t)SCARG(uap, addr);
562 	size = (vsize_t)SCARG(uap, len);
563 
564 	/*
565 	 * align the address to a page boundary and adjust the size accordingly.
566 	 */
567 
568 	pageoff = (addr & PAGE_MASK);
569 	addr -= pageoff;
570 	size += pageoff;
571 	size = (vsize_t)round_page(size);
572 
573 	if (size == 0)
574 		return (0);
575 
576 	error = range_test(addr, size, false);
577 	if (error)
578 		return error;
579 
580 	map = &p->p_vmspace->vm_map;
581 
582 	/*
583 	 * interesting system call semantic: make sure entire range is
584 	 * allocated before allowing an unmap.
585 	 */
586 
587 	vm_map_lock(map);
588 #if 0
589 	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
590 		vm_map_unlock(map);
591 		return (EINVAL);
592 	}
593 #endif
594 	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
595 	vm_map_unlock(map);
596 	if (dead_entries != NULL)
597 		uvm_unmap_detach(dead_entries, 0);
598 	return (0);
599 }
600 
601 /*
602  * sys_mprotect: the mprotect system call
603  */
604 
605 int
606 sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
607     register_t *retval)
608 {
609 	/* {
610 		syscallarg(void *) addr;
611 		syscallarg(size_t) len;
612 		syscallarg(int) prot;
613 	} */
614 	struct proc *p = l->l_proc;
615 	vaddr_t addr;
616 	vsize_t size, pageoff;
617 	vm_prot_t prot;
618 	int error;
619 
620 	/*
621 	 * extract syscall args from uap
622 	 */
623 
624 	addr = (vaddr_t)SCARG(uap, addr);
625 	size = (vsize_t)SCARG(uap, len);
626 	prot = SCARG(uap, prot) & VM_PROT_ALL;
627 
628 	/*
629 	 * align the address to a page boundary and adjust the size accordingly.
630 	 */
631 
632 	pageoff = (addr & PAGE_MASK);
633 	addr -= pageoff;
634 	size += pageoff;
635 	size = round_page(size);
636 
637 	error = range_test(addr, size, false);
638 	if (error)
639 		return error;
640 
641 	error = uvm_map_protect(&p->p_vmspace->vm_map, addr, addr + size, prot,
642 				false);
643 	return error;
644 }
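/*
 * Illustrative example (not part of the original source), assuming a 4 KiB
 * page size:
 *
 *	mprotect((void *)0x20000100, 0x100, PROT_READ);
 *
 * truncates the address and rounds the length up above, so the whole page
 * [0x20000000, 0x20001000) becomes read-only.
 */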
645 
646 /*
647  * sys_minherit: the minherit system call
648  */
649 
650 int
651 sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
652    register_t *retval)
653 {
654 	/* {
655 		syscallarg(void *) addr;
656 		syscallarg(int) len;
657 		syscallarg(int) inherit;
658 	} */
659 	struct proc *p = l->l_proc;
660 	vaddr_t addr;
661 	vsize_t size, pageoff;
662 	vm_inherit_t inherit;
663 	int error;
664 
665 	addr = (vaddr_t)SCARG(uap, addr);
666 	size = (vsize_t)SCARG(uap, len);
667 	inherit = SCARG(uap, inherit);
668 
669 	/*
670 	 * align the address to a page boundary and adjust the size accordingly.
671 	 */
672 
673 	pageoff = (addr & PAGE_MASK);
674 	addr -= pageoff;
675 	size += pageoff;
676 	size = (vsize_t)round_page(size);
677 
678 	error = range_test(addr, size, false);
679 	if (error)
680 		return error;
681 
682 	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
683 				inherit);
684 	return error;
685 }
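/*
 * Illustrative userland usage (not part of the original source):
 *
 *	minherit(addr, len, MAP_INHERIT_NONE);
 *
 * marks the range so that a child created by fork(2) does not inherit the
 * mapping at all (MAP_INHERIT_SHARE and MAP_INHERIT_COPY select sharing or
 * copy-on-write inheritance instead).
 */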
686 
687 /*
688  * sys_madvise: give advice about memory usage.
689  */
690 
691 /* ARGSUSED */
692 int
693 sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
694    register_t *retval)
695 {
696 	/* {
697 		syscallarg(void *) addr;
698 		syscallarg(size_t) len;
699 		syscallarg(int) behav;
700 	} */
701 	struct proc *p = l->l_proc;
702 	vaddr_t addr;
703 	vsize_t size, pageoff;
704 	int advice, error;
705 
706 	addr = (vaddr_t)SCARG(uap, addr);
707 	size = (vsize_t)SCARG(uap, len);
708 	advice = SCARG(uap, behav);
709 
710 	/*
711 	 * align the address to a page boundary, and adjust the size accordingly
712 	 */
713 
714 	pageoff = (addr & PAGE_MASK);
715 	addr -= pageoff;
716 	size += pageoff;
717 	size = (vsize_t)round_page(size);
718 
719 	error = range_test(addr, size, false);
720 	if (error)
721 		return error;
722 
723 	switch (advice) {
724 	case MADV_NORMAL:
725 	case MADV_RANDOM:
726 	case MADV_SEQUENTIAL:
727 		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
728 		    advice);
729 		break;
730 
731 	case MADV_WILLNEED:
732 
733 		/*
734 		 * Activate all these pages, pre-faulting them in if
735 		 * necessary.
736 		 */
737 		error = uvm_map_willneed(&p->p_vmspace->vm_map,
738 		    addr, addr + size);
739 		break;
740 
741 	case MADV_DONTNEED:
742 
743 		/*
744 		 * Deactivate all these pages.  We don't need them
745 		 * any more.  We don't, however, toss the data in
746 		 * the pages.
747 		 */
748 
749 		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
750 		    PGO_DEACTIVATE);
751 		break;
752 
753 	case MADV_FREE:
754 
755 		/*
756 		 * These pages contain no valid data, and may be
757 		 * garbage-collected.  Toss all resources, including
758 		 * any swap space in use.
759 		 */
760 
761 		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
762 		    PGO_FREE);
763 		break;
764 
765 	case MADV_SPACEAVAIL:
766 
767 		/*
768 		 * XXXMRG What is this?  I think it's:
769 		 *
770 		 *	Ensure that we have allocated backing-store
771 		 *	for these pages.
772 		 *
773 		 * This is going to require changes to the page daemon,
774 		 * as it will free swap space allocated to pages in core.
775 		 * There's also what to do for device/file/anonymous memory.
776 		 */
777 
778 		return (EINVAL);
779 
780 	default:
781 		return (EINVAL);
782 	}
783 
784 	return error;
785 }
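/*
 * Illustrative userland usage (not part of the original source):
 *
 *	madvise(buf, len, MADV_DONTNEED);	deactivate pages; data is kept
 *	madvise(buf, len, MADV_FREE);		discard data and any swap space
 *	madvise(buf, len, MADV_WILLNEED);	pre-fault and activate pages
 */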
786 
787 /*
788  * sys_mlock: memory lock
789  */
790 
791 int
792 sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
793 {
794 	/* {
795 		syscallarg(const void *) addr;
796 		syscallarg(size_t) len;
797 	} */
798 	struct proc *p = l->l_proc;
799 	vaddr_t addr;
800 	vsize_t size, pageoff;
801 	int error;
802 
803 	/*
804 	 * extract syscall args from uap
805 	 */
806 
807 	addr = (vaddr_t)SCARG(uap, addr);
808 	size = (vsize_t)SCARG(uap, len);
809 
810 	/*
811 	 * align the address to a page boundary and adjust the size accordingly
812 	 */
813 
814 	pageoff = (addr & PAGE_MASK);
815 	addr -= pageoff;
816 	size += pageoff;
817 	size = (vsize_t)round_page(size);
818 
819 	error = range_test(addr, size, false);
820 	if (error)
821 		return error;
822 
823 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
824 		return (EAGAIN);
825 
826 	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
827 			p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
828 		return (EAGAIN);
829 
830 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
831 	    0);
832 	if (error == EFAULT)
833 		error = ENOMEM;
834 	return error;
835 }
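/*
 * Illustrative example (not part of the original source; addr and pgsz are
 * assumed to be set up by the caller):
 *
 *	mlock(addr, pgsz);
 *
 * fails with EAGAIN above if wiring one more page would exceed either the
 * global uvmexp.wiredmax limit or the process's RLIMIT_MEMLOCK limit, and
 * with ENOMEM (mapped from EFAULT) if the range is not fully mapped.
 */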
836 
837 /*
838  * sys_munlock: unlock wired pages
839  */
840 
841 int
842 sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
843     register_t *retval)
844 {
845 	/* {
846 		syscallarg(const void *) addr;
847 		syscallarg(size_t) len;
848 	} */
849 	struct proc *p = l->l_proc;
850 	vaddr_t addr;
851 	vsize_t size, pageoff;
852 	int error;
853 
854 	/*
855 	 * extract syscall args from uap
856 	 */
857 
858 	addr = (vaddr_t)SCARG(uap, addr);
859 	size = (vsize_t)SCARG(uap, len);
860 
861 	/*
862 	 * align the address to a page boundary, and adjust the size accordingly
863 	 */
864 
865 	pageoff = (addr & PAGE_MASK);
866 	addr -= pageoff;
867 	size += pageoff;
868 	size = (vsize_t)round_page(size);
869 
870 	error = range_test(addr, size, false);
871 	if (error)
872 		return error;
873 
874 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
875 	    0);
876 	if (error == EFAULT)
877 		error = ENOMEM;
878 	return error;
879 }
880 
881 /*
882  * sys_mlockall: lock all pages mapped into an address space.
883  */
884 
885 int
886 sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
887     register_t *retval)
888 {
889 	/* {
890 		syscallarg(int) flags;
891 	} */
892 	struct proc *p = l->l_proc;
893 	int error, flags;
894 
895 	flags = SCARG(uap, flags);
896 
897 	if (flags == 0 ||
898 	    (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
899 		return (EINVAL);
900 
901 	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
902 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
903 	return (error);
904 }
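/*
 * Illustrative userland usage (not part of the original source):
 *
 *	mlockall(MCL_CURRENT | MCL_FUTURE);
 *
 * wires everything currently mapped and asks that future mappings be wired
 * as well (see the VM_MAP_WIREFUTURE handling in uvm_mmap() below), subject
 * to RLIMIT_MEMLOCK.
 */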
905 
906 /*
907  * sys_munlockall: unlock all pages mapped into an address space.
908  */
909 
910 int
911 sys_munlockall(struct lwp *l, const void *v, register_t *retval)
912 {
913 	struct proc *p = l->l_proc;
914 
915 	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
916 	return (0);
917 }
918 
919 /*
920  * uvm_mmap: internal version of mmap
921  *
922  * - used by sys_mmap and various framebuffers
923  * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
924  * - caller must page-align the file offset
925  */
926 
927 int
928 uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
929     vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
930     voff_t foff, vsize_t locklimit)
931 {
932 	vaddr_t align = 0;
933 	int error;
934 	uvm_flag_t uvmflag = 0;
935 
936 	/*
937 	 * check params
938 	 */
939 
940 	if (size == 0)
941 		return(0);
942 	if (foff & PAGE_MASK)
943 		return(EINVAL);
944 	if ((prot & maxprot) != prot)
945 		return(EINVAL);
946 
947 	/*
948 	 * for non-fixed mappings, round off the suggested address.
949 	 * for fixed mappings, check alignment and zap old mappings.
950 	 */
951 
952 	if ((flags & MAP_FIXED) == 0) {
953 		*addr = round_page(*addr);
954 	} else {
955 		if (*addr & PAGE_MASK)
956 			return(EINVAL);
957 		uvmflag |= UVM_FLAG_FIXED;
958 		(void) uvm_unmap(map, *addr, *addr + size);
959 	}
960 
961 	/*
962 	 * Try to see if any requested alignment can even be attempted.
963 	 * Make sure we can express the alignment (asking for a >= 4GB
964 	 * alignment on an ILP32 architecture makes no sense) and that the
965 	 * alignment is at least a page-sized quantity.  If the request
966 	 * was for a fixed mapping, make sure the supplied address adheres
967 	 * to the requested alignment.
968 	 */
969 	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
970 	if (align) {
971 		if (align >= sizeof(vaddr_t) * NBBY)
972 			return(EINVAL);
973 		align = 1L << align;
974 		if (align < PAGE_SIZE)
975 			return(EINVAL);
976 		if (align >= vm_map_max(map))
977 			return(ENOMEM);
978 		if (flags & MAP_FIXED) {
979 			if ((*addr & (align-1)) != 0)
980 				return(EINVAL);
981 			align = 0;
982 		}
983 	}
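	/*
	 * Illustrative example (not part of the original source): a caller
	 * passing MAP_ALIGNED(21) in flags asks for a 2 MB (1 << 21)
	 * alignment; the log2 value is extracted and validated above and
	 * the resulting byte alignment is handed to uvm_map() below.
	 */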
984 
985 	/*
986 	 * check resource limits
987 	 */
988 
989 	if (!VM_MAP_IS_KERNEL(map) &&
990 	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
991 	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
992 		return ENOMEM;
993 
994 	/*
995 	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
996 	 * to underlying vm object.
997 	 */
998 
999 	if (flags & MAP_ANON) {
1000 		KASSERT(uobj == NULL);
1001 		foff = UVM_UNKNOWN_OFFSET;
1002 		if ((flags & MAP_SHARED) == 0)
1003 			/* XXX: defer amap create */
1004 			uvmflag |= UVM_FLAG_COPYONW;
1005 		else
1006 			/* shared: create amap now */
1007 			uvmflag |= UVM_FLAG_OVERLAY;
1008 
1009 	} else {
1010 		KASSERT(uobj != NULL);
1011 		if ((flags & MAP_SHARED) == 0) {
1012 			uvmflag |= UVM_FLAG_COPYONW;
1013 		}
1014 	}
1015 
1016 	uvmflag = UVM_MAPFLAG(prot, maxprot,
1017 			(flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY,
1018 			advice, uvmflag);
1019 	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
1020 	if (error) {
1021 		if (uobj)
1022 			uobj->pgops->pgo_detach(uobj);
1023 		return error;
1024 	}
1025 
1026 	/*
1027 	 * POSIX 1003.1b -- if our address space was configured
1028 	 * to lock all future mappings, wire the one we just made.
1029 	 *
1030 	 * Also handle the MAP_WIRED flag here.
1031 	 */
1032 
1033 	if (prot == VM_PROT_NONE) {
1034 
1035 		/*
1036 		 * No more work to do in this case.
1037 		 */
1038 
1039 		return (0);
1040 	}
1041 	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
1042 		vm_map_lock(map);
1043 		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
1044 		    (locklimit != 0 &&
1045 		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
1046 		     locklimit)) {
1047 			vm_map_unlock(map);
1048 			uvm_unmap(map, *addr, *addr + size);
1049 			return ENOMEM;
1050 		}
1051 
1052 		/*
1053 		 * uvm_map_pageable() always returns the map unlocked.
1054 		 */
1055 
1056 		error = uvm_map_pageable(map, *addr, *addr + size,
1057 					 false, UVM_LK_ENTER);
1058 		if (error) {
1059 			uvm_unmap(map, *addr, *addr + size);
1060 			return error;
1061 		}
1062 		return (0);
1063 	}
1064 	return 0;
1065 }
1066 
1067 vaddr_t
1068 uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
1069 {
1070 
1071 	if (topdown)
1072 		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
1073 	else
1074 		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
1075 }
1076 
1077 int
1078 uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
1079     off_t off)
1080 {
1081 	struct uvm_object *uobj;
1082 	int error, flags, prot;
1083 
1084 	flags = MAP_SHARED;
1085 	prot = VM_PROT_READ | VM_PROT_WRITE;
1086 	if (*addrp)
1087 		flags |= MAP_FIXED;
1088 	else
1089 		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
1090 		    (vaddr_t)p->p_vmspace->vm_daddr, len,
1091 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1092 
1093 	uobj = udv_attach(dev, prot, off, len);
1094 	if (uobj == NULL)
1095 		return EINVAL;
1096 
1097 	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
1098 			 (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM,
1099 			 uobj, off, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1100 	return error;
1101 }
1102 
1103 int
1104 uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
1105 {
1106 	int error, flags, prot;
1107 
1108 	flags = MAP_PRIVATE | MAP_ANON;
1109 	prot = VM_PROT_READ | VM_PROT_WRITE;
1110 	if (*addrp)
1111 		flags |= MAP_FIXED;
1112 	else
1113 		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
1114 		    (vaddr_t)p->p_vmspace->vm_daddr, len,
1115 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1116 
1117 	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
1118 			 (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL,
1119 			 NULL, 0, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1120 	return error;
1121 }
1122