1 /*	$NetBSD: uvm_mmap.c,v 1.167 2017/10/27 12:01:08 utkarsh009 Exp $	*/
2 
3 /*
4  * Copyright (c) 1997 Charles D. Cranor and Washington University.
5  * Copyright (c) 1991, 1993 The Regents of the University of California.
6  * Copyright (c) 1988 University of Utah.
7  *
8  * All rights reserved.
9  *
10  * This code is derived from software contributed to Berkeley by
11  * the Systems Programming Group of the University of Utah Computer
12  * Science Department.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  * 3. Neither the name of the University nor the names of its contributors
23  *    may be used to endorse or promote products derived from this software
24  *    without specific prior written permission.
25  *
26  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
27  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
28  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
29  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
30  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
31  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
32  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
33  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
34  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
35  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
36  * SUCH DAMAGE.
37  *
38  * from: Utah $Hdr: vm_mmap.c 1.6 91/10/21$
39  *      @(#)vm_mmap.c   8.5 (Berkeley) 5/19/94
40  * from: Id: uvm_mmap.c,v 1.1.2.14 1998/01/05 21:04:26 chuck Exp
41  */
42 
43 /*
44  * uvm_mmap.c: system call interface into VM system, plus kernel vm_mmap
45  * function.
46  */
47 
48 #include <sys/cdefs.h>
49 __KERNEL_RCSID(0, "$NetBSD: uvm_mmap.c,v 1.167 2017/10/27 12:01:08 utkarsh009 Exp $");
50 
51 #include "opt_compat_netbsd.h"
52 #include "opt_pax.h"
53 
54 #include <sys/types.h>
55 #include <sys/file.h>
56 #include <sys/filedesc.h>
57 #include <sys/resourcevar.h>
58 #include <sys/mman.h>
59 #include <sys/pax.h>
60 
61 #include <sys/syscallargs.h>
62 
63 #include <uvm/uvm.h>
64 #include <uvm/uvm_device.h>
65 
66 static int uvm_mmap(struct vm_map *, vaddr_t *, vsize_t, vm_prot_t, vm_prot_t,
67     int, int, struct uvm_object *, voff_t, vsize_t);
68 
69 static int
70 range_test(struct vm_map *map, vaddr_t addr, vsize_t size, bool ismmap)
71 {
72 	vaddr_t vm_min_address = vm_map_min(map);
73 	vaddr_t vm_max_address = vm_map_max(map);
74 	vaddr_t eaddr = addr + size;
75 	int res = 0;
76 
77 	if (addr < vm_min_address)
78 		return EINVAL;
79 	if (eaddr > vm_max_address)
80 		return ismmap ? EFBIG : EINVAL;
81 	if (addr > eaddr) /* no wrapping! */
82 		return ismmap ? EOVERFLOW : EINVAL;
83 
84 #ifdef MD_MMAP_RANGE_TEST
85 	res = MD_MMAP_RANGE_TEST(addr, eaddr);
86 #endif
87 
88 	return res;
89 }
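
/*
 * Illustrative note (not part of the original source): the "no wrapping"
 * test above catches arithmetic overflow at the top of the address space.
 * For example, with a 32-bit vaddr_t, addr = 0xfffff000 and size = 0x2000
 * give eaddr = 0x1000, so addr > eaddr and mmap(2) fails with EOVERFLOW
 * while the other callers see EINVAL.
 */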
90 
91 /*
92  * unimplemented VM system calls:
93  */
94 
95 /*
96  * sys_sbrk: sbrk system call.
97  */
98 
99 /* ARGSUSED */
100 int
101 sys_sbrk(struct lwp *l, const struct sys_sbrk_args *uap, register_t *retval)
102 {
103 	/* {
104 		syscallarg(intptr_t) incr;
105 	} */
106 
107 	return ENOSYS;
108 }
109 
110 /*
111  * sys_sstk: sstk system call.
112  */
113 
114 /* ARGSUSED */
115 int
116 sys_sstk(struct lwp *l, const struct sys_sstk_args *uap, register_t *retval)
117 {
118 	/* {
119 		syscallarg(int) incr;
120 	} */
121 
122 	return ENOSYS;
123 }
124 
125 /*
126  * sys_mincore: determine if pages are in core or not.
127  */
128 
129 /* ARGSUSED */
130 int
131 sys_mincore(struct lwp *l, const struct sys_mincore_args *uap,
132     register_t *retval)
133 {
134 	/* {
135 		syscallarg(void *) addr;
136 		syscallarg(size_t) len;
137 		syscallarg(char *) vec;
138 	} */
139 	struct proc *p = l->l_proc;
140 	struct vm_page *pg;
141 	char *vec, pgi;
142 	struct uvm_object *uobj;
143 	struct vm_amap *amap;
144 	struct vm_anon *anon;
145 	struct vm_map_entry *entry;
146 	vaddr_t start, end, lim;
147 	struct vm_map *map;
148 	vsize_t len;
149 	int error = 0, npgs;
150 
151 	map = &p->p_vmspace->vm_map;
152 
153 	start = (vaddr_t)SCARG(uap, addr);
154 	len = SCARG(uap, len);
155 	vec = SCARG(uap, vec);
156 
157 	if (start & PAGE_MASK)
158 		return EINVAL;
159 	len = round_page(len);
160 	end = start + len;
161 	if (end <= start)
162 		return EINVAL;
163 
164 	/*
165 	 * Lock down vec, so our returned status isn't outdated by
166 	 * storing the status byte for a page.
167 	 */
168 
169 	npgs = len >> PAGE_SHIFT;
170 	error = uvm_vslock(p->p_vmspace, vec, npgs, VM_PROT_WRITE);
171 	if (error) {
172 		return error;
173 	}
174 	vm_map_lock_read(map);
175 
176 	if (uvm_map_lookup_entry(map, start, &entry) == false) {
177 		error = ENOMEM;
178 		goto out;
179 	}
180 
181 	for (/* nothing */;
182 	     entry != &map->header && entry->start < end;
183 	     entry = entry->next) {
184 		KASSERT(!UVM_ET_ISSUBMAP(entry));
185 		KASSERT(start >= entry->start);
186 
187 		/* Make sure there are no holes. */
188 		if (entry->end < end &&
189 		     (entry->next == &map->header ||
190 		      entry->next->start > entry->end)) {
191 			error = ENOMEM;
192 			goto out;
193 		}
194 
195 		lim = end < entry->end ? end : entry->end;
196 
197 		/*
198 		 * Special case for objects with no "real" pages.  Those
199 		 * are always considered resident (mapped devices).
200 		 */
201 
202 		if (UVM_ET_ISOBJ(entry)) {
203 			KASSERT(!UVM_OBJ_IS_KERN_OBJECT(entry->object.uvm_obj));
204 			if (UVM_OBJ_IS_DEVICE(entry->object.uvm_obj)) {
205 				for (/* nothing */; start < lim;
206 				     start += PAGE_SIZE, vec++)
207 					subyte(vec, 1);
208 				continue;
209 			}
210 		}
211 
212 		amap = entry->aref.ar_amap;	/* upper layer */
213 		uobj = entry->object.uvm_obj;	/* lower layer */
214 
215 		if (amap != NULL)
216 			amap_lock(amap);
217 		if (uobj != NULL)
218 			mutex_enter(uobj->vmobjlock);
219 
220 		for (/* nothing */; start < lim; start += PAGE_SIZE, vec++) {
221 			pgi = 0;
222 			if (amap != NULL) {
223 				/* Check the upper layer first. */
224 				anon = amap_lookup(&entry->aref,
225 				    start - entry->start);
226 				/* Don't need to lock anon here. */
227 				if (anon != NULL && anon->an_page != NULL) {
228 
229 					/*
230 					 * Anon has the page for this entry
231 					 * offset.
232 					 */
233 
234 					pgi = 1;
235 				}
236 			}
237 			if (uobj != NULL && pgi == 0) {
238 				/* Check the lower layer. */
239 				pg = uvm_pagelookup(uobj,
240 				    entry->offset + (start - entry->start));
241 				if (pg != NULL) {
242 
243 					/*
244 					 * Object has the page for this entry
245 					 * offset.
246 					 */
247 
248 					pgi = 1;
249 				}
250 			}
251 			(void) subyte(vec, pgi);
252 		}
253 		if (uobj != NULL)
254 			mutex_exit(uobj->vmobjlock);
255 		if (amap != NULL)
256 			amap_unlock(amap);
257 	}
258 
259  out:
260 	vm_map_unlock_read(map);
261 	uvm_vsunlock(p->p_vmspace, SCARG(uap, vec), npgs);
262 	return error;
263 }
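
/*
 * Usage sketch (illustrative only, assuming a userland caller): mincore(2)
 * fills one status byte per page, non-zero meaning the page is resident.
 *
 *	size_t pg = sysconf(_SC_PAGESIZE);
 *	char vec[4];
 *	void *p = mmap(NULL, 4 * pg, PROT_READ | PROT_WRITE,
 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
 *	if (mincore(p, 4 * pg, vec) == 0 && vec[0] != 0)
 *		printf("first page is in core\n");
 */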
264 
265 /*
266  * sys_mmap: mmap system call.
267  *
268  * => file offset and address may not be page aligned
269  *    - if MAP_FIXED, offset and address must have the same remainder mod PAGE_SIZE
270  *    - if address isn't page aligned the mapping starts at trunc_page(addr)
271  *      and the return value is adjusted up by the page offset.
272  */
273 
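/*
 * Illustrative sketch (not part of the original source): because the file
 * offset need not be page aligned, a request such as
 *
 *	void *p = mmap(NULL, 100, PROT_READ, MAP_PRIVATE, fd, 10);
 *
 * (fd assumed to refer to a regular file) maps from trunc_page(10) == 0
 * and the returned pointer is bumped by the page offset, so p addresses
 * file offset 10 directly.
 */
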
274 int
275 sys_mmap(struct lwp *l, const struct sys_mmap_args *uap, register_t *retval)
276 {
277 	/* {
278 		syscallarg(void *) addr;
279 		syscallarg(size_t) len;
280 		syscallarg(int) prot;
281 		syscallarg(int) flags;
282 		syscallarg(int) fd;
283 		syscallarg(long) pad;
284 		syscallarg(off_t) pos;
285 	} */
286 	struct proc *p = l->l_proc;
287 	vaddr_t addr;
288 	off_t pos;
289 	vsize_t size, pageoff, newsize;
290 	vm_prot_t prot, maxprot, extraprot;
291 	int flags, fd, advice;
292 	vaddr_t defaddr;
293 	struct file *fp = NULL;
294 	struct uvm_object *uobj;
295 	int error;
296 #ifdef PAX_ASLR
297 	vaddr_t orig_addr;
298 #endif /* PAX_ASLR */
299 
300 	/*
301 	 * first, extract syscall args from the uap.
302 	 */
303 
304 	addr = (vaddr_t)SCARG(uap, addr);
305 	size = (vsize_t)SCARG(uap, len);
306 	prot = SCARG(uap, prot) & VM_PROT_ALL;
307 	extraprot = PROT_MPROTECT_EXTRACT(SCARG(uap, prot));
308 	flags = SCARG(uap, flags);
309 	fd = SCARG(uap, fd);
310 	pos = SCARG(uap, pos);
311 
312 #ifdef PAX_ASLR
313 	orig_addr = addr;
314 #endif /* PAX_ASLR */
315 
316 	if ((flags & (MAP_SHARED|MAP_PRIVATE)) == (MAP_SHARED|MAP_PRIVATE))
317 		return EINVAL;
318 
319 	/*
320 	 * align file position and save offset.  adjust size.
321 	 */
322 
323 	pageoff = (pos & PAGE_MASK);
324 	pos    -= pageoff;
325 	newsize = size + pageoff;		/* add offset */
326 	newsize = (vsize_t)round_page(newsize);	/* round up */
327 
328 	if (newsize < size)
329 		return ENOMEM;
330 	size = newsize;
331 
332 	/*
333 	 * now check (MAP_FIXED) or get (!MAP_FIXED) the "addr"
334 	 */
335 	if (flags & MAP_FIXED) {
336 		/* ensure address and file offset are aligned properly */
337 		addr -= pageoff;
338 		if (addr & PAGE_MASK)
339 			return EINVAL;
340 
341 		error = range_test(&p->p_vmspace->vm_map, addr, size, true);
342 		if (error) {
343 			return error;
344 		}
345 	} else if (addr == 0 || !(flags & MAP_TRYFIXED)) {
346 		/*
347 		 * not fixed: make sure we skip over the largest
348 		 * possible heap for non-topdown mapping arrangements.
349 		 * we will refine our guess later (e.g. to account for
350 		 * VAC, etc)
351 		 */
352 
353 		defaddr = p->p_emul->e_vm_default_addr(p,
354 		    (vaddr_t)p->p_vmspace->vm_daddr, size,
355 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
356 
357 		if (addr == 0 || !(p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN))
358 			addr = MAX(addr, defaddr);
359 		else
360 			addr = MIN(addr, defaddr);
361 	}
362 
363 	/*
364 	 * check for file mappings (i.e. not anonymous) and verify file.
365 	 */
366 
367 	advice = UVM_ADV_NORMAL;
368 	if ((flags & MAP_ANON) == 0) {
369 		if ((fp = fd_getfile(fd)) == NULL)
370 			return EBADF;
371 
372 		if (fp->f_ops->fo_mmap == NULL) {
373 			error = ENODEV;
374 			goto out;
375 		}
376 		error = (*fp->f_ops->fo_mmap)(fp, &pos, size, prot, &flags,
377 		    &advice, &uobj, &maxprot);
378 		if (error) {
379 			goto out;
380 		}
381 		if (uobj == NULL) {
382 			flags |= MAP_ANON;
383 			fd_putfile(fd);
384 			fp = NULL;
385 			goto is_anon;
386 		}
387 	} else {		/* MAP_ANON case */
388 		/*
389 		 * XXX What do we do about (MAP_SHARED|MAP_PRIVATE) == 0?
390 		 */
391 		if (fd != -1)
392 			return EINVAL;
393 
394  is_anon:		/* label for SunOS style /dev/zero */
395 		uobj = NULL;
396 		maxprot = VM_PROT_ALL;
397 		pos = 0;
398 	}
399 
400 	maxprot = PAX_MPROTECT_MAXPROTECT(l, prot, extraprot, maxprot);
401 	if (((prot | extraprot) & maxprot) != (prot | extraprot)) {
402 		error = EACCES;
403 		goto out;
404 	}
405 	if ((error = PAX_MPROTECT_VALIDATE(l, prot)))
406 		goto out;
407 
408 	pax_aslr_mmap(l, &addr, orig_addr, flags);
409 
410 	/*
411 	 * now let kernel internal function uvm_mmap do the work.
412 	 */
413 
414 	error = uvm_mmap(&p->p_vmspace->vm_map, &addr, size, prot, maxprot,
415 	    flags, advice, uobj, pos, p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
416 
417 	/* remember to add offset */
418 	*retval = (register_t)(addr + pageoff);
419 
420  out:
421 	if (fp != NULL)
422 		fd_putfile(fd);
423 
424 	return error;
425 }
426 
427 /*
428  * sys___msync13: the msync system call (a front-end for flush)
429  */
430 
431 int
432 sys___msync13(struct lwp *l, const struct sys___msync13_args *uap,
433     register_t *retval)
434 {
435 	/* {
436 		syscallarg(void *) addr;
437 		syscallarg(size_t) len;
438 		syscallarg(int) flags;
439 	} */
440 	struct proc *p = l->l_proc;
441 	vaddr_t addr;
442 	vsize_t size, pageoff;
443 	struct vm_map *map;
444 	int error, flags, uvmflags;
445 	bool rv;
446 
447 	/*
448 	 * extract syscall args from the uap
449 	 */
450 
451 	addr = (vaddr_t)SCARG(uap, addr);
452 	size = (vsize_t)SCARG(uap, len);
453 	flags = SCARG(uap, flags);
454 
455 	/* sanity check flags */
456 	if ((flags & ~(MS_ASYNC | MS_SYNC | MS_INVALIDATE)) != 0 ||
457 	    (flags & (MS_ASYNC | MS_SYNC | MS_INVALIDATE)) == 0 ||
458 	    (flags & (MS_ASYNC | MS_SYNC)) == (MS_ASYNC | MS_SYNC))
459 		return EINVAL;
460 	if ((flags & (MS_ASYNC | MS_SYNC)) == 0)
461 		flags |= MS_SYNC;
462 
463 	/*
464 	 * align the address to a page boundary and adjust the size accordingly.
465 	 */
466 
467 	pageoff = (addr & PAGE_MASK);
468 	addr -= pageoff;
469 	size += pageoff;
470 	size = (vsize_t)round_page(size);
471 
472 
473 	/*
474 	 * get map
475 	 */
476 	map = &p->p_vmspace->vm_map;
477 
478 	error = range_test(map, addr, size, false);
479 	if (error)
480 		return ENOMEM;
481 
482 	/*
483 	 * XXXCDC: do we really need this semantic?
484 	 *
485 	 * XXX Gak!  If size is zero we are supposed to sync "all modified
486 	 * pages with the region containing addr".  Unfortunately, we
487 	 * don't really keep track of individual mmaps so we approximate
488 	 * by flushing the range of the map entry containing addr.
489 	 * This can be incorrect if the region splits or is coalesced
490 	 * with a neighbor.
491 	 */
492 
493 	if (size == 0) {
494 		struct vm_map_entry *entry;
495 
496 		vm_map_lock_read(map);
497 		rv = uvm_map_lookup_entry(map, addr, &entry);
498 		if (rv == true) {
499 			addr = entry->start;
500 			size = entry->end - entry->start;
501 		}
502 		vm_map_unlock_read(map);
503 		if (rv == false)
504 			return EINVAL;
505 	}
506 
507 	/*
508 	 * translate MS_ flags into PGO_ flags
509 	 */
510 
511 	uvmflags = PGO_CLEANIT;
512 	if (flags & MS_INVALIDATE)
513 		uvmflags |= PGO_FREE;
514 	if (flags & MS_SYNC)
515 		uvmflags |= PGO_SYNCIO;
516 
517 	error = uvm_map_clean(map, addr, addr+size, uvmflags);
518 	return error;
519 }
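
/*
 * Usage note (illustrative, not part of the original source): a zero
 * length means "sync the map entry containing addr", as approximated
 * above, so
 *
 *	msync(p, 0, MS_SYNC);
 *
 * flushes the whole entry that p falls in, which may be larger or smaller
 * than the originally mmap()ed region if entries have been split or
 * coalesced.
 */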
520 
521 /*
522  * sys_munmap: unmap a user's memory
523  */
524 
525 int
526 sys_munmap(struct lwp *l, const struct sys_munmap_args *uap, register_t *retval)
527 {
528 	/* {
529 		syscallarg(void *) addr;
530 		syscallarg(size_t) len;
531 	} */
532 	struct proc *p = l->l_proc;
533 	vaddr_t addr;
534 	vsize_t size, pageoff;
535 	struct vm_map *map;
536 	struct vm_map_entry *dead_entries;
537 	int error;
538 
539 	/*
540 	 * get syscall args.
541 	 */
542 
543 	addr = (vaddr_t)SCARG(uap, addr);
544 	size = (vsize_t)SCARG(uap, len);
545 
546 	/*
547 	 * align the address to a page boundary and adjust the size accordingly.
548 	 */
549 
550 	pageoff = (addr & PAGE_MASK);
551 	addr -= pageoff;
552 	size += pageoff;
553 	size = (vsize_t)round_page(size);
554 
555 	if (size == 0)
556 		return 0;
557 
558 	map = &p->p_vmspace->vm_map;
559 
560 	error = range_test(map, addr, size, false);
561 	if (error)
562 		return EINVAL;
563 
564 	vm_map_lock(map);
565 #if 0
566 	/*
567 	 * interesting system call semantic: make sure entire range is
568 	 * allocated before allowing an unmap.
569 	 */
570 	if (!uvm_map_checkprot(map, addr, addr + size, VM_PROT_NONE)) {
571 		vm_map_unlock(map);
572 		return EINVAL;
573 	}
574 #endif
575 	uvm_unmap_remove(map, addr, addr + size, &dead_entries, 0);
576 	vm_map_unlock(map);
577 	if (dead_entries != NULL)
578 		uvm_unmap_detach(dead_entries, 0);
579 	return 0;
580 }
581 
582 /*
583  * sys_mprotect: the mprotect system call
584  */
585 
586 int
587 sys_mprotect(struct lwp *l, const struct sys_mprotect_args *uap,
588     register_t *retval)
589 {
590 	/* {
591 		syscallarg(void *) addr;
592 		syscallarg(size_t) len;
593 		syscallarg(int) prot;
594 	} */
595 	struct proc *p = l->l_proc;
596 	vaddr_t addr;
597 	vsize_t size, pageoff;
598 	vm_prot_t prot;
599 	int error;
600 
601 	/*
602 	 * extract syscall args from uap
603 	 */
604 
605 	addr = (vaddr_t)SCARG(uap, addr);
606 	size = (vsize_t)SCARG(uap, len);
607 	prot = SCARG(uap, prot) & VM_PROT_ALL;
608 
609 	/*
610 	 * align the address to a page boundary and adjust the size accordingly.
611 	 */
612 
613 	pageoff = (addr & PAGE_MASK);
614 	addr -= pageoff;
615 	size += pageoff;
616 	size = round_page(size);
617 
618 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
619 	if (error)
620 		return EINVAL;
621 
622 	error = uvm_map_protect_user(l, addr, addr + size, prot);
623 	return error;
624 }
625 
626 /*
627  * sys_minherit: the minherit system call
628  */
629 
630 int
631 sys_minherit(struct lwp *l, const struct sys_minherit_args *uap,
632    register_t *retval)
633 {
634 	/* {
635 		syscallarg(void *) addr;
636 		syscallarg(int) len;
637 		syscallarg(int) inherit;
638 	} */
639 	struct proc *p = l->l_proc;
640 	vaddr_t addr;
641 	vsize_t size, pageoff;
642 	vm_inherit_t inherit;
643 	int error;
644 
645 	addr = (vaddr_t)SCARG(uap, addr);
646 	size = (vsize_t)SCARG(uap, len);
647 	inherit = SCARG(uap, inherit);
648 
649 	/*
650 	 * align the address to a page boundary and adjust the size accordingly.
651 	 */
652 
653 	pageoff = (addr & PAGE_MASK);
654 	addr -= pageoff;
655 	size += pageoff;
656 	size = (vsize_t)round_page(size);
657 
658 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
659 	if (error)
660 		return EINVAL;
661 
662 	error = uvm_map_inherit(&p->p_vmspace->vm_map, addr, addr + size,
663 	    inherit);
664 	return error;
665 }
666 
667 /*
668  * sys_madvise: give advice about memory usage.
669  */
670 
671 /* ARGSUSED */
672 int
673 sys_madvise(struct lwp *l, const struct sys_madvise_args *uap,
674    register_t *retval)
675 {
676 	/* {
677 		syscallarg(void *) addr;
678 		syscallarg(size_t) len;
679 		syscallarg(int) behav;
680 	} */
681 	struct proc *p = l->l_proc;
682 	vaddr_t addr;
683 	vsize_t size, pageoff;
684 	int advice, error;
685 
686 	addr = (vaddr_t)SCARG(uap, addr);
687 	size = (vsize_t)SCARG(uap, len);
688 	advice = SCARG(uap, behav);
689 
690 	/*
691 	 * align the address to a page boundary, and adjust the size accordingly
692 	 */
693 
694 	pageoff = (addr & PAGE_MASK);
695 	addr -= pageoff;
696 	size += pageoff;
697 	size = (vsize_t)round_page(size);
698 
699 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
700 	if (error)
701 		return EINVAL;
702 
703 	switch (advice) {
704 	case MADV_NORMAL:
705 	case MADV_RANDOM:
706 	case MADV_SEQUENTIAL:
707 		error = uvm_map_advice(&p->p_vmspace->vm_map, addr, addr + size,
708 		    advice);
709 		break;
710 
711 	case MADV_WILLNEED:
712 
713 		/*
714 		 * Activate all these pages, pre-faulting them in if
715 		 * necessary.
716 		 */
717 		error = uvm_map_willneed(&p->p_vmspace->vm_map,
718 		    addr, addr + size);
719 		break;
720 
721 	case MADV_DONTNEED:
722 
723 		/*
724 		 * Deactivate all these pages.  We don't need them
725 		 * any more.  We don't, however, toss the data in
726 		 * the pages.
727 		 */
728 
729 		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
730 		    PGO_DEACTIVATE);
731 		break;
732 
733 	case MADV_FREE:
734 
735 		/*
736 		 * These pages contain no valid data, and may be
737 		 * garbage-collected.  Toss all resources, including
738 		 * any swap space in use.
739 		 */
740 
741 		error = uvm_map_clean(&p->p_vmspace->vm_map, addr, addr + size,
742 		    PGO_FREE);
743 		break;
744 
745 	case MADV_SPACEAVAIL:
746 
747 		/*
748 		 * XXXMRG What is this?  I think it's:
749 		 *
750 		 *	Ensure that we have allocated backing-store
751 		 *	for these pages.
752 		 *
753 		 * This is going to require changes to the page daemon,
754 		 * as it will free swap space allocated to pages in core.
755 		 * There's also what to do for device/file/anonymous memory.
756 		 */
757 
758 		return EINVAL;
759 
760 	default:
761 		return EINVAL;
762 	}
763 
764 	return error;
765 }
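
/*
 * Usage sketch (illustrative, not part of the original source): a caller
 * that has finished with a scratch region can let the kernel reclaim it,
 * contents and swap included, with
 *
 *	(void)madvise(p, len, MADV_FREE);
 *
 * whereas MADV_DONTNEED only deactivates the pages and keeps their data.
 */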
766 
767 /*
768  * sys_mlock: memory lock
769  */
770 
771 int
772 sys_mlock(struct lwp *l, const struct sys_mlock_args *uap, register_t *retval)
773 {
774 	/* {
775 		syscallarg(const void *) addr;
776 		syscallarg(size_t) len;
777 	} */
778 	struct proc *p = l->l_proc;
779 	vaddr_t addr;
780 	vsize_t size, pageoff;
781 	int error;
782 
783 	/*
784 	 * extract syscall args from uap
785 	 */
786 
787 	addr = (vaddr_t)SCARG(uap, addr);
788 	size = (vsize_t)SCARG(uap, len);
789 
790 	/*
791 	 * align the address to a page boundary and adjust the size accordingly
792 	 */
793 
794 	pageoff = (addr & PAGE_MASK);
795 	addr -= pageoff;
796 	size += pageoff;
797 	size = (vsize_t)round_page(size);
798 
799 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
800 	if (error)
801 		return ENOMEM;
802 
803 	if (atop(size) + uvmexp.wired > uvmexp.wiredmax)
804 		return EAGAIN;
805 
806 	if (size + ptoa(pmap_wired_count(vm_map_pmap(&p->p_vmspace->vm_map))) >
807 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur)
808 		return EAGAIN;
809 
810 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, false,
811 	    0);
812 	if (error == EFAULT)
813 		error = ENOMEM;
814 	return error;
815 }
816 
817 /*
818  * sys_munlock: unlock wired pages
819  */
820 
821 int
822 sys_munlock(struct lwp *l, const struct sys_munlock_args *uap,
823     register_t *retval)
824 {
825 	/* {
826 		syscallarg(const void *) addr;
827 		syscallarg(size_t) len;
828 	} */
829 	struct proc *p = l->l_proc;
830 	vaddr_t addr;
831 	vsize_t size, pageoff;
832 	int error;
833 
834 	/*
835 	 * extract syscall args from uap
836 	 */
837 
838 	addr = (vaddr_t)SCARG(uap, addr);
839 	size = (vsize_t)SCARG(uap, len);
840 
841 	/*
842 	 * align the address to a page boundary, and adjust the size accordingly
843 	 */
844 
845 	pageoff = (addr & PAGE_MASK);
846 	addr -= pageoff;
847 	size += pageoff;
848 	size = (vsize_t)round_page(size);
849 
850 	error = range_test(&p->p_vmspace->vm_map, addr, size, false);
851 	if (error)
852 		return ENOMEM;
853 
854 	error = uvm_map_pageable(&p->p_vmspace->vm_map, addr, addr+size, true,
855 	    0);
856 	if (error)
857 		return ENOMEM;
858 
859 	return 0;
860 }
861 
862 /*
863  * sys_mlockall: lock all pages mapped into an address space.
864  */
865 
866 int
867 sys_mlockall(struct lwp *l, const struct sys_mlockall_args *uap,
868     register_t *retval)
869 {
870 	/* {
871 		syscallarg(int) flags;
872 	} */
873 	struct proc *p = l->l_proc;
874 	int error, flags;
875 
876 	flags = SCARG(uap, flags);
877 
878 	if (flags == 0 || (flags & ~(MCL_CURRENT|MCL_FUTURE)) != 0)
879 		return EINVAL;
880 
881 	error = uvm_map_pageable_all(&p->p_vmspace->vm_map, flags,
882 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
883 	return error;
884 }
885 
886 /*
887  * sys_munlockall: unlock all pages mapped into an address space.
888  */
889 
890 int
891 sys_munlockall(struct lwp *l, const void *v, register_t *retval)
892 {
893 	struct proc *p = l->l_proc;
894 
895 	(void) uvm_map_pageable_all(&p->p_vmspace->vm_map, 0, 0);
896 	return 0;
897 }
898 
899 /*
900  * uvm_mmap: internal version of mmap
901  *
902  * - used by sys_mmap and various framebuffers
903  * - uobj is a struct uvm_object pointer or NULL for MAP_ANON
904  * - caller must page-align the file offset
905  */
906 
907 int
908 uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
909     vm_prot_t maxprot, int flags, int advice, struct uvm_object *uobj,
910     voff_t foff, vsize_t locklimit)
911 {
912 	vaddr_t align = 0;
913 	int error;
914 	uvm_flag_t uvmflag = 0;
915 
916 	/*
917 	 * check params
918 	 */
919 
920 	if (size == 0)
921 		return 0;
922 	if (foff & PAGE_MASK)
923 		return EINVAL;
924 	if ((prot & maxprot) != prot)
925 		return EINVAL;
926 
927 	/*
928 	 * for non-fixed mappings, round off the suggested address.
929 	 * for fixed mappings, check alignment.
930 	 */
931 
932 	if ((flags & MAP_FIXED) == 0) {
933 		*addr = round_page(*addr);
934 	} else {
935 		if (*addr & PAGE_MASK)
936 			return EINVAL;
937 		uvmflag |= UVM_FLAG_FIXED | UVM_FLAG_UNMAP;
938 	}
939 
940 	/*
941 	 * Try to see if any requested alignment can even be attempted.
942 	 * Make sure we can express the alignment (asking for a >= 4GB
943 	 * alignment on an ILP32 architecture makes no sense) and that the
944 	 * alignment is at least a page-sized quantity.  If the request
945 	 * was for a fixed mapping, make sure the supplied address adheres
946 	 * to the requested alignment.
947 	 */
948 	align = (flags & MAP_ALIGNMENT_MASK) >> MAP_ALIGNMENT_SHIFT;
949 	if (align) {
950 		if (align >= sizeof(vaddr_t) * NBBY)
951 			return EINVAL;
952 		align = 1L << align;
953 		if (align < PAGE_SIZE)
954 			return EINVAL;
955 		if (align >= vm_map_max(map))
956 			return ENOMEM;
957 		if (flags & MAP_FIXED) {
958 			if ((*addr & (align-1)) != 0)
959 				return EINVAL;
960 			align = 0;
961 		}
962 	}
963 
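	/*
	 * Illustrative sketch (not part of the original source): the exponent
	 * decoded above is normally supplied with the MAP_ALIGNED() encoding
	 * from <sys/mman.h>, so a caller wanting a 2 MB aligned anonymous
	 * mapping would issue something like
	 *
	 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_PRIVATE | MAP_ALIGNED(21), -1, 0);
	 *
	 * which reaches this point with align == 1L << 21.
	 */
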
964 	/*
965 	 * check resource limits
966 	 */
967 
968 	if (!VM_MAP_IS_KERNEL(map) &&
969 	    (((rlim_t)curproc->p_vmspace->vm_map.size + (rlim_t)size) >
970 	    curproc->p_rlimit[RLIMIT_AS].rlim_cur))
971 		return ENOMEM;
972 
973 	/*
974 	 * handle anon vs. non-anon mappings.   for non-anon mappings attach
975 	 * to underlying vm object.
976 	 */
977 
978 	if (flags & MAP_ANON) {
979 		KASSERT(uobj == NULL);
980 		foff = UVM_UNKNOWN_OFFSET;
981 		if ((flags & MAP_SHARED) == 0)
982 			/* XXX: defer amap create */
983 			uvmflag |= UVM_FLAG_COPYONW;
984 		else
985 			/* shared: create amap now */
986 			uvmflag |= UVM_FLAG_OVERLAY;
987 
988 	} else {
989 		KASSERT(uobj != NULL);
990 		if ((flags & MAP_SHARED) == 0) {
991 			uvmflag |= UVM_FLAG_COPYONW;
992 		}
993 	}
994 
995 	uvmflag = UVM_MAPFLAG(prot, maxprot,
996 	    (flags & MAP_SHARED) ? UVM_INH_SHARE : UVM_INH_COPY, advice,
997 	    uvmflag);
998 	error = uvm_map(map, addr, size, uobj, foff, align, uvmflag);
999 	if (error) {
1000 		if (uobj)
1001 			uobj->pgops->pgo_detach(uobj);
1002 		return error;
1003 	}
1004 
1005 	/*
1006 	 * POSIX 1003.1b -- if our address space was configured
1007 	 * to lock all future mappings, wire the one we just made.
1008 	 *
1009 	 * Also handle the MAP_WIRED flag here.
1010 	 */
1011 
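	/*
	 * Illustrative note (not part of the original source): the
	 * VM_MAP_WIREFUTURE flag is set on the map when a process calls
	 * mlockall(MCL_FUTURE), so a later plain
	 *
	 *	mmap(NULL, len, PROT_READ | PROT_WRITE,
	 *	    MAP_ANON | MAP_PRIVATE, -1, 0);
	 *
	 * is wired below without an explicit mlock(2) or MAP_WIRED.
	 */
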
1012 	if (prot == VM_PROT_NONE) {
1013 
1014 		/*
1015 		 * No more work to do in this case.
1016 		 */
1017 
1018 		return 0;
1019 	}
1020 	if ((flags & MAP_WIRED) != 0 || (map->flags & VM_MAP_WIREFUTURE) != 0) {
1021 		vm_map_lock(map);
1022 		if (atop(size) + uvmexp.wired > uvmexp.wiredmax ||
1023 		    (locklimit != 0 &&
1024 		     size + ptoa(pmap_wired_count(vm_map_pmap(map))) >
1025 		     locklimit)) {
1026 			vm_map_unlock(map);
1027 			uvm_unmap(map, *addr, *addr + size);
1028 			return ENOMEM;
1029 		}
1030 
1031 		/*
1032 		 * uvm_map_pageable() always returns the map unlocked.
1033 		 */
1034 
1035 		error = uvm_map_pageable(map, *addr, *addr + size,
1036 		    false, UVM_LK_ENTER);
1037 		if (error) {
1038 			uvm_unmap(map, *addr, *addr + size);
1039 			return error;
1040 		}
1041 		return 0;
1042 	}
1043 	return 0;
1044 }
1045 
1046 vaddr_t
1047 uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
1048 {
1049 
1050 	if (topdown)
1051 		return VM_DEFAULT_ADDRESS_TOPDOWN(base, sz);
1052 	else
1053 		return VM_DEFAULT_ADDRESS_BOTTOMUP(base, sz);
1054 }
1055 
1056 int
1057 uvm_mmap_dev(struct proc *p, void **addrp, size_t len, dev_t dev,
1058     off_t off)
1059 {
1060 	struct uvm_object *uobj;
1061 	int error, flags, prot;
1062 
1063 	flags = MAP_SHARED;
1064 	prot = VM_PROT_READ | VM_PROT_WRITE;
1065 	if (*addrp)
1066 		flags |= MAP_FIXED;
1067 	else
1068 		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
1069 		    (vaddr_t)p->p_vmspace->vm_daddr, len,
1070 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1071 
1072 	uobj = udv_attach(dev, prot, off, len);
1073 	if (uobj == NULL)
1074 		return EINVAL;
1075 
1076 	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
1077 	    (vsize_t)len, prot, prot, flags, UVM_ADV_RANDOM, uobj, off,
1078 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1079 	return error;
1080 }
1081 
1082 int
1083 uvm_mmap_anon(struct proc *p, void **addrp, size_t len)
1084 {
1085 	int error, flags, prot;
1086 
1087 	flags = MAP_PRIVATE | MAP_ANON;
1088 	prot = VM_PROT_READ | VM_PROT_WRITE;
1089 	if (*addrp)
1090 		flags |= MAP_FIXED;
1091 	else
1092 		*addrp = (void *)p->p_emul->e_vm_default_addr(p,
1093 		    (vaddr_t)p->p_vmspace->vm_daddr, len,
1094 		    p->p_vmspace->vm_map.flags & VM_MAP_TOPDOWN);
1095 
1096 	error = uvm_mmap(&p->p_vmspace->vm_map, (vaddr_t *)addrp,
1097 	    (vsize_t)len, prot, prot, flags, UVM_ADV_NORMAL, NULL, 0,
1098 	    p->p_rlimit[RLIMIT_MEMLOCK].rlim_cur);
1099 	return error;
1100 }
1101