1 /* $NetBSD: vm.c,v 1.197 2023/09/24 09:33:26 martin Exp $ */
2
3 /*
4 * Copyright (c) 2007-2011 Antti Kantee. All Rights Reserved.
5 *
6 * Development of this software was supported by
7 * The Finnish Cultural Foundation and the Research Foundation of
8 * The Helsinki University of Technology.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 *
19 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
20 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
21 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29 * SUCH DAMAGE.
30 */
31
32 /*
33 * Virtual memory emulation routines.
34 */
35
36 /*
37 * XXX: we abuse pg->uanon for the virtual address of the storage
38 * for each page. phys_addr would fit the job description better,
39 * except that it will create unnecessary lossage on some platforms
40 * due to not being a pointer type.
41 */
42
43 #include <sys/cdefs.h>
44 __KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.197 2023/09/24 09:33:26 martin Exp $");
45
46 #include <sys/param.h>
47 #include <sys/atomic.h>
48 #include <sys/buf.h>
49 #include <sys/kernel.h>
50 #include <sys/kmem.h>
51 #include <sys/vmem.h>
52 #include <sys/mman.h>
53 #include <sys/null.h>
54 #include <sys/vnode.h>
55 #include <sys/radixtree.h>
56 #include <sys/module.h>
57
58 #include <machine/pmap.h>
59
60 #if defined(__i386__) || defined(__x86_64__)
61 /*
62 * This file abuses the pmap abstraction to create its own statically
63 * allocated struct pmap object, even though it can't do anything
64 * useful with such a thing from userland. On x86 the struct pmap
65 * definition is private, so we have to go to extra effort to abuse it
66 * there. This should be fixed -- all of the struct pmap definitions
67 * should be private, and then rump can furnish its own fake struct
68 * pmap without clashing with anything.
69 */
70 #include <machine/pmap_private.h>
71 #endif
72
73 #include <uvm/uvm.h>
74 #include <uvm/uvm_ddb.h>
75 #include <uvm/uvm_pdpolicy.h>
76 #include <uvm/uvm_prot.h>
77 #include <uvm/uvm_readahead.h>
78 #include <uvm/uvm_device.h>
79
80 #include <rump-sys/kern.h>
81 #include <rump-sys/vfs.h>
82
83 #include <rump/rumpuser.h>
84
85 kmutex_t vmpage_lruqueue_lock; /* non-free page lock */
86 kmutex_t uvm_swap_data_lock;
87
88 struct uvmexp uvmexp;
89 struct uvm uvm;
90
91 #ifdef __uvmexp_pagesize
92 const int * const uvmexp_pagesize = &uvmexp.pagesize;
93 const int * const uvmexp_pagemask = &uvmexp.pagemask;
94 const int * const uvmexp_pageshift = &uvmexp.pageshift;
95 #endif
96
97 static struct vm_map kernel_map_store;
98 struct vm_map *kernel_map = &kernel_map_store;
99
100 static struct vm_map module_map_store;
101
102 static struct pmap pmap_kernel;
103 struct pmap rump_pmap_local;
104 struct pmap *const kernel_pmap_ptr = &pmap_kernel;
105
106 vmem_t *kmem_arena;
107 vmem_t *kmem_va_arena;
108
109 static unsigned int pdaemon_waiters;
110 static kmutex_t pdaemonmtx;
111 static kcondvar_t pdaemoncv, oomwait;
112
113 /* all local non-proc0 processes share this vmspace */
114 struct vmspace *rump_vmspace_local;
115
116 unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
117 static unsigned long pdlimit = RUMPMEM_UNLIMITED; /* page daemon memlimit */
118 static unsigned long curphysmem;
119 static unsigned long dddlim; /* 90% of memory limit used */
120 #define NEED_PAGEDAEMON() \
121 (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
122 #define PDRESERVE (2*MAXPHYS)
123
124 /*
125 * Try to free two pages worth of pages from objects.
126 * If this successfully frees a full page cache page, we'll
127 * free the released page plus PAGE_SIZE/sizeof(vm_page).
128 */
129 #define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))
130
131 /*
132 * Keep a list of least recently used pages. Since the only way a
133 * rump kernel can "access" a page is via lookup, we put the page
134 * at the back of queue every time a lookup for it is done. If the
135 * page is in front of this global queue and we're short of memory,
136 * it's a candidate for pageout.
137 */
138 static struct pglist vmpage_lruqueue;
139 static unsigned vmpage_onqueue;
140
141 /*
142 * vm pages
143 */
144
145 static int
pgctor(void * arg,void * obj,int flags)146 pgctor(void *arg, void *obj, int flags)
147 {
148 struct vm_page *pg = obj;
149
150 memset(pg, 0, sizeof(*pg));
151 pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
152 (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
153 return pg->uanon == NULL;
154 }
155
156 static void
pgdtor(void * arg,void * obj)157 pgdtor(void *arg, void *obj)
158 {
159 struct vm_page *pg = obj;
160
161 rump_hyperfree(pg->uanon, PAGE_SIZE);
162 }
163
164 static struct pool_cache pagecache;
165
166 /* stub for UVM_OBJ_IS_VNODE */
167 struct uvm_pagerops rump_uvm_vnodeops;
168 __weak_alias(uvm_vnodeops,rump_uvm_vnodeops);
169
170 /*
171 * Called with the object locked. We don't support anons.
172 */
173 struct vm_page *
uvm_pagealloc_strat(struct uvm_object * uobj,voff_t off,struct vm_anon * anon,int flags,int strat,int free_list)174 uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
175 int flags, int strat, int free_list)
176 {
177 struct vm_page *pg;
178
179 KASSERT(uobj && rw_write_held(uobj->vmobjlock));
180 KASSERT(anon == NULL);
181
182 pg = pool_cache_get(&pagecache, PR_NOWAIT);
183 if (__predict_false(pg == NULL)) {
184 return NULL;
185 }
186 mutex_init(&pg->interlock, MUTEX_DEFAULT, IPL_NONE);
187
188 pg->offset = off;
189 pg->uobject = uobj;
190
191 if (radix_tree_insert_node(&uobj->uo_pages, off >> PAGE_SHIFT,
192 pg) != 0) {
193 pool_cache_put(&pagecache, pg);
194 return NULL;
195 }
196
197 if (UVM_OBJ_IS_VNODE(uobj)) {
198 if (uobj->uo_npages == 0) {
199 struct vnode *vp = (struct vnode *)uobj;
200 mutex_enter(vp->v_interlock);
201 vp->v_iflag |= VI_PAGES;
202 mutex_exit(vp->v_interlock);
203 }
204 pg->flags |= PG_FILE;
205 }
206 uobj->uo_npages++;
207
208 pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
209 if (flags & UVM_PGA_ZERO) {
210 uvm_pagezero(pg);
211 }
212
213 /*
214 * Don't put anons on the LRU page queue. We can't flush them
215 * (there's no concept of swap in a rump kernel), so no reason
216 * to bother with them.
217 */
218 if (!UVM_OBJ_IS_AOBJ(uobj)) {
219 atomic_inc_uint(&vmpage_onqueue);
220 mutex_enter(&vmpage_lruqueue_lock);
221 TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
222 mutex_exit(&vmpage_lruqueue_lock);
223 } else {
224 pg->flags |= PG_AOBJ;
225 }
226
227 return pg;
228 }
229
230 /*
231 * Release a page.
232 *
233 * Called with the vm object locked.
234 */
235 void
uvm_pagefree(struct vm_page * pg)236 uvm_pagefree(struct vm_page *pg)
237 {
238 struct uvm_object *uobj = pg->uobject;
239 struct vm_page *pg2 __unused;
240
241 KASSERT(rw_write_held(uobj->vmobjlock));
242
243 mutex_enter(&pg->interlock);
244 uvm_pagewakeup(pg);
245 mutex_exit(&pg->interlock);
246
247 uobj->uo_npages--;
248 pg2 = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
249 KASSERT(pg == pg2);
250
251 if (!UVM_OBJ_IS_AOBJ(uobj)) {
252 mutex_enter(&vmpage_lruqueue_lock);
253 TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
254 mutex_exit(&vmpage_lruqueue_lock);
255 atomic_dec_uint(&vmpage_onqueue);
256 }
257
258 if (UVM_OBJ_IS_VNODE(uobj) && uobj->uo_npages == 0) {
259 struct vnode *vp = (struct vnode *)uobj;
260 mutex_enter(vp->v_interlock);
261 vp->v_iflag &= ~VI_PAGES;
262 mutex_exit(vp->v_interlock);
263 }
264
265 mutex_destroy(&pg->interlock);
266 pool_cache_put(&pagecache, pg);
267 }
268
269 void
uvm_pagezero(struct vm_page * pg)270 uvm_pagezero(struct vm_page *pg)
271 {
272
273 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
274 memset((void *)pg->uanon, 0, PAGE_SIZE);
275 }
276
277 /*
278 * uvm_page_owner_locked_p: return true if object associated with page is
279 * locked. this is a weak check for runtime assertions only.
280 */
281
282 bool
uvm_page_owner_locked_p(struct vm_page * pg,bool exclusive)283 uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
284 {
285
286 if (exclusive)
287 return rw_write_held(pg->uobject->vmobjlock);
288 else
289 return rw_lock_held(pg->uobject->vmobjlock);
290 }
291
292 /*
293 * Misc routines
294 */
295
296 static kmutex_t pagermtx;
297
298 void
uvm_init(void)299 uvm_init(void)
300 {
301 char buf[64];
302
303 if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
304 unsigned long tmp;
305 char *ep;
306 int mult;
307
308 tmp = strtoul(buf, &ep, 10);
309 if (strlen(ep) > 1)
310 panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
311
312 /* mini-dehumanize-number */
313 mult = 1;
314 switch (*ep) {
315 case 'k':
316 mult = 1024;
317 break;
318 case 'm':
319 mult = 1024*1024;
320 break;
321 case 'g':
322 mult = 1024*1024*1024;
323 break;
324 case 0:
325 break;
326 default:
327 panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
328 }
329 rump_physmemlimit = tmp * mult;
330
331 if (rump_physmemlimit / mult != tmp)
332 panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);
333
334 /* reserve some memory for the pager */
335 if (rump_physmemlimit <= PDRESERVE)
336 panic("uvm_init: system reserves %d bytes of mem, "
337 "only %lu bytes given",
338 PDRESERVE, rump_physmemlimit);
339 pdlimit = rump_physmemlimit;
340 rump_physmemlimit -= PDRESERVE;
341
342 if (pdlimit < 1024*1024)
343 printf("uvm_init: WARNING: <1MB RAM limit, "
344 "hope you know what you're doing\n");
345
346 #define HUMANIZE_BYTES 9
347 CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
348 format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
349 #undef HUMANIZE_BYTES
350 dddlim = 9 * (rump_physmemlimit / 10);
351 } else {
352 strlcpy(buf, "unlimited (host limit)", sizeof(buf));
353 }
354 aprint_verbose("total memory = %s\n", buf);
355
356 TAILQ_INIT(&vmpage_lruqueue);
357
358 if (rump_physmemlimit == RUMPMEM_UNLIMITED) {
359 uvmexp.npages = physmem;
360 } else {
361 uvmexp.npages = pdlimit >> PAGE_SHIFT;
362 uvmexp.reserve_pagedaemon = PDRESERVE >> PAGE_SHIFT;
363 uvmexp.freetarg = (rump_physmemlimit-dddlim) >> PAGE_SHIFT;
364 }
365 /*
366 * uvmexp.free is not used internally or updated. The reason is
367 * that the memory hypercall allocator is allowed to allocate
368 * non-page sized chunks. We use a byte count in curphysmem
369 * instead.
370 */
371 uvmexp.free = uvmexp.npages;
372
373 #ifndef __uvmexp_pagesize
374 uvmexp.pagesize = PAGE_SIZE;
375 uvmexp.pagemask = PAGE_MASK;
376 uvmexp.pageshift = PAGE_SHIFT;
377 #else
378 uvmexp.pagesize = rumpuser_getpagesize();
379 uvmexp.pagemask = uvmexp.pagesize-1;
380 uvmexp.pageshift = ffs(uvmexp.pagesize)-1;
381 #endif
382
383 mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
384 mutex_init(&vmpage_lruqueue_lock, MUTEX_DEFAULT, IPL_NONE);
385 mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
386 mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);
387
388 cv_init(&pdaemoncv, "pdaemon");
389 cv_init(&oomwait, "oomwait");
390
391 module_map = &module_map_store;
392
393 kernel_map->pmap = pmap_kernel();
394
395 pool_subsystem_init();
396
397 kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
398 NULL, NULL, NULL,
399 0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
400
401 vmem_subsystem_init(kmem_arena);
402
403 kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
404 vmem_alloc, vmem_free, kmem_arena,
405 8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);
406
407 pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
408 "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);
409
410 radix_tree_init();
411
412 /* create vmspace used by local clients */
413 rump_vmspace_local = kmem_zalloc(sizeof(*rump_vmspace_local), KM_SLEEP);
414 uvmspace_init(rump_vmspace_local, &rump_pmap_local, 0, 0, false);
415 }
416
417 void
uvmspace_init(struct vmspace * vm,struct pmap * pmap,vaddr_t vmin,vaddr_t vmax,bool topdown)418 uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
419 bool topdown)
420 {
421
422 vm->vm_map.pmap = pmap;
423 vm->vm_refcnt = 1;
424 }
425
426 int
uvm_map_pageable(struct vm_map * map,vaddr_t start,vaddr_t end,bool new_pageable,int lockflags)427 uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
428 bool new_pageable, int lockflags)
429 {
430 return 0;
431 }
432
/* Stub: wiring a page is a no-op here. */
void
uvm_pagewire(struct vm_page *pg)
{

	/* nothing to do */
}
439
/* Stub: unwiring a page is a no-op here. */
void
uvm_pageunwire(struct vm_page *pg)
{

	/* nothing to do */
}
446
447 int
uvm_availmem(bool cached)448 uvm_availmem(bool cached)
449 {
450
451 return uvmexp.free;
452 }
453
454 void
uvm_pagelock(struct vm_page * pg)455 uvm_pagelock(struct vm_page *pg)
456 {
457
458 mutex_enter(&pg->interlock);
459 }
460
461 void
uvm_pagelock2(struct vm_page * pg1,struct vm_page * pg2)462 uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
463 {
464
465 if (pg1 < pg2) {
466 mutex_enter(&pg1->interlock);
467 mutex_enter(&pg2->interlock);
468 } else {
469 mutex_enter(&pg2->interlock);
470 mutex_enter(&pg1->interlock);
471 }
472 }
473
474 void
uvm_pageunlock(struct vm_page * pg)475 uvm_pageunlock(struct vm_page *pg)
476 {
477
478 mutex_exit(&pg->interlock);
479 }
480
481 void
uvm_pageunlock2(struct vm_page * pg1,struct vm_page * pg2)482 uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
483 {
484
485 mutex_exit(&pg1->interlock);
486 mutex_exit(&pg2->interlock);
487 }
488
489 /* where's your schmonz now? */
490 #define PUNLIMIT(a) \
491 p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
492 void
uvm_init_limits(struct proc * p)493 uvm_init_limits(struct proc *p)
494 {
495
496 #ifndef DFLSSIZ
497 #define DFLSSIZ (16*1024*1024)
498 #endif
499 p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
500 p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
501 PUNLIMIT(RLIMIT_DATA);
502 PUNLIMIT(RLIMIT_RSS);
503 PUNLIMIT(RLIMIT_AS);
504 /* nice, cascade */
505 }
506 #undef PUNLIMIT
507
508 /*
509 * This satisfies the "disgusting mmap hack" used by proplib.
510 */
511 int
uvm_mmap_anon(struct proc * p,void ** addrp,size_t size)512 uvm_mmap_anon(struct proc *p, void **addrp, size_t size)
513 {
514 int error;
515
516 /* no reason in particular, but cf. uvm_default_mapaddr() */
517 if (*addrp != NULL)
518 panic("uvm_mmap() variant unsupported");
519
520 if (RUMP_LOCALPROC_P(curproc)) {
521 error = rumpuser_anonmmap(NULL, size, 0, 0, addrp);
522 } else {
523 error = rump_sysproxy_anonmmap(RUMP_SPVM2CTL(p->p_vmspace),
524 size, addrp);
525 }
526 return error;
527 }
528
529 /*
530 * Stubs for things referenced from vfs_vnode.c but not used.
531 */
532 const dev_t zerodev;
533
534 struct uvm_object *
udv_attach(dev_t device,vm_prot_t accessprot,voff_t off,vsize_t size)535 udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size)
536 {
537 return NULL;
538 }
539
540 struct pagerinfo {
541 vaddr_t pgr_kva;
542 int pgr_npages;
543 struct vm_page **pgr_pgs;
544 bool pgr_read;
545
546 LIST_ENTRY(pagerinfo) pgr_entries;
547 };
548 static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);
549
550 /*
551 * Pager "map" in routine. Instead of mapping, we allocate memory
552 * and copy page contents there. The reason for copying instead of
553 * mapping is simple: we do not assume we are running on virtual
554 * memory. Even if we could emulate virtual memory in some envs
555 * such as userspace, copying is much faster than trying to awkardly
556 * cope with remapping (see "Design and Implementation" pp.95-98).
557 * The downside of the approach is that the pager requires MAXPHYS
558 * free memory to perform paging, but short of virtual memory or
559 * making the pager do I/O in page-sized chunks we cannot do much
560 * about that.
561 */
562 vaddr_t
uvm_pagermapin(struct vm_page ** pgs,int npages,int flags)563 uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
564 {
565 struct pagerinfo *pgri;
566 vaddr_t curkva;
567 int i;
568
569 /* allocate structures */
570 pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
571 pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
572 pgri->pgr_npages = npages;
573 pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
574 pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;
575
576 /* copy contents to "mapped" memory */
577 for (i = 0, curkva = pgri->pgr_kva;
578 i < npages;
579 i++, curkva += PAGE_SIZE) {
580 /*
581 * We need to copy the previous contents of the pages to
582 * the window even if we are reading from the
583 * device, since the device might not fill the contents of
584 * the full mapped range and we will end up corrupting
585 * data when we unmap the window.
586 */
587 memcpy((void*)curkva, pgs[i]->uanon, PAGE_SIZE);
588 pgri->pgr_pgs[i] = pgs[i];
589 }
590
591 mutex_enter(&pagermtx);
592 LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
593 mutex_exit(&pagermtx);
594
595 return pgri->pgr_kva;
596 }
597
598 /*
599 * map out the pager window. return contents from VA to page storage
600 * and free structures.
601 *
602 * Note: does not currently support partial frees
603 */
604 void
uvm_pagermapout(vaddr_t kva,int npages)605 uvm_pagermapout(vaddr_t kva, int npages)
606 {
607 struct pagerinfo *pgri;
608 vaddr_t curkva;
609 int i;
610
611 mutex_enter(&pagermtx);
612 LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
613 if (pgri->pgr_kva == kva)
614 break;
615 }
616 KASSERT(pgri);
617 if (pgri->pgr_npages != npages)
618 panic("uvm_pagermapout: partial unmapping not supported");
619 LIST_REMOVE(pgri, pgr_entries);
620 mutex_exit(&pagermtx);
621
622 if (pgri->pgr_read) {
623 for (i = 0, curkva = pgri->pgr_kva;
624 i < pgri->pgr_npages;
625 i++, curkva += PAGE_SIZE) {
626 memcpy(pgri->pgr_pgs[i]->uanon,(void*)curkva,PAGE_SIZE);
627 }
628 }
629
630 kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
631 kmem_free((void*)pgri->pgr_kva, npages * PAGE_SIZE);
632 kmem_free(pgri, sizeof(*pgri));
633 }
634
635 /*
636 * convert va in pager window to page structure.
637 * XXX: how expensive is this (global lock, list traversal)?
638 */
639 struct vm_page *
uvm_pageratop(vaddr_t va)640 uvm_pageratop(vaddr_t va)
641 {
642 struct pagerinfo *pgri;
643 struct vm_page *pg = NULL;
644 int i;
645
646 mutex_enter(&pagermtx);
647 LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
648 if (pgri->pgr_kva <= va
649 && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
650 break;
651 }
652 if (pgri) {
653 i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
654 pg = pgri->pgr_pgs[i];
655 }
656 mutex_exit(&pagermtx);
657
658 return pg;
659 }
660
661 /*
662 * Called with the vm object locked.
663 *
664 * Put vnode object pages at the end of the access queue to indicate
665 * they have been recently accessed and should not be immediate
666 * candidates for pageout. Do not do this for lookups done by
667 * the pagedaemon to mimic pmap_kentered mappings which don't track
668 * access information.
669 */
670 struct vm_page *
uvm_pagelookup(struct uvm_object * uobj,voff_t off)671 uvm_pagelookup(struct uvm_object *uobj, voff_t off)
672 {
673 struct vm_page *pg;
674 bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;
675
676 pg = radix_tree_lookup_node(&uobj->uo_pages, off >> PAGE_SHIFT);
677 if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
678 mutex_enter(&vmpage_lruqueue_lock);
679 TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
680 TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
681 mutex_exit(&vmpage_lruqueue_lock);
682 }
683
684 return pg;
685 }
686
687 void
uvm_page_unbusy(struct vm_page ** pgs,int npgs)688 uvm_page_unbusy(struct vm_page **pgs, int npgs)
689 {
690 struct vm_page *pg;
691 int i, pageout_done;
692
693 KASSERT(npgs > 0);
694
695 pageout_done = 0;
696 for (i = 0; i < npgs; i++) {
697 pg = pgs[i];
698 if (pg == NULL || pg == PGO_DONTCARE) {
699 continue;
700 }
701
702 #if 0
703 KASSERT(uvm_page_owner_locked_p(pg, true));
704 #else
705 /*
706 * uvm_page_owner_locked_p() is not available in rump,
707 * and rump doesn't support amaps anyway.
708 */
709 KASSERT(rw_write_held(pg->uobject->vmobjlock));
710 #endif
711 KASSERT(pg->flags & PG_BUSY);
712
713 if (pg->flags & PG_PAGEOUT) {
714 pg->flags &= ~PG_PAGEOUT;
715 pg->flags |= PG_RELEASED;
716 pageout_done++;
717 atomic_inc_uint(&uvmexp.pdfreed);
718 }
719 if (pg->flags & PG_RELEASED) {
720 KASSERT(pg->uobject != NULL ||
721 (pg->uanon != NULL && pg->uanon->an_ref > 0));
722 pg->flags &= ~PG_RELEASED;
723 uvm_pagefree(pg);
724 } else {
725 KASSERT((pg->flags & PG_FAKE) == 0);
726 pg->flags &= ~PG_BUSY;
727 uvm_pagelock(pg);
728 uvm_pagewakeup(pg);
729 uvm_pageunlock(pg);
730 UVM_PAGE_OWN(pg, NULL);
731 }
732 }
733 if (pageout_done != 0) {
734 uvm_pageout_done(pageout_done);
735 }
736 }
737
738 void
uvm_pagewait(struct vm_page * pg,krwlock_t * lock,const char * wmesg)739 uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
740 {
741
742 KASSERT(rw_lock_held(lock));
743 KASSERT((pg->flags & PG_BUSY) != 0);
744
745 mutex_enter(&pg->interlock);
746 pg->pqflags |= PQ_WANTED;
747 rw_exit(lock);
748 UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
749 }
750
751 void
uvm_pagewakeup(struct vm_page * pg)752 uvm_pagewakeup(struct vm_page *pg)
753 {
754
755 KASSERT(mutex_owned(&pg->interlock));
756
757 if ((pg->pqflags & PQ_WANTED) != 0) {
758 pg->pqflags &= ~PQ_WANTED;
759 wakeup(pg);
760 }
761 }
762
/*
 * Report the number of active and inactive pageable pages.
 * Both values are a fixed guess of 1024.
 */
void
uvm_estimatepageable(int *active, int *inactive)
{
	/* XXX: guessing game */
	const int guess = 1024;

	*active = guess;
	*inactive = guess;
}
771
772 int
uvm_loan(struct vm_map * map,vaddr_t start,vsize_t len,void * v,int flags)773 uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
774 {
775
776 panic("%s: unimplemented", __func__);
777 }
778
/* Page loaning is not implemented: calling this panics. */
void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}
785
786 int
uvm_loanuobjpages(struct uvm_object * uobj,voff_t pgoff,int orignpages,struct vm_page ** opp)787 uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
788 struct vm_page **opp)
789 {
790
791 return EBUSY;
792 }
793
/* Page loaning is not implemented: calling this panics. */
struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}
800
/* Stub: nothing to purge. */
void
ubc_purge(struct uvm_object *uobj)
{

	/* nothing to do */
}
806
807 vaddr_t
uvm_default_mapaddr(struct proc * p,vaddr_t base,vsize_t sz,int topdown)808 uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
809 {
810
811 return 0;
812 }
813
814 int
uvm_map_protect(struct vm_map * map,vaddr_t start,vaddr_t end,vm_prot_t prot,bool set_max)815 uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
816 vm_prot_t prot, bool set_max)
817 {
818
819 return EOPNOTSUPP;
820 }
821
822 int
uvm_map(struct vm_map * map,vaddr_t * startp,vsize_t size,struct uvm_object * uobj,voff_t uoffset,vsize_t align,uvm_flag_t flags)823 uvm_map(struct vm_map *map, vaddr_t *startp, vsize_t size,
824 struct uvm_object *uobj, voff_t uoffset, vsize_t align,
825 uvm_flag_t flags)
826 {
827
828 *startp = (vaddr_t)rump_hypermalloc(size, align, true, "uvm_map");
829 return *startp != 0 ? 0 : ENOMEM;
830 }
831
832 void
uvm_unmap1(struct vm_map * map,vaddr_t start,vaddr_t end,int flags)833 uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
834 {
835
836 rump_hyperfree((void*)start, end-start);
837 }
838
839
840 /*
841 * UVM km
842 */
843
844 vaddr_t
uvm_km_alloc(struct vm_map * map,vsize_t size,vsize_t align,uvm_flag_t flags)845 uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
846 {
847 void *rv, *desired = NULL;
848 int alignbit, error;
849
850 #ifdef __x86_64__
851 /*
852 * On amd64, allocate all module memory from the lowest 2GB.
853 * This is because NetBSD kernel modules are compiled
854 * with -mcmodel=kernel and reserve only 4 bytes for
855 * offsets. If we load code compiled with -mcmodel=kernel
856 * anywhere except the lowest or highest 2GB, it will not
857 * work. Since userspace does not have access to the highest
858 * 2GB, use the lowest 2GB.
859 *
860 * Note: this assumes the rump kernel resides in
861 * the lowest 2GB as well.
862 *
863 * Note2: yes, it's a quick hack, but since this the only
864 * place where we care about the map we're allocating from,
865 * just use a simple "if" instead of coming up with a fancy
866 * generic solution.
867 */
868 if (map == module_map) {
869 desired = (void *)(0x80000000 - size);
870 }
871 #endif
872
873 if (__predict_false(map == module_map)) {
874 alignbit = 0;
875 if (align) {
876 alignbit = ffs(align)-1;
877 }
878 error = rumpuser_anonmmap(desired, size, alignbit,
879 flags & UVM_KMF_EXEC, &rv);
880 } else {
881 error = rumpuser_malloc(size, align, &rv);
882 }
883
884 if (error) {
885 if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
886 return 0;
887 else
888 panic("uvm_km_alloc failed");
889 }
890
891 if (flags & UVM_KMF_ZERO)
892 memset(rv, 0, size);
893
894 return (vaddr_t)rv;
895 }
896
897 void
uvm_km_free(struct vm_map * map,vaddr_t vaddr,vsize_t size,uvm_flag_t flags)898 uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
899 {
900
901 if (__predict_false(map == module_map))
902 rumpuser_unmap((void *)vaddr, size);
903 else
904 rumpuser_free((void *)vaddr, size);
905 }
906
907 int
uvm_km_protect(struct vm_map * map,vaddr_t vaddr,vsize_t size,vm_prot_t prot)908 uvm_km_protect(struct vm_map *map, vaddr_t vaddr, vsize_t size, vm_prot_t prot)
909 {
910 return 0;
911 }
912
913 struct vm_map *
uvm_km_suballoc(struct vm_map * map,vaddr_t * minaddr,vaddr_t * maxaddr,vsize_t size,int pageable,bool fixed,struct vm_map * submap)914 uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
915 vsize_t size, int pageable, bool fixed, struct vm_map *submap)
916 {
917
918 return (struct vm_map *)417416;
919 }
920
921 int
uvm_km_kmem_alloc(vmem_t * vm,vmem_size_t size,vm_flag_t flags,vmem_addr_t * addr)922 uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
923 vmem_addr_t *addr)
924 {
925 vaddr_t va;
926 va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
927 (flags & VM_SLEEP), "kmalloc");
928
929 if (va) {
930 *addr = va;
931 return 0;
932 } else {
933 return ENOMEM;
934 }
935 }
936
937 void
uvm_km_kmem_free(vmem_t * vm,vmem_addr_t addr,vmem_size_t size)938 uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
939 {
940
941 rump_hyperfree((void *)addr, size);
942 }
943
944 /*
945 * VM space locking routines. We don't really have to do anything,
946 * since the pages are always "wired" (both local and remote processes).
947 */
948 int
uvm_vslock(struct vmspace * vs,void * addr,size_t len,vm_prot_t access)949 uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
950 {
951
952 return 0;
953 }
954
955 void
uvm_vsunlock(struct vmspace * vs,void * addr,size_t len)956 uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
957 {
958
959 }
960
961 /*
962 * For the local case the buffer mappers don't need to do anything.
963 * For the remote case we need to reserve space and copy data in or
964 * out, depending on B_READ/B_WRITE.
965 */
966 int
vmapbuf(struct buf * bp,vsize_t len)967 vmapbuf(struct buf *bp, vsize_t len)
968 {
969 int error = 0;
970
971 bp->b_saveaddr = bp->b_data;
972
973 /* remote case */
974 if (!RUMP_LOCALPROC_P(curproc)) {
975 bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
976 if (BUF_ISWRITE(bp)) {
977 error = copyin(bp->b_saveaddr, bp->b_data, len);
978 if (error) {
979 rump_hyperfree(bp->b_data, len);
980 bp->b_data = bp->b_saveaddr;
981 bp->b_saveaddr = 0;
982 }
983 }
984 }
985
986 return error;
987 }
988
989 void
vunmapbuf(struct buf * bp,vsize_t len)990 vunmapbuf(struct buf *bp, vsize_t len)
991 {
992
993 /* remote case */
994 if (!RUMP_LOCALPROC_P(bp->b_proc)) {
995 if (BUF_ISREAD(bp)) {
996 bp->b_error = copyout_proc(bp->b_proc,
997 bp->b_data, bp->b_saveaddr, len);
998 }
999 rump_hyperfree(bp->b_data, len);
1000 }
1001
1002 bp->b_data = bp->b_saveaddr;
1003 bp->b_saveaddr = 0;
1004 }
1005
/*
 * No-op: no dynamically allocated vmspaces exist, so there is
 * no reference count to maintain.
 */
void
uvmspace_addref(struct vmspace *vm)
{

	/* nothing to do */
}
1014
/* No-op for now: nothing is released. */
void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}
1021
1022 /*
1023 * page life cycle stuff. it really doesn't exist, so just stubs.
1024 */
1025
/* Stub: there is no page life cycle, so activation does nothing. */
void
uvm_pageactivate(struct vm_page *pg)
{

	/* nothing to do */
}
1032
/* Stub: there is no page life cycle, so deactivation does nothing. */
void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nothing to do */
}
1039
/* Stub: there is no page life cycle, so dequeueing does nothing. */
void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nothing to do */
}
1046
/* Stub: there is no page life cycle, so enqueueing does nothing. */
void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nothing to do */
}
1053
/* Stub: nothing to do when an anon is freed. */
void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nothing to do */
}
1060
1061 /*
1062 * Physical address accessors.
1063 */
1064
1065 struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)1066 uvm_phys_to_vm_page(paddr_t pa)
1067 {
1068
1069 return NULL;
1070 }
1071
1072 paddr_t
uvm_vm_page_to_phys(const struct vm_page * pg)1073 uvm_vm_page_to_phys(const struct vm_page *pg)
1074 {
1075
1076 return 0;
1077 }
1078
1079 vaddr_t
uvm_uarea_alloc(void)1080 uvm_uarea_alloc(void)
1081 {
1082
1083 /* non-zero */
1084 return (vaddr_t)11;
1085 }
1086
1087 void
uvm_uarea_free(vaddr_t uarea)1088 uvm_uarea_free(vaddr_t uarea)
1089 {
1090
1091 /* nata, so creamy */
1092 }
1093
1094 /*
1095 * Routines related to the Page Baroness.
1096 */
1097
1098 void
uvm_wait(const char * msg)1099 uvm_wait(const char *msg)
1100 {
1101
1102 if (__predict_false(rump_threads == 0))
1103 panic("pagedaemon missing (RUMP_THREADS = 0)");
1104
1105 if (curlwp == uvm.pagedaemon_lwp) {
1106 /* is it possible for us to later get memory? */
1107 if (!uvmexp.paging)
1108 panic("pagedaemon out of memory");
1109 }
1110
1111 mutex_enter(&pdaemonmtx);
1112 pdaemon_waiters++;
1113 cv_signal(&pdaemoncv);
1114 cv_wait(&oomwait, &pdaemonmtx);
1115 mutex_exit(&pdaemonmtx);
1116 }
1117
1118 void
uvm_pageout_start(int npages)1119 uvm_pageout_start(int npages)
1120 {
1121
1122 mutex_enter(&pdaemonmtx);
1123 uvmexp.paging += npages;
1124 mutex_exit(&pdaemonmtx);
1125 }
1126
1127 void
uvm_pageout_done(int npages)1128 uvm_pageout_done(int npages)
1129 {
1130
1131 if (!npages)
1132 return;
1133
1134 mutex_enter(&pdaemonmtx);
1135 KASSERT(uvmexp.paging >= npages);
1136 uvmexp.paging -= npages;
1137
1138 if (pdaemon_waiters) {
1139 pdaemon_waiters = 0;
1140 cv_broadcast(&oomwait);
1141 }
1142 mutex_exit(&pdaemonmtx);
1143 }
1144
1145 static bool
processpage(struct vm_page * pg)1146 processpage(struct vm_page *pg)
1147 {
1148 struct uvm_object *uobj;
1149
1150 uobj = pg->uobject;
1151 if (rw_tryenter(uobj->vmobjlock, RW_WRITER)) {
1152 if ((pg->flags & PG_BUSY) == 0) {
1153 mutex_exit(&vmpage_lruqueue_lock);
1154 uobj->pgops->pgo_put(uobj, pg->offset,
1155 pg->offset + PAGE_SIZE,
1156 PGO_CLEANIT|PGO_FREE);
1157 KASSERT(!rw_write_held(uobj->vmobjlock));
1158 return true;
1159 } else {
1160 rw_exit(uobj->vmobjlock);
1161 }
1162 }
1163
1164 return false;
1165 }
1166
1167 /*
1168 * The Diabolical pageDaemon Director (DDD).
1169 *
1170 * This routine can always use better heuristics.
1171 */
1172 void
uvm_pageout(void * arg)1173 uvm_pageout(void *arg)
1174 {
1175 struct vm_page *pg;
1176 struct pool *pp, *pp_first;
1177 int cleaned, skip, skipped;
1178 bool succ;
1179
1180 mutex_enter(&pdaemonmtx);
1181 for (;;) {
1182 if (pdaemon_waiters) {
1183 pdaemon_waiters = 0;
1184 cv_broadcast(&oomwait);
1185 }
1186 if (!NEED_PAGEDAEMON()) {
1187 kernel_map->flags &= ~VM_MAP_WANTVA;
1188 cv_wait(&pdaemoncv, &pdaemonmtx);
1189 }
1190 uvmexp.pdwoke++;
1191
1192 /* tell the world that we are hungry */
1193 kernel_map->flags |= VM_MAP_WANTVA;
1194 mutex_exit(&pdaemonmtx);
1195
1196 /*
1197 * step one: reclaim the page cache. this should give
1198 * us the biggest earnings since whole pages are released
1199 * into backing memory.
1200 */
1201 pool_cache_reclaim(&pagecache);
1202 if (!NEED_PAGEDAEMON()) {
1203 mutex_enter(&pdaemonmtx);
1204 continue;
1205 }
1206
1207 /*
1208 * Ok, so that didn't help. Next, try to hunt memory
1209 * by pushing out vnode pages. The pages might contain
1210 * useful cached data, but we need the memory.
1211 */
1212 cleaned = 0;
1213 skip = 0;
1214 again:
1215 mutex_enter(&vmpage_lruqueue_lock);
1216 while (cleaned < PAGEDAEMON_OBJCHUNK) {
1217 skipped = 0;
1218 TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {
1219
1220 /*
1221 * skip over pages we _might_ have tried
1222 * to handle earlier. they might not be
1223 * exactly the same ones, but I'm not too
1224 * concerned.
1225 */
1226 while (skipped++ < skip)
1227 continue;
1228
1229 if (processpage(pg)) {
1230 cleaned++;
1231 goto again;
1232 }
1233
1234 skip++;
1235 }
1236 break;
1237 }
1238 mutex_exit(&vmpage_lruqueue_lock);
1239
1240 /*
1241 * And of course we need to reclaim the page cache
1242 * again to actually release memory.
1243 */
1244 pool_cache_reclaim(&pagecache);
1245 if (!NEED_PAGEDAEMON()) {
1246 mutex_enter(&pdaemonmtx);
1247 continue;
1248 }
1249
1250 /*
1251 * And then drain the pools. Wipe them out ... all of them.
1252 */
1253 for (pp_first = NULL;;) {
1254 rump_vfs_drainbufs(10 /* XXX: estimate! */);
1255
1256 succ = pool_drain(&pp);
1257 if (succ || pp == pp_first)
1258 break;
1259
1260 if (pp_first == NULL)
1261 pp_first = pp;
1262 }
1263
1264 /*
1265 * Need to use PYEC on our bag of tricks.
1266 * Unfortunately, the wife just borrowed it.
1267 */
1268
1269 mutex_enter(&pdaemonmtx);
1270 if (!succ && cleaned == 0 && pdaemon_waiters &&
1271 uvmexp.paging == 0) {
1272 kpause("pddlk", false, hz, &pdaemonmtx);
1273 }
1274 }
1275
1276 panic("you can swap out any time you like, but you can never leave");
1277 }
1278
1279 void
uvm_kick_pdaemon()1280 uvm_kick_pdaemon()
1281 {
1282
1283 /*
1284 * Wake up the diabolical pagedaemon director if we are over
1285 * 90% of the memory limit. This is a complete and utter
1286 * stetson-harrison decision which you are allowed to finetune.
1287 * Don't bother locking. If we have some unflushed caches,
1288 * other waker-uppers will deal with the issue.
1289 */
1290 if (NEED_PAGEDAEMON()) {
1291 cv_signal(&pdaemoncv);
1292 }
1293 }
1294
1295 void *
rump_hypermalloc(size_t howmuch,int alignment,bool waitok,const char * wmsg)1296 rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
1297 {
1298 const unsigned long thelimit =
1299 curlwp == uvm.pagedaemon_lwp ? pdlimit : rump_physmemlimit;
1300 unsigned long newmem;
1301 void *rv;
1302 int error;
1303
1304 uvm_kick_pdaemon(); /* ouch */
1305
1306 /* first we must be within the limit */
1307 limitagain:
1308 if (thelimit != RUMPMEM_UNLIMITED) {
1309 newmem = atomic_add_long_nv(&curphysmem, howmuch);
1310 if (newmem > thelimit) {
1311 newmem = atomic_add_long_nv(&curphysmem, -howmuch);
1312 if (!waitok) {
1313 return NULL;
1314 }
1315 uvm_wait(wmsg);
1316 goto limitagain;
1317 }
1318 }
1319
1320 /* second, we must get something from the backend */
1321 again:
1322 error = rumpuser_malloc(howmuch, alignment, &rv);
1323 if (__predict_false(error && waitok)) {
1324 uvm_wait(wmsg);
1325 goto again;
1326 }
1327
1328 return rv;
1329 }
1330
1331 void
rump_hyperfree(void * what,size_t size)1332 rump_hyperfree(void *what, size_t size)
1333 {
1334
1335 if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
1336 atomic_add_long(&curphysmem, -size);
1337 }
1338 rumpuser_free(what, size);
1339 }
1340
1341 /*
1342 * UBC
1343 */
1344
/* Base flag set handed to pgo_get() by ubc_zerorange() and ubc_uiomove(). */
#define PAGERFLAGS \
	(PGO_SYNCIO | PGO_NOBLOCKALLOC | PGO_NOTIMESTAMP)
1346
1347 void
ubc_zerorange(struct uvm_object * uobj,off_t off,size_t len,int flags)1348 ubc_zerorange(struct uvm_object *uobj, off_t off, size_t len, int flags)
1349 {
1350 struct vm_page **pgs;
1351 int maxpages = MIN(32, round_page(len) >> PAGE_SHIFT);
1352 int npages, i;
1353
1354 if (maxpages == 0)
1355 return;
1356
1357 pgs = kmem_alloc(maxpages * sizeof(pgs), KM_SLEEP);
1358 rw_enter(uobj->vmobjlock, RW_WRITER);
1359 while (len) {
1360 npages = MIN(maxpages, round_page(len) >> PAGE_SHIFT);
1361 memset(pgs, 0, npages * sizeof(struct vm_page *));
1362 (void)uobj->pgops->pgo_get(uobj, trunc_page(off),
1363 pgs, &npages, 0, VM_PROT_READ | VM_PROT_WRITE,
1364 0, PAGERFLAGS | PGO_PASTEOF);
1365 KASSERT(npages > 0);
1366
1367 rw_enter(uobj->vmobjlock, RW_WRITER);
1368 for (i = 0; i < npages; i++) {
1369 struct vm_page *pg;
1370 uint8_t *start;
1371 size_t chunkoff, chunklen;
1372
1373 pg = pgs[i];
1374 if (pg == NULL)
1375 break;
1376
1377 KASSERT(pg->uobject != NULL);
1378 KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
1379
1380 chunkoff = off & PAGE_MASK;
1381 chunklen = MIN(PAGE_SIZE - chunkoff, len);
1382 start = (uint8_t *)pg->uanon + chunkoff;
1383
1384 memset(start, 0, chunklen);
1385 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1386
1387 off += chunklen;
1388 len -= chunklen;
1389 }
1390 uvm_page_unbusy(pgs, npages);
1391 }
1392 rw_exit(uobj->vmobjlock);
1393 kmem_free(pgs, maxpages * sizeof(pgs));
1394 }
1395
/*
 * Number of pages touched by the byte range [off, off+len).
 * Arguments are parenthesized so that any expression may be passed.
 */
#define len2npages(off, len)						\
    ((round_page((off) + (len)) - trunc_page(off)) >> PAGE_SHIFT)
1398
1399 int
ubc_uiomove(struct uvm_object * uobj,struct uio * uio,vsize_t todo,int advice,int flags)1400 ubc_uiomove(struct uvm_object *uobj, struct uio *uio, vsize_t todo,
1401 int advice, int flags)
1402 {
1403 struct vm_page **pgs;
1404 int npages = len2npages(uio->uio_offset, todo);
1405 size_t pgalloc;
1406 int i, rv, pagerflags;
1407 vm_prot_t prot;
1408
1409 pgalloc = npages * sizeof(pgs);
1410 pgs = kmem_alloc(pgalloc, KM_SLEEP);
1411
1412 pagerflags = PAGERFLAGS;
1413 if (flags & UBC_WRITE)
1414 pagerflags |= PGO_PASTEOF;
1415 if (flags & UBC_FAULTBUSY)
1416 pagerflags |= PGO_OVERWRITE;
1417
1418 prot = VM_PROT_READ;
1419 if (flags & UBC_WRITE)
1420 prot |= VM_PROT_WRITE;
1421
1422 rw_enter(uobj->vmobjlock, RW_WRITER);
1423 do {
1424 npages = len2npages(uio->uio_offset, todo);
1425 memset(pgs, 0, pgalloc);
1426 rv = uobj->pgops->pgo_get(uobj, trunc_page(uio->uio_offset),
1427 pgs, &npages, 0, prot, 0, pagerflags);
1428 if (rv)
1429 goto out;
1430
1431 rw_enter(uobj->vmobjlock, RW_WRITER);
1432 for (i = 0; i < npages; i++) {
1433 struct vm_page *pg;
1434 size_t xfersize;
1435 off_t pageoff;
1436
1437 pg = pgs[i];
1438 if (pg == NULL)
1439 break;
1440
1441 KASSERT(pg->uobject != NULL);
1442 KASSERT(uobj->vmobjlock == pg->uobject->vmobjlock);
1443 pageoff = uio->uio_offset & PAGE_MASK;
1444
1445 xfersize = MIN(MIN(todo, PAGE_SIZE), PAGE_SIZE-pageoff);
1446 KASSERT(xfersize > 0);
1447 rv = uiomove((uint8_t *)pg->uanon + pageoff,
1448 xfersize, uio);
1449 if (rv) {
1450 uvm_page_unbusy(pgs, npages);
1451 rw_exit(uobj->vmobjlock);
1452 goto out;
1453 }
1454 if (uio->uio_rw == UIO_WRITE) {
1455 pg->flags &= ~PG_FAKE;
1456 uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
1457 }
1458 todo -= xfersize;
1459 }
1460 uvm_page_unbusy(pgs, npages);
1461 } while (todo);
1462 rw_exit(uobj->vmobjlock);
1463
1464 out:
1465 kmem_free(pgs, pgalloc);
1466 return rv;
1467 }
1468