/*	$NetBSD: vm.c,v 1.194 2022/10/26 23:22:07 riastradh Exp $	*/

/*
 * Copyright (c) 2007-2011 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page.  phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.194 2022/10/26 23:22:07 riastradh Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>
#include <sys/radixtree.h>
#include <sys/module.h>

#include <machine/pmap.h>

#if defined(__i386__) || defined(__x86_64__)
/*
 * This file abuses the pmap abstraction to create its own statically
 * allocated struct pmap object, even though it can't do anything
 * useful with such a thing from userland.  On x86 the struct pmap
 * definition is private, so we have to go to extra effort to abuse it
 * there.  This should be fixed -- all of the struct pmap definitions
 * should be private, and then rump can furnish its own fake struct
 * pmap without clashing with anything.
 */
#include <machine/pmap_private.h>
#endif

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#include <rump-sys/kern.h>
#include <rump-sys/vfs.h>

#include <rump/rumpuser.h>

kmutex_t vmpage_lruqueue_lock; /* non-free page lock */
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
const int * const uvmexp_pagesize = &uvmexp.pagesize;
const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif

static struct vm_map kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store;

static struct vm_map module_map_store;

static struct pmap pmap_kernel;
struct pmap rump_pmap_local;
struct pmap *const kernel_pmap_ptr = &pmap_kernel;

vmem_t *kmem_arena;
vmem_t *kmem_va_arena;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

/* all local non-proc0 processes share this vmspace */
struct vmspace *rump_vmspace_local;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long pdlimit = RUMPMEM_UNLIMITED; /* page daemon memlimit */
static unsigned long curphysmem;
static unsigned long dddlim;		/* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
#define PDRESERVE (2*MAXPHYS)

/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))

/*
 * Keep a list of least recently used pages.  Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of queue every time a lookup for it is done.  If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

/*
 * vm pages
 */

static int
pgctor(void *arg, void *obj, int flags)
{
	struct vm_page *pg = obj;

	memset(pg, 0, sizeof(*pg));
	pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
	return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
	struct vm_page *pg = obj;

	rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;

/*
 * Called with the object locked.  We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
	int flags, int strat, int free_list)
{
	struct vm_page *pg;

	KASSERT(uobj && rw_write_held(uobj->vmobjlock));
	KASSERT(anon == NULL);

	pg = pool_cache_get(&pagecache, PR_NOWAIT);
	if (__predict_false(pg == NULL)) {
		return NULL;
	}
	mutex_init(&pg->interlock, MUTEX_DEFAULT, IPL_NONE);

	pg->offset = off;
	pg->uobject = uobj;

	if (radix_tree_insert_node(&uobj->uo_pages, off >> PAGE_SHIFT,
	    pg) != 0) {
		pool_cache_put(&pagecache, pg);
		return NULL;
	}

	if (UVM_OBJ_IS_VNODE(uobj)) {
		if (uobj->uo_npages == 0) {
			struct vnode *vp = (struct vnode *)uobj;
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_PAGES;
			mutex_exit(vp->v_interlock);
		}
		pg->flags |= PG_FILE;
	}
	uobj->uo_npages++;

	pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
	if (flags & UVM_PGA_ZERO) {
		uvm_pagezero(pg);
	}

	/*
	 * Don't put anons on the LRU page queue.  We can't flush them
	 * (there's no concept of swap in a rump kernel), so no reason
	 * to bother with them.
	 */
	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		atomic_inc_uint(&vmpage_onqueue);
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
	} else {
		pg->flags |= PG_AOBJ;
	}

	return pg;
}

/*
 * Release a page.
 *
 * Called with the vm object locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;
	struct vm_page *pg2 __unused;

	KASSERT(rw_write_held(uobj->vmobjlock));

	mutex_enter(&pg->interlock);
	uvm_pagewakeup(pg);
	mutex_exit(&pg->interlock);

	uobj->uo_npages--;
	pg2 = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
	KASSERT(pg == pg2);

	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
		atomic_dec_uint(&vmpage_onqueue);
	}

	if (UVM_OBJ_IS_VNODE(uobj) && uobj->uo_npages == 0) {
		struct vnode *vp = (struct vnode *)uobj;
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_PAGES;
		mutex_exit(vp->v_interlock);
	}

	mutex_destroy(&pg->interlock);
	pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
	memset((void *)pg->uanon, 0, PAGE_SIZE);
}

/*
 * uvm_page_owner_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (exclusive)
		return rw_write_held(pg->uobject->vmobjlock);
	else
		return rw_lock_held(pg->uobject->vmobjlock);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;

void
uvm_init(void)
{
	char buf[64];

	if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
		unsigned long tmp;
		char *ep;
		int mult;

		tmp = strtoul(buf, &ep, 10);
		if (strlen(ep) > 1)
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

		/* mini-dehumanize-number */
		mult = 1;
		switch (*ep) {
		case 'k':
			mult = 1024;
			break;
		case 'm':
			mult = 1024*1024;
			break;
		case 'g':
			mult = 1024*1024*1024;
			break;
		case 0:
			break;
		default:
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
		}
		rump_physmemlimit = tmp * mult;

		if (rump_physmemlimit / mult != tmp)
			panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);

		/* reserve some memory for the pager */
		if (rump_physmemlimit <= PDRESERVE)
			panic("uvm_init: system reserves %d bytes of mem, "
			    "only %lu bytes given",
			    PDRESERVE, rump_physmemlimit);
		pdlimit = rump_physmemlimit;
		rump_physmemlimit -= PDRESERVE;

		if (pdlimit < 1024*1024)
			printf("uvm_init: WARNING: <1MB RAM limit, "
			    "hope you know what you're doing\n");

#define HUMANIZE_BYTES 9
		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
		dddlim = 9 * (rump_physmemlimit / 10);
	} else {
		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
	}
	aprint_verbose("total memory = %s\n", buf);

	TAILQ_INIT(&vmpage_lruqueue);

	if (rump_physmemlimit == RUMPMEM_UNLIMITED) {
		uvmexp.npages = physmem;
	} else {
		uvmexp.npages = pdlimit >> PAGE_SHIFT;
		uvmexp.reserve_pagedaemon = PDRESERVE >> PAGE_SHIFT;
		uvmexp.freetarg = (rump_physmemlimit-dddlim) >> PAGE_SHIFT;
	}
	/*
	 * uvmexp.free is not used internally or updated.  The reason is
	 * that the memory hypercall allocator is allowed to allocate
	 * non-page sized chunks.  We use a byte count in curphysmem
	 * instead.
	 */
	uvmexp.free = uvmexp.npages;

#ifndef __uvmexp_pagesize
	uvmexp.pagesize = PAGE_SIZE;
	uvmexp.pagemask = PAGE_MASK;
	uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
	uvmexp.pageshift = FAKE_PAGE_SHIFT;
	uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
	uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

	mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&vmpage_lruqueue_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);

	cv_init(&pdaemoncv, "pdaemon");
	cv_init(&oomwait, "oomwait");

	module_map = &module_map_store;

	kernel_map->pmap = pmap_kernel();

	pool_subsystem_init();

	kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
	    NULL, NULL, NULL,
	    0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	vmem_subsystem_init(kmem_arena);

	kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
	    vmem_alloc, vmem_free, kmem_arena,
	    8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
	    "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);

	radix_tree_init();

	/* create vmspace used by local clients */
	rump_vmspace_local = kmem_zalloc(sizeof(*rump_vmspace_local), KM_SLEEP);
	uvmspace_init(rump_vmspace_local, &rump_pmap_local, 0, 0, false);
}

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
    bool topdown)
{

	vm->vm_map.pmap = pmap;
	vm->vm_refcnt = 1;
}

int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
	bool new_pageable, int lockflags)
{
	return 0;
}

void
uvm_pagewire(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

	/* nada */
}

int
uvm_availmem(bool cached)
{

	return uvmexp.free;
}

void
uvm_pagelock(struct vm_page *pg)
{

	mutex_enter(&pg->interlock);
}

void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{

	if (pg1 < pg2) {
		mutex_enter(&pg1->interlock);
		mutex_enter(&pg2->interlock);
	} else {
		mutex_enter(&pg2->interlock);
		mutex_enter(&pg1->interlock);
	}
}

void
uvm_pageunlock(struct vm_page *pg)
{

	mutex_exit(&pg->interlock);
}

void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{

	mutex_exit(&pg1->interlock);
	mutex_exit(&pg2->interlock);
}

/* where's your schmonz now? */
#define PUNLIMIT(a)	\
    p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

#ifndef DFLSSIZ
#define DFLSSIZ (16*1024*1024)
#endif
	p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
	p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
	PUNLIMIT(RLIMIT_DATA);
	PUNLIMIT(RLIMIT_RSS);
	PUNLIMIT(RLIMIT_AS);
	/* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 */
int
uvm_mmap_anon(struct proc *p, void **addrp, size_t size)
{
	int error;

	/* no reason in particular, but cf. uvm_default_mapaddr() */
	if (*addrp != NULL)
		panic("uvm_mmap() variant unsupported");

	if (RUMP_LOCALPROC_P(curproc)) {
		error = rumpuser_anonmmap(NULL, size, 0, 0, addrp);
	} else {
		error = rump_sysproxy_anonmmap(RUMP_SPVM2CTL(p->p_vmspace),
		    size, addrp);
	}
	return error;
}

/*
 * Stubs for things referenced from vfs_vnode.c but not used.
 */
const dev_t zerodev;

struct uvm_object *
udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size)
{
	return NULL;
}

struct pagerinfo {
	vaddr_t pgr_kva;
	int pgr_npages;
	struct vm_page **pgr_pgs;
	bool pgr_read;

	LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine.  Instead of mapping, we allocate memory
 * and copy page contents there.  The reason for copying instead of
 * mapping is simple: we do not assume we are running on virtual
 * memory.  Even if we could emulate virtual memory in some envs
 * such as userspace, copying is much faster than trying to awkwardly
 * cope with remapping (see "Design and Implementation" pp. 95-98).
 * The downside of the approach is that the pager requires MAXPHYS
 * free memory to perform paging, but short of virtual memory or
 * making the pager do I/O in page-sized chunks we cannot do much
 * about that.
 */
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	/* allocate structures */
	pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
	pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
	pgri->pgr_npages = npages;
	pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
	pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

	/* copy contents to "mapped" memory */
	for (i = 0, curkva = pgri->pgr_kva;
	    i < npages;
	    i++, curkva += PAGE_SIZE) {
		/*
		 * We need to copy the previous contents of the pages to
		 * the window even if we are reading from the
		 * device, since the device might not fill the contents of
		 * the full mapped range and we will end up corrupting
		 * data when we unmap the window.
		 */
		memcpy((void*)curkva, pgs[i]->uanon, PAGE_SIZE);
		pgri->pgr_pgs[i] = pgs[i];
	}

	mutex_enter(&pagermtx);
	LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
	mutex_exit(&pagermtx);

	return pgri->pgr_kva;
}

/*
 * map out the pager window.  return contents from VA to page storage
 * and free structures.
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva == kva)
			break;
	}
	KASSERT(pgri);
	if (pgri->pgr_npages != npages)
		panic("uvm_pagermapout: partial unmapping not supported");
	LIST_REMOVE(pgri, pgr_entries);
	mutex_exit(&pagermtx);

	if (pgri->pgr_read) {
		for (i = 0, curkva = pgri->pgr_kva;
		    i < pgri->pgr_npages;
		    i++, curkva += PAGE_SIZE) {
			memcpy(pgri->pgr_pgs[i]->uanon,(void*)curkva,PAGE_SIZE);
		}
	}

	kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
	kmem_free((void*)pgri->pgr_kva, npages * PAGE_SIZE);
	kmem_free(pgri, sizeof(*pgri));
}

/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
	struct pagerinfo *pgri;
	struct vm_page *pg = NULL;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva <= va
		    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
			break;
	}
	if (pgri) {
		i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
		pg = pgri->pgr_pgs[i];
	}
	mutex_exit(&pagermtx);

	return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout.  Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;
	bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

	pg = radix_tree_lookup_node(&uobj->uo_pages, off >> PAGE_SHIFT);
	if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
	}

	return pg;
}

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i, pageout_done;

	KASSERT(npgs > 0);

	pageout_done = 0;
	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL || pg == PGO_DONTCARE) {
			continue;
		}

#if 0
		KASSERT(uvm_page_owner_locked_p(pg, true));
#else
		/*
		 * uvm_page_owner_locked_p() is not available in rump,
		 * and rump doesn't support amaps anyway.
		 */
		KASSERT(rw_write_held(pg->uobject->vmobjlock));
#endif
		KASSERT(pg->flags & PG_BUSY);

		if (pg->flags & PG_PAGEOUT) {
			pg->flags &= ~PG_PAGEOUT;
			pg->flags |= PG_RELEASED;
			pageout_done++;
			atomic_inc_uint(&uvmexp.pdfreed);
		}
		if (pg->flags & PG_RELEASED) {
			KASSERT(pg->uobject != NULL ||
			    (pg->uanon != NULL && pg->uanon->an_ref > 0));
			pg->flags &= ~PG_RELEASED;
			uvm_pagefree(pg);
		} else {
			KASSERT((pg->flags & PG_FAKE) == 0);
			pg->flags &= ~PG_BUSY;
			uvm_pagelock(pg);
			uvm_pagewakeup(pg);
			uvm_pageunlock(pg);
			UVM_PAGE_OWN(pg, NULL);
		}
	}
	if (pageout_done != 0) {
		uvm_pageout_done(pageout_done);
	}
}

void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{

	KASSERT(rw_lock_held(lock));
	KASSERT((pg->flags & PG_BUSY) != 0);

	mutex_enter(&pg->interlock);
	pg->pqflags |= PQ_WANTED;
	rw_exit(lock);
	UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}

void
uvm_pagewakeup(struct vm_page *pg)
{

	KASSERT(mutex_owned(&pg->interlock));

	if ((pg->pqflags & PQ_WANTED) != 0) {
		pg->pqflags &= ~PQ_WANTED;
		wakeup(pg);
	}
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	/* XXX: guessing game */
	*active = 1024;
	*inactive = 1024;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

	panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
	struct vm_page **opp)
{

	return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
{

	return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
	vm_prot_t prot, bool set_max)
{

	return EOPNOTSUPP;
}

int
uvm_map(struct vm_map *map, vaddr_t *startp, vsize_t size,
	struct uvm_object *uobj, voff_t uoffset, vsize_t align,
	uvm_flag_t flags)
{

	*startp = (vaddr_t)rump_hypermalloc(size, align, true, "uvm_map");
	return *startp != 0 ? 0 : ENOMEM;
}

void
uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{

	rump_hyperfree((void*)start, end-start);
}


/*
 * UVM km
 */

vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
	void *rv, *desired = NULL;
	int alignbit, error;

#ifdef __x86_64__
	/*
	 * On amd64, allocate all module memory from the lowest 2GB.
	 * This is because NetBSD kernel modules are compiled
	 * with -mcmodel=kernel and reserve only 4 bytes for
	 * offsets.  If we load code compiled with -mcmodel=kernel
	 * anywhere except the lowest or highest 2GB, it will not
	 * work.  Since userspace does not have access to the highest
	 * 2GB, use the lowest 2GB.
	 *
	 * Note: this assumes the rump kernel resides in
	 * the lowest 2GB as well.
	 *
	 * Note2: yes, it's a quick hack, but since this is the only
	 * place where we care about the map we're allocating from,
	 * just use a simple "if" instead of coming up with a fancy
	 * generic solution.
	 */
	if (map == module_map) {
		desired = (void *)(0x80000000 - size);
	}
#endif

	if (__predict_false(map == module_map)) {
		alignbit = 0;
		if (align) {
			alignbit = ffs(align)-1;
		}
		error = rumpuser_anonmmap(desired, size, alignbit,
		    flags & UVM_KMF_EXEC, &rv);
	} else {
		error = rumpuser_malloc(size, align, &rv);
	}

	if (error) {
		if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
			return 0;
		else
			panic("uvm_km_alloc failed");
	}

	if (flags & UVM_KMF_ZERO)
		memset(rv, 0, size);

	return (vaddr_t)rv;
}

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

	if (__predict_false(map == module_map))
		rumpuser_unmap((void *)vaddr, size);
	else
		rumpuser_free((void *)vaddr, size);
}

int
uvm_km_protect(struct vm_map *map, vaddr_t vaddr, vsize_t size, vm_prot_t prot)
{
	return 0;
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
	vsize_t size, int pageable, bool fixed, struct vm_map *submap)
{

	return (struct vm_map *)417416;
}

int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
    vmem_addr_t *addr)
{
	vaddr_t va;
	va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
	    (flags & VM_SLEEP), "kmalloc");

	if (va) {
		*addr = va;
		return 0;
	} else {
		return ENOMEM;
	}
}

void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{

	rump_hyperfree((void *)addr, size);
}

/*
 * VM space locking routines.  We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

	return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
int
vmapbuf(struct buf *bp, vsize_t len)
{
	int error = 0;

	bp->b_saveaddr = bp->b_data;

	/* remote case */
	if (!RUMP_LOCALPROC_P(curproc)) {
		bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
		if (BUF_ISWRITE(bp)) {
			error = copyin(bp->b_saveaddr, bp->b_data, len);
			if (error) {
				rump_hyperfree(bp->b_data, len);
				bp->b_data = bp->b_saveaddr;
				bp->b_saveaddr = 0;
			}
		}
	}

	return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

	/* remote case */
	if (!RUMP_LOCALPROC_P(bp->b_proc)) {
		if (BUF_ISREAD(bp)) {
			bp->b_error = copyout_proc(bp->b_proc,
			    bp->b_data, bp->b_saveaddr, len);
		}
		rump_hyperfree(bp->b_data, len);
	}

	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = 0;
}

void
uvmspace_addref(struct vmspace *vm)
{

	/*
	 * No dynamically allocated vmspaces exist.
	 */
}

void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}

/*
 * page life cycle stuff.  it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

	return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return 0;
}

vaddr_t
uvm_uarea_alloc(void)
{

	/* non-zero */
	return (vaddr_t)11;
}

void
uvm_uarea_free(vaddr_t uarea)
{

	/* nata, so creamy */
}

/*
 * Routines related to the Page Baroness.
 */

void
uvm_wait(const char *msg)
{

	if (__predict_false(rump_threads == 0))
		panic("pagedaemon missing (RUMP_THREADS = 0)");

	if (curlwp == uvm.pagedaemon_lwp) {
		/* is it possible for us to later get memory? */
		if (!uvmexp.paging)
			panic("pagedaemon out of memory");
	}

	mutex_enter(&pdaemonmtx);
	pdaemon_waiters++;
	cv_signal(&pdaemoncv);
	cv_wait(&oomwait, &pdaemonmtx);
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_start(int npages)
{

	mutex_enter(&pdaemonmtx);
	uvmexp.paging += npages;
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

	if (!npages)
		return;

	mutex_enter(&pdaemonmtx);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	if (pdaemon_waiters) {
		pdaemon_waiters = 0;
		cv_broadcast(&oomwait);
	}
	mutex_exit(&pdaemonmtx);
}

static bool
processpage(struct vm_page *pg)
{
	struct uvm_object *uobj;

	uobj = pg->uobject;
	if (rw_tryenter(uobj->vmobjlock, RW_WRITER)) {
		if ((pg->flags & PG_BUSY) == 0) {
			mutex_exit(&vmpage_lruqueue_lock);
			uobj->pgops->pgo_put(uobj, pg->offset,
			    pg->offset + PAGE_SIZE,
			    PGO_CLEANIT|PGO_FREE);
			KASSERT(!rw_write_held(uobj->vmobjlock));
			return true;
		} else {
			rw_exit(uobj->vmobjlock);
		}
	}

	return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
	struct vm_page *pg;
	struct pool *pp, *pp_first;
	int cleaned, skip, skipped;
	bool succ;

	mutex_enter(&pdaemonmtx);
	for (;;) {
		if (pdaemon_waiters) {
			pdaemon_waiters = 0;
			cv_broadcast(&oomwait);
		}
		if (!NEED_PAGEDAEMON()) {
			kernel_map->flags &= ~VM_MAP_WANTVA;
			cv_wait(&pdaemoncv, &pdaemonmtx);
		}
		uvmexp.pdwoke++;

		/* tell the world that we are hungry */
		kernel_map->flags |= VM_MAP_WANTVA;
		mutex_exit(&pdaemonmtx);

		/*
		 * step one: reclaim the page cache.  this should give
		 * us the biggest earnings since whole pages are released
		 * into backing memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Ok, so that didn't help.  Next, try to hunt memory
		 * by pushing out vnode pages.  The pages might contain
		 * useful cached data, but we need the memory.
		 */
		cleaned = 0;
		skip = 0;
 again:
		mutex_enter(&vmpage_lruqueue_lock);
		while (cleaned < PAGEDAEMON_OBJCHUNK) {
			skipped = 0;
			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

				/*
				 * skip over pages we _might_ have tried
				 * to handle earlier.  they might not be
				 * exactly the same ones, but I'm not too
				 * concerned.
				 */
				while (skipped++ < skip)
					continue;

				if (processpage(pg)) {
					cleaned++;
					goto again;
				}

				skip++;
			}
			break;
		}
		mutex_exit(&vmpage_lruqueue_lock);

		/*
		 * And of course we need to reclaim the page cache
		 * again to actually release memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * And then drain the pools.  Wipe them out ... all of them.
		 */
		for (pp_first = NULL;;) {
			rump_vfs_drainbufs(10 /* XXX: estimate! */);

			succ = pool_drain(&pp);
			if (succ || pp == pp_first)
				break;

			if (pp_first == NULL)
				pp_first = pp;
		}

		/*
		 * Need to use PYEC on our bag of tricks.
		 * Unfortunately, the wife just borrowed it.
		 */

		mutex_enter(&pdaemonmtx);
		if (!succ && cleaned == 0 && pdaemon_waiters &&
		    uvmexp.paging == 0) {
			kpause("pddlk", false, hz, &pdaemonmtx);
		}
	}

	panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

	/*
	 * Wake up the diabolical pagedaemon director if we are over
	 * 90% of the memory limit.  This is a complete and utter
	 * stetson-harrison decision which you are allowed to finetune.
	 * Don't bother locking.  If we have some unflushed caches,
	 * other waker-uppers will deal with the issue.
	 */
	if (NEED_PAGEDAEMON()) {
		cv_signal(&pdaemoncv);
	}
}

void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
	const unsigned long thelimit =
	    curlwp == uvm.pagedaemon_lwp ? pdlimit : rump_physmemlimit;
	unsigned long newmem;
	void *rv;
	int error;

	uvm_kick_pdaemon(); /* ouch */

	/* first we must be within the limit */
 limitagain:
	if (thelimit != RUMPMEM_UNLIMITED) {
		newmem = atomic_add_long_nv(&curphysmem, howmuch);
		if (newmem > thelimit) {
			newmem = atomic_add_long_nv(&curphysmem, -howmuch);
			if (!waitok) {
				return NULL;
			}
			uvm_wait(wmsg);
			goto limitagain;
		}
	}

	/* second, we must get something from the backend */
 again:
	error = rumpuser_malloc(howmuch, alignment, &rv);
	if (__predict_false(error && waitok)) {
		uvm_wait(wmsg);
		goto again;
	}

	return rv;
}

void
rump_hyperfree(void *what, size_t size)
{

	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		atomic_add_long(&curphysmem, -size);
	}
	rumpuser_free(what, size);
}
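
/*
 * Illustrative note (editor's addition, not part of the original file):
 * the limit machinery above is driven by the RUMP_MEMLIMIT hypervisor
 * parameter parsed in uvm_init().  As a sketch, a hosted client using
 * the POSIX rumpuser (where rumpuser_getparam() reads the environment)
 * could bound hypercall memory to roughly 16MB before bootstrapping:
 *
 *	setenv("RUMP_MEMLIMIT", "16m", 1);
 *	rump_init();
 *
 * With that in place, rump_hypermalloc() either returns NULL or sleeps
 * in uvm_wait() (depending on waitok) once curphysmem would exceed the
 * limit minus PDRESERVE, and uvm_kick_pdaemon() wakes the pagedaemon
 * when usage crosses the 90% watermark tracked in dddlim.
 */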