/*	$NetBSD: vm.c,v 1.190 2020/06/11 19:20:46 ad Exp $	*/

/*
 * Copyright (c) 2007-2011 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page.  phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.190 2020/06/11 19:20:46 ad Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>
#include <sys/radixtree.h>

#include <machine/pmap.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>
#include <uvm/uvm_device.h>

#include <rump-sys/kern.h>
#include <rump-sys/vfs.h>

#include <rump/rumpuser.h>

kmutex_t vmpage_lruqueue_lock; /* non-free page lock */
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
const int * const uvmexp_pagesize = &uvmexp.pagesize;
const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif

static struct vm_map kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store;

static struct vm_map module_map_store;
extern struct vm_map *module_map;

static struct pmap pmap_kernel;
struct pmap rump_pmap_local;
struct pmap *const kernel_pmap_ptr = &pmap_kernel;

vmem_t *kmem_arena;
vmem_t *kmem_va_arena;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

/* all local non-proc0 processes share this vmspace */
struct vmspace *rump_vmspace_local;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long pdlimit = RUMPMEM_UNLIMITED; /* page daemon memlimit */
static unsigned long curphysmem;
static unsigned long dddlim;		/* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
#define PDRESERVE (2*MAXPHYS)

/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))

/*
 * Keep a list of least recently used pages.  Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of queue every time a lookup for it is done.  If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

/*
 * vm pages
 */

static int
pgctor(void *arg, void *obj, int flags)
{
	struct vm_page *pg = obj;

	memset(pg, 0, sizeof(*pg));
	pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
	return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
	struct vm_page *pg = obj;

	rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;

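/*
 * Example (illustrative sketch only): every vm_page handed out by this
 * cache is backed by one host page allocated in pgctor() and stored in
 * pg->uanon, so page contents are reached with a plain memory access
 * rather than a mapping, roughly:
 *
 *	struct vm_page *pg = uvm_pagelookup(uobj, off);
 *	if (pg != NULL)
 *		memcpy(buf, pg->uanon, PAGE_SIZE);
 *
 * (uobj, off and buf above are hypothetical caller-side variables.)
 */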

/*
 * Called with the object locked.  We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
	int flags, int strat, int free_list)
{
	struct vm_page *pg;

	KASSERT(uobj && rw_write_held(uobj->vmobjlock));
	KASSERT(anon == NULL);

	pg = pool_cache_get(&pagecache, PR_NOWAIT);
	if (__predict_false(pg == NULL)) {
		return NULL;
	}
	mutex_init(&pg->interlock, MUTEX_DEFAULT, IPL_NONE);

	pg->offset = off;
	pg->uobject = uobj;

	if (radix_tree_insert_node(&uobj->uo_pages, off >> PAGE_SHIFT,
	    pg) != 0) {
		pool_cache_put(&pagecache, pg);
		return NULL;
	}

	if (UVM_OBJ_IS_VNODE(uobj)) {
		if (uobj->uo_npages == 0) {
			struct vnode *vp = (struct vnode *)uobj;
			mutex_enter(vp->v_interlock);
			vp->v_iflag |= VI_PAGES;
			mutex_exit(vp->v_interlock);
		}
		pg->flags |= PG_FILE;
	}
	uobj->uo_npages++;

	pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
	if (flags & UVM_PGA_ZERO) {
		uvm_pagezero(pg);
	}

	/*
	 * Don't put anons on the LRU page queue.  We can't flush them
	 * (there's no concept of swap in a rump kernel), so no reason
	 * to bother with them.
	 */
	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		atomic_inc_uint(&vmpage_onqueue);
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
	} else {
		pg->flags |= PG_AOBJ;
	}

	return pg;
}

/*
 * Release a page.
 *
 * Called with the vm object locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;
	struct vm_page *pg2 __unused;

	KASSERT(rw_write_held(uobj->vmobjlock));

	mutex_enter(&pg->interlock);
	uvm_pagewakeup(pg);
	mutex_exit(&pg->interlock);

	uobj->uo_npages--;
	pg2 = radix_tree_remove_node(&uobj->uo_pages, pg->offset >> PAGE_SHIFT);
	KASSERT(pg == pg2);

	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
		atomic_dec_uint(&vmpage_onqueue);
	}

	if (UVM_OBJ_IS_VNODE(uobj) && uobj->uo_npages == 0) {
		struct vnode *vp = (struct vnode *)uobj;
		mutex_enter(vp->v_interlock);
		vp->v_iflag &= ~VI_PAGES;
		mutex_exit(vp->v_interlock);
	}

	mutex_destroy(&pg->interlock);
	pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

	uvm_pagemarkdirty(pg, UVM_PAGE_STATUS_DIRTY);
	memset((void *)pg->uanon, 0, PAGE_SIZE);
}
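
/*
 * Usage sketch (illustrative only, with hypothetical uobj/off variables):
 * pages are allocated and freed with the object write-locked, and the
 * PG_BUSY state is dropped via uvm_page_unbusy() once the caller has
 * filled in the contents:
 *
 *	rw_enter(uobj->vmobjlock, RW_WRITER);
 *	pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO);
 *	if (pg != NULL)
 *		uvm_page_unbusy(&pg, 1);
 *	rw_exit(uobj->vmobjlock);
 */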

/*
 * uvm_page_owner_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_owner_locked_p(struct vm_page *pg, bool exclusive)
{

	if (exclusive)
		return rw_write_held(pg->uobject->vmobjlock);
	else
		return rw_lock_held(pg->uobject->vmobjlock);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;

void
uvm_init(void)
{
	char buf[64];

	if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
		unsigned long tmp;
		char *ep;
		int mult;

		tmp = strtoul(buf, &ep, 10);
		if (strlen(ep) > 1)
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

		/* mini-dehumanize-number */
		mult = 1;
		switch (*ep) {
		case 'k':
			mult = 1024;
			break;
		case 'm':
			mult = 1024*1024;
			break;
		case 'g':
			mult = 1024*1024*1024;
			break;
		case 0:
			break;
		default:
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
		}
		rump_physmemlimit = tmp * mult;

		if (rump_physmemlimit / mult != tmp)
			panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);

		/* reserve some memory for the pager */
		if (rump_physmemlimit <= PDRESERVE)
			panic("uvm_init: system reserves %d bytes of mem, "
			    "only %lu bytes given",
			    PDRESERVE, rump_physmemlimit);
		pdlimit = rump_physmemlimit;
		rump_physmemlimit -= PDRESERVE;

		if (pdlimit < 1024*1024)
			printf("uvm_init: WARNING: <1MB RAM limit, "
			    "hope you know what you're doing\n");

#define HUMANIZE_BYTES 9
		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
		dddlim = 9 * (rump_physmemlimit / 10);
	} else {
		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
	}
	aprint_verbose("total memory = %s\n", buf);
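	/*
	 * Worked example (illustrative): RUMP_MEMLIMIT=16m gives a total
	 * of 16MB, of which PDRESERVE (2*MAXPHYS) bytes are held back
	 * for the pagedaemon; dddlim ends up at 90% of the remainder,
	 * which is the point where NEED_PAGEDAEMON() starts to fire.
	 */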

	TAILQ_INIT(&vmpage_lruqueue);

	if (rump_physmemlimit == RUMPMEM_UNLIMITED) {
		uvmexp.npages = physmem;
	} else {
		uvmexp.npages = pdlimit >> PAGE_SHIFT;
		uvmexp.reserve_pagedaemon = PDRESERVE >> PAGE_SHIFT;
		uvmexp.freetarg = (rump_physmemlimit-dddlim) >> PAGE_SHIFT;
	}
	/*
	 * uvmexp.free is not used internally or updated.  The reason is
	 * that the memory hypercall allocator is allowed to allocate
	 * non-page sized chunks.  We use a byte count in curphysmem
	 * instead.
	 */
	uvmexp.free = uvmexp.npages;

#ifndef __uvmexp_pagesize
	uvmexp.pagesize = PAGE_SIZE;
	uvmexp.pagemask = PAGE_MASK;
	uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
	uvmexp.pageshift = FAKE_PAGE_SHIFT;
	uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
	uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

	mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&vmpage_lruqueue_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);

	cv_init(&pdaemoncv, "pdaemon");
	cv_init(&oomwait, "oomwait");

	module_map = &module_map_store;

	kernel_map->pmap = pmap_kernel();

	pool_subsystem_init();

	kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
	    NULL, NULL, NULL,
	    0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	vmem_subsystem_init(kmem_arena);

	kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
	    vmem_alloc, vmem_free, kmem_arena,
	    8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
	    "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);

	radix_tree_init();

	/* create vmspace used by local clients */
	rump_vmspace_local = kmem_zalloc(sizeof(*rump_vmspace_local), KM_SLEEP);
	uvmspace_init(rump_vmspace_local, &rump_pmap_local, 0, 0, false);
}

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
	bool topdown)
{

	vm->vm_map.pmap = pmap;
	vm->vm_refcnt = 1;
}

int
uvm_map_pageable(struct vm_map *map, vaddr_t start, vaddr_t end,
	bool new_pageable, int lockflags)
{
	return 0;
}

void
uvm_pagewire(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

	/* nada */
}

int
uvm_availmem(bool cached)
{

	return uvmexp.free;
}

void
uvm_pagelock(struct vm_page *pg)
{

	mutex_enter(&pg->interlock);
}

void
uvm_pagelock2(struct vm_page *pg1, struct vm_page *pg2)
{

	if (pg1 < pg2) {
		mutex_enter(&pg1->interlock);
		mutex_enter(&pg2->interlock);
	} else {
		mutex_enter(&pg2->interlock);
		mutex_enter(&pg1->interlock);
	}
}

void
uvm_pageunlock(struct vm_page *pg)
{

	mutex_exit(&pg->interlock);
}

void
uvm_pageunlock2(struct vm_page *pg1, struct vm_page *pg2)
{

	mutex_exit(&pg1->interlock);
	mutex_exit(&pg2->interlock);
}

/* where's your schmonz now? */
#define PUNLIMIT(a)	\
    p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

#ifndef DFLSSIZ
#define DFLSSIZ (16*1024*1024)
#endif
	p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
	p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
	PUNLIMIT(RLIMIT_DATA);
	PUNLIMIT(RLIMIT_RSS);
	PUNLIMIT(RLIMIT_AS);
	/* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 */
int
uvm_mmap_anon(struct proc *p, void **addrp, size_t size)
{
	int error;

	/* no reason in particular, but cf. uvm_default_mapaddr() */
	if (*addrp != NULL)
		panic("uvm_mmap() variant unsupported");

	if (RUMP_LOCALPROC_P(curproc)) {
		error = rumpuser_anonmmap(NULL, size, 0, 0, addrp);
	} else {
		error = rump_sysproxy_anonmmap(RUMP_SPVM2CTL(p->p_vmspace),
		    size, addrp);
	}
	return error;
}

/*
 * Stubs for things referenced from vfs_vnode.c but not used.
 */
const dev_t zerodev;

struct uvm_object *
udv_attach(dev_t device, vm_prot_t accessprot, voff_t off, vsize_t size)
{
	return NULL;
}

struct pagerinfo {
	vaddr_t pgr_kva;
	int pgr_npages;
	struct vm_page **pgr_pgs;
	bool pgr_read;

	LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine.  Instead of mapping, we allocate memory
 * and copy page contents there.  The reason for copying instead of
 * mapping is simple: we do not assume we are running on virtual
 * memory.  Even if we could emulate virtual memory in some envs
 * such as userspace, copying is much faster than trying to awkwardly
 * cope with remapping (see "Design and Implementation" pp. 95-98).
 * The downside of the approach is that the pager requires MAXPHYS
 * free memory to perform paging, but short of virtual memory or
 * making the pager do I/O in page-sized chunks we cannot do much
 * about that.
 */
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	/* allocate structures */
	pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
	pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
	pgri->pgr_npages = npages;
	pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
	pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

	/* copy contents to "mapped" memory */
	for (i = 0, curkva = pgri->pgr_kva;
	    i < npages;
	    i++, curkva += PAGE_SIZE) {
		/*
		 * We need to copy the previous contents of the pages to
		 * the window even if we are reading from the
		 * device, since the device might not fill the contents of
		 * the full mapped range and we will end up corrupting
		 * data when we unmap the window.
		 */
		memcpy((void*)curkva, pgs[i]->uanon, PAGE_SIZE);
		pgri->pgr_pgs[i] = pgs[i];
	}

	mutex_enter(&pagermtx);
	LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
	mutex_exit(&pagermtx);

	return pgri->pgr_kva;
}
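
/*
 * Usage sketch (illustrative only): a pager typically "maps" the pages,
 * performs I/O on the window and then unmaps it, which for a read copies
 * the fresh data back into the page storage:
 *
 *	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_READ);
 *	error = do_io((void *)kva, npages << PAGE_SHIFT);
 *	uvm_pagermapout(kva, npages);
 *
 * do_io() above is a placeholder for the actual I/O routine.
 */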

/*
 * map out the pager window.  return contents from VA to page storage
 * and free structures.
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva == kva)
			break;
	}
	KASSERT(pgri);
	if (pgri->pgr_npages != npages)
		panic("uvm_pagermapout: partial unmapping not supported");
	LIST_REMOVE(pgri, pgr_entries);
	mutex_exit(&pagermtx);

	if (pgri->pgr_read) {
		for (i = 0, curkva = pgri->pgr_kva;
		    i < pgri->pgr_npages;
		    i++, curkva += PAGE_SIZE) {
			memcpy(pgri->pgr_pgs[i]->uanon,(void*)curkva,PAGE_SIZE);
		}
	}

	kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
	kmem_free((void*)pgri->pgr_kva, npages * PAGE_SIZE);
	kmem_free(pgri, sizeof(*pgri));
}

/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
	struct pagerinfo *pgri;
	struct vm_page *pg = NULL;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva <= va
		    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
			break;
	}
	if (pgri) {
		i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
		pg = pgri->pgr_pgs[i];
	}
	mutex_exit(&pagermtx);

	return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout.  Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;
	bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

	pg = radix_tree_lookup_node(&uobj->uo_pages, off >> PAGE_SHIFT);
	if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
		mutex_enter(&vmpage_lruqueue_lock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&vmpage_lruqueue_lock);
	}

	return pg;
}

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i;

	KASSERT(npgs > 0);
	KASSERT(rw_write_held(pgs[0]->uobject->vmobjlock));

	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL)
			continue;

		KASSERT(pg->flags & PG_BUSY);
		if (pg->flags & PG_RELEASED) {
			uvm_pagefree(pg);
		} else {
			pg->flags &= ~PG_BUSY;
			uvm_pagelock(pg);
			uvm_pagewakeup(pg);
			uvm_pageunlock(pg);
		}
	}
}

void
uvm_pagewait(struct vm_page *pg, krwlock_t *lock, const char *wmesg)
{

	KASSERT(rw_lock_held(lock));
	KASSERT((pg->flags & PG_BUSY) != 0);

	mutex_enter(&pg->interlock);
	pg->pqflags |= PQ_WANTED;
	rw_exit(lock);
	UVM_UNLOCK_AND_WAIT(pg, &pg->interlock, false, wmesg, 0);
}

void
uvm_pagewakeup(struct vm_page *pg)
{

	KASSERT(mutex_owned(&pg->interlock));

	if ((pg->pqflags & PQ_WANTED) != 0) {
		pg->pqflags &= ~PQ_WANTED;
		wakeup(pg);
	}
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	/* XXX: guessing game */
	*active = 1024;
	*inactive = 1024;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

	panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
	struct vm_page **opp)
{

	return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz, int topdown)
{

	return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
	vm_prot_t prot, bool set_max)
{

	return EOPNOTSUPP;
}

int
uvm_map(struct vm_map *map, vaddr_t *startp, vsize_t size,
	struct uvm_object *uobj, voff_t uoffset, vsize_t align,
	uvm_flag_t flags)
{

	*startp = (vaddr_t)rump_hypermalloc(size, align, true, "uvm_map");
	return *startp != 0 ? 0 : ENOMEM;
}

void
uvm_unmap1(struct vm_map *map, vaddr_t start, vaddr_t end, int flags)
{

	rump_hyperfree((void*)start, end-start);
}


/*
 * UVM km
 */

vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
	void *rv, *desired = NULL;
	int alignbit, error;

#ifdef __x86_64__
	/*
	 * On amd64, allocate all module memory from the lowest 2GB.
	 * This is because NetBSD kernel modules are compiled
	 * with -mcmodel=kernel and reserve only 4 bytes for
	 * offsets.  If we load code compiled with -mcmodel=kernel
	 * anywhere except the lowest or highest 2GB, it will not
	 * work.  Since userspace does not have access to the highest
	 * 2GB, use the lowest 2GB.
	 *
	 * Note: this assumes the rump kernel resides in
	 * the lowest 2GB as well.
	 *
	 * Note2: yes, it's a quick hack, but since this is the only
	 * place where we care about the map we're allocating from,
	 * just use a simple "if" instead of coming up with a fancy
	 * generic solution.
	 */
	if (map == module_map) {
		desired = (void *)(0x80000000 - size);
	}
#endif

	if (__predict_false(map == module_map)) {
		alignbit = 0;
		if (align) {
			alignbit = ffs(align)-1;
		}
		error = rumpuser_anonmmap(desired, size, alignbit,
		    flags & UVM_KMF_EXEC, &rv);
	} else {
		error = rumpuser_malloc(size, align, &rv);
	}

	if (error) {
		if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
			return 0;
		else
			panic("uvm_km_alloc failed");
	}

	if (flags & UVM_KMF_ZERO)
		memset(rv, 0, size);

	return (vaddr_t)rv;
}

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

	if (__predict_false(map == module_map))
		rumpuser_unmap((void *)vaddr, size);
	else
		rumpuser_free((void *)vaddr, size);
}

int
uvm_km_protect(struct vm_map *map, vaddr_t vaddr, vsize_t size, vm_prot_t prot)
{
	return 0;
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
	vsize_t size, int pageable, bool fixed, struct vm_map *submap)
{

	return (struct vm_map *)417416;
}

int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
	vmem_addr_t *addr)
{
	vaddr_t va;
	va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
	    (flags & VM_SLEEP), "kmalloc");

	if (va) {
		*addr = va;
		return 0;
	} else {
		return ENOMEM;
	}
}

void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{

	rump_hyperfree((void *)addr, size);
}

/*
 * VM space locking routines.  We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

	return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
int
vmapbuf(struct buf *bp, vsize_t len)
{
	int error = 0;

	bp->b_saveaddr = bp->b_data;

	/* remote case */
	if (!RUMP_LOCALPROC_P(curproc)) {
		bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
		if (BUF_ISWRITE(bp)) {
			error = copyin(bp->b_saveaddr, bp->b_data, len);
			if (error) {
				rump_hyperfree(bp->b_data, len);
				bp->b_data = bp->b_saveaddr;
				bp->b_saveaddr = 0;
			}
		}
	}

	return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

	/* remote case */
	if (!RUMP_LOCALPROC_P(bp->b_proc)) {
		if (BUF_ISREAD(bp)) {
			bp->b_error = copyout_proc(bp->b_proc,
			    bp->b_data, bp->b_saveaddr, len);
		}
		rump_hyperfree(bp->b_data, len);
	}

	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = 0;
}

void
uvmspace_addref(struct vmspace *vm)
{

	/*
	 * No dynamically allocated vmspaces exist.
	 */
}

void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}

/*
 * page life cycle stuff.  it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

	return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return 0;
}

vaddr_t
uvm_uarea_alloc(void)
{

	/* non-zero */
	return (vaddr_t)11;
}

void
uvm_uarea_free(vaddr_t uarea)
{

	/* nata, so creamy */
}
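
/*
 * Descriptive note on the routines below: memory waiters and the
 * pagedaemon rendezvous through pdaemonmtx.  A starved caller bumps
 * pdaemon_waiters, signals pdaemoncv and sleeps on oomwait;
 * uvm_pageout() reclaims what it can and broadcasts oomwait so that
 * the waiters retry their allocations.
 */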

/*
 * Routines related to the Page Baroness.
 */

void
uvm_wait(const char *msg)
{

	if (__predict_false(rump_threads == 0))
		panic("pagedaemon missing (RUMP_THREADS = 0)");

	if (curlwp == uvm.pagedaemon_lwp) {
		/* is it possible for us to later get memory? */
		if (!uvmexp.paging)
			panic("pagedaemon out of memory");
	}

	mutex_enter(&pdaemonmtx);
	pdaemon_waiters++;
	cv_signal(&pdaemoncv);
	cv_wait(&oomwait, &pdaemonmtx);
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_start(int npages)
{

	mutex_enter(&pdaemonmtx);
	uvmexp.paging += npages;
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

	if (!npages)
		return;

	mutex_enter(&pdaemonmtx);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	if (pdaemon_waiters) {
		pdaemon_waiters = 0;
		cv_broadcast(&oomwait);
	}
	mutex_exit(&pdaemonmtx);
}

static bool
processpage(struct vm_page *pg)
{
	struct uvm_object *uobj;

	uobj = pg->uobject;
	if (rw_tryenter(uobj->vmobjlock, RW_WRITER)) {
		if ((pg->flags & PG_BUSY) == 0) {
			mutex_exit(&vmpage_lruqueue_lock);
			uobj->pgops->pgo_put(uobj, pg->offset,
			    pg->offset + PAGE_SIZE,
			    PGO_CLEANIT|PGO_FREE);
			KASSERT(!rw_write_held(uobj->vmobjlock));
			return true;
		} else {
			rw_exit(uobj->vmobjlock);
		}
	}

	return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
	struct vm_page *pg;
	struct pool *pp, *pp_first;
	int cleaned, skip, skipped;
	bool succ;

	mutex_enter(&pdaemonmtx);
	for (;;) {
		if (pdaemon_waiters) {
			pdaemon_waiters = 0;
			cv_broadcast(&oomwait);
		}
		if (!NEED_PAGEDAEMON()) {
			kernel_map->flags &= ~VM_MAP_WANTVA;
			cv_wait(&pdaemoncv, &pdaemonmtx);
		}
		uvmexp.pdwoke++;

		/* tell the world that we are hungry */
		kernel_map->flags |= VM_MAP_WANTVA;
		mutex_exit(&pdaemonmtx);

		/*
		 * step one: reclaim the page cache.  this should give
		 * us the biggest earnings since whole pages are released
		 * into backing memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Ok, so that didn't help.  Next, try to hunt memory
		 * by pushing out vnode pages.  The pages might contain
		 * useful cached data, but we need the memory.
		 */
		cleaned = 0;
		skip = 0;
 again:
		mutex_enter(&vmpage_lruqueue_lock);
		while (cleaned < PAGEDAEMON_OBJCHUNK) {
			skipped = 0;
			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

				/*
				 * skip over pages we _might_ have tried
				 * to handle earlier.  they might not be
				 * exactly the same ones, but I'm not too
				 * concerned.
				 */
				while (skipped++ < skip)
					continue;

				if (processpage(pg)) {
					cleaned++;
					goto again;
				}

				skip++;
			}
			break;
		}
		mutex_exit(&vmpage_lruqueue_lock);

		/*
		 * And of course we need to reclaim the page cache
		 * again to actually release memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * And then drain the pools.  Wipe them out ... all of them.
		 */
		for (pp_first = NULL;;) {
			rump_vfs_drainbufs(10 /* XXX: estimate! */);

			succ = pool_drain(&pp);
			if (succ || pp == pp_first)
				break;

			if (pp_first == NULL)
				pp_first = pp;
		}

		/*
		 * Need to use PYEC on our bag of tricks.
		 * Unfortunately, the wife just borrowed it.
		 */

		mutex_enter(&pdaemonmtx);
		if (!succ && cleaned == 0 && pdaemon_waiters &&
		    uvmexp.paging == 0) {
			kpause("pddlk", false, hz, &pdaemonmtx);
		}
	}

	panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

	/*
	 * Wake up the diabolical pagedaemon director if we are over
	 * 90% of the memory limit.  This is a complete and utter
	 * stetson-harrison decision which you are allowed to finetune.
	 * Don't bother locking.  If we have some unflushed caches,
	 * other waker-uppers will deal with the issue.
	 */
	if (NEED_PAGEDAEMON()) {
		cv_signal(&pdaemoncv);
	}
}

void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
	const unsigned long thelimit =
	    curlwp == uvm.pagedaemon_lwp ? pdlimit : rump_physmemlimit;
	unsigned long newmem;
	void *rv;
	int error;

	uvm_kick_pdaemon(); /* ouch */

	/* first we must be within the limit */
 limitagain:
	if (thelimit != RUMPMEM_UNLIMITED) {
		newmem = atomic_add_long_nv(&curphysmem, howmuch);
		if (newmem > thelimit) {
			newmem = atomic_add_long_nv(&curphysmem, -howmuch);
			if (!waitok) {
				return NULL;
			}
			uvm_wait(wmsg);
			goto limitagain;
		}
	}

	/* second, we must get something from the backend */
 again:
	error = rumpuser_malloc(howmuch, alignment, &rv);
	if (__predict_false(error && waitok)) {
		uvm_wait(wmsg);
		goto again;
	}

	return rv;
}

void
rump_hyperfree(void *what, size_t size)
{

	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		atomic_add_long(&curphysmem, -size);
	}
	rumpuser_free(what, size);
}