/* $NetBSD: vm.c,v 1.146 2013/11/23 22:24:31 christos Exp $ */

/*
 * Copyright (c) 2007-2011 Antti Kantee. All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page. phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
 */
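
/*
 * Illustrative only: with the convention above, a page's backing
 * storage is reached with a simple pointer cast, e.g.
 *
 *      memset((void *)pg->uanon, 0, PAGE_SIZE);
 *
 * which is exactly what uvm_pagezero() below does.
 */
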
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.146 2013/11/23 22:24:31 christos Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>

#include <machine/pmap.h>

#include <rump/rumpuser.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

kmutex_t uvm_pageqlock;
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
const int * const uvmexp_pagesize = &uvmexp.pagesize;
const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif

struct vm_map rump_vmmap;

static struct vm_map kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store;

static struct vm_map module_map_store;
extern struct vm_map *module_map;

vmem_t *kmem_arena;
vmem_t *kmem_va_arena;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long curphysmem;
static unsigned long dddlim;            /* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)

/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))

/*
 * Keep a list of least recently used pages. Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of the queue every time a lookup for it is done. If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

static int
pg_compare_key(void *ctx, const void *n, const void *key)
{
        voff_t a = ((const struct vm_page *)n)->offset;
        voff_t b = *(const voff_t *)key;

        if (a < b)
                return -1;
        else if (a > b)
                return 1;
        else
                return 0;
}

static int
pg_compare_nodes(void *ctx, const void *n1, const void *n2)
{

        return pg_compare_key(ctx, n1, &((const struct vm_page *)n2)->offset);
}

const rb_tree_ops_t uvm_page_tree_ops = {
        .rbto_compare_nodes = pg_compare_nodes,
        .rbto_compare_key = pg_compare_key,
        .rbto_node_offset = offsetof(struct vm_page, rb_node),
        .rbto_context = NULL
};

/*
 * vm pages 
 */

static int
pgctor(void *arg, void *obj, int flags)
{
        struct vm_page *pg = obj;

        memset(pg, 0, sizeof(*pg));
        pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
            (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
        return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
        struct vm_page *pg = obj;

        rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;
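
/*
 * Note: each struct vm_page cached above comes with a PAGE_SIZE chunk
 * of hypervisor memory already attached via pgctor()/pgdtor(), so
 * pool_cache_get() hands out a page together with its storage and
 * pool_cache_reclaim() is what eventually returns memory to the host.
 */
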
/*
 * Called with the object locked. We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
    int flags, int strat, int free_list)
{
        struct vm_page *pg;

        KASSERT(uobj && mutex_owned(uobj->vmobjlock));
        KASSERT(anon == NULL);

        pg = pool_cache_get(&pagecache, PR_NOWAIT);
        if (__predict_false(pg == NULL)) {
                return NULL;
        }

        pg->offset = off;
        pg->uobject = uobj;

        pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
        if (flags & UVM_PGA_ZERO) {
                uvm_pagezero(pg);
        }

        TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue);
        (void)rb_tree_insert_node(&uobj->rb_tree, pg);

        /*
         * Don't put anons on the LRU page queue. We can't flush them
         * (there's no concept of swap in a rump kernel), so no reason
         * to bother with them.
         */
        if (!UVM_OBJ_IS_AOBJ(uobj)) {
                atomic_inc_uint(&vmpage_onqueue);
                mutex_enter(&uvm_pageqlock);
                TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
                mutex_exit(&uvm_pageqlock);
        }

        uobj->uo_npages++;

        return pg;
}

/*
 * Release a page.
 *
 * Called with the vm object locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
        struct uvm_object *uobj = pg->uobject;

        KASSERT(mutex_owned(&uvm_pageqlock));
        KASSERT(mutex_owned(uobj->vmobjlock));

        if (pg->flags & PG_WANTED)
                wakeup(pg);

        TAILQ_REMOVE(&uobj->memq, pg, listq.queue);

        uobj->uo_npages--;
        rb_tree_remove_node(&uobj->rb_tree, pg);

        if (!UVM_OBJ_IS_AOBJ(uobj)) {
                TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
                atomic_dec_uint(&vmpage_onqueue);
        }

        pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

        pg->flags &= ~PG_CLEAN;
        memset((void *)pg->uanon, 0, PAGE_SIZE);
}

/*
 * uvm_page_locked_p: return true if object associated with page is
 * locked. this is a weak check for runtime assertions only.
 */

bool
uvm_page_locked_p(struct vm_page *pg)
{

        return mutex_owned(pg->uobject->vmobjlock);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;
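
/*
 * A note on sizing (illustrative): uvm_init() below reads the
 * RUMP_MEMLIMIT parameter and accepts an optional k/m/g suffix, e.g.
 * a hypothetical run with
 *
 *      RUMP_MEMLIMIT=16m ./prog
 *
 * limits hypervisor allocations to 16MB, with the pagedaemon kicking
 * in at 90% of that (dddlim above). Without the parameter, memory
 * use is bounded only by the host.
 */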
void
uvm_init(void)
{
        char buf[64];

        if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
                unsigned long tmp;
                char *ep;
                int mult;

                tmp = strtoul(buf, &ep, 10);
                if (strlen(ep) > 1)
                        panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

                /* mini-dehumanize-number */
                mult = 1;
                switch (*ep) {
                case 'k':
                        mult = 1024;
                        break;
                case 'm':
                        mult = 1024*1024;
                        break;
                case 'g':
                        mult = 1024*1024*1024;
                        break;
                case 0:
                        break;
                default:
                        panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
                }
                rump_physmemlimit = tmp * mult;

                if (rump_physmemlimit / mult != tmp)
                        panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);
                /* it's not like we'd get far with, say, 1 byte, but ... */
                if (rump_physmemlimit == 0)
                        panic("uvm_init: no memory");

#define HUMANIZE_BYTES 9
                CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
                format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
                dddlim = 9 * (rump_physmemlimit / 10);
        } else {
                strlcpy(buf, "unlimited (host limit)", sizeof(buf));
        }
        aprint_verbose("total memory = %s\n", buf);

        TAILQ_INIT(&vmpage_lruqueue);

        uvmexp.free = 1024*1024; /* XXX: arbitrary & not updated */

#ifndef __uvmexp_pagesize
        uvmexp.pagesize = PAGE_SIZE;
        uvmexp.pagemask = PAGE_MASK;
        uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
        uvmexp.pageshift = FAKE_PAGE_SHIFT;
        uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
        uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

        mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
        mutex_init(&uvm_pageqlock, MUTEX_DEFAULT, IPL_NONE);
        mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

        mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);
        cv_init(&pdaemoncv, "pdaemon");
        cv_init(&oomwait, "oomwait");

        module_map = &module_map_store;

        kernel_map->pmap = pmap_kernel();

        pool_subsystem_init();

        kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
            NULL, NULL, NULL,
            0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

        vmem_subsystem_init(kmem_arena);

        kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
            vmem_alloc, vmem_free, kmem_arena,
            8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

        pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
            "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);
}

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
    bool topdown)
{

        vm->vm_map.pmap = pmap_kernel();
        vm->vm_refcnt = 1;
}

void
uvm_pagewire(struct vm_page *pg)
{

        /* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

        /* nada */
}

/* where's your schmonz now? */
#define PUNLIMIT(a)     \
    p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

        PUNLIMIT(RLIMIT_STACK);
        PUNLIMIT(RLIMIT_DATA);
        PUNLIMIT(RLIMIT_RSS);
        PUNLIMIT(RLIMIT_AS);
        /* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 * We probably should grow some more assertables to make sure we're
 * not satisfying anything we shouldn't be satisfying.
 */
int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, void *handle, voff_t off, vsize_t locklim)
{
        void *uaddr;
        int error;

        if (prot != (VM_PROT_READ | VM_PROT_WRITE))
                panic("uvm_mmap() variant unsupported");
        if (flags != (MAP_PRIVATE | MAP_ANON))
                panic("uvm_mmap() variant unsupported");

        /* no reason in particular, but cf. uvm_default_mapaddr() */
        if (*addr != 0)
                panic("uvm_mmap() variant unsupported");

        if (RUMP_LOCALPROC_P(curproc)) {
                error = rumpuser_anonmmap(NULL, size, 0, 0, &uaddr);
        } else {
                error = rumpuser_sp_anonmmap(curproc->p_vmspace->vm_map.pmap,
                    size, &uaddr);
        }
        if (error)
                return error;

        *addr = (vaddr_t)uaddr;
        return 0;
}
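
/*
 * Bookkeeping for an emulated pager window: the fake kva handed out
 * by uvm_pagermapin(), the pages backing it, and whether the window
 * was mapped for a read (in which case uvm_pagermapout() copies the
 * window contents back into the pages).
 */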
struct pagerinfo {
        vaddr_t pgr_kva;
        int pgr_npages;
        struct vm_page **pgr_pgs;
        bool pgr_read;

        LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine. Instead of mapping, we allocate memory
 * and copy page contents there. Not optimal or even strictly
 * correct (the caller might modify the page contents after mapping
 * them in), but what the heck. Assumes UVMPAGER_MAPIN_WAITOK.
 */
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
        struct pagerinfo *pgri;
        vaddr_t curkva;
        int i;

        /* allocate structures */
        pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
        pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
        pgri->pgr_npages = npages;
        pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
        pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

        /* copy contents to "mapped" memory */
        for (i = 0, curkva = pgri->pgr_kva;
            i < npages;
            i++, curkva += PAGE_SIZE) {
                /*
                 * We need to copy the previous contents of the pages to
                 * the window even if we are reading from the
                 * device, since the device might not fill the contents of
                 * the full mapped range and we will end up corrupting
                 * data when we unmap the window.
                 */
                memcpy((void*)curkva, pgs[i]->uanon, PAGE_SIZE);
                pgri->pgr_pgs[i] = pgs[i];
        }

        mutex_enter(&pagermtx);
        LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
        mutex_exit(&pagermtx);

        return pgri->pgr_kva;
}

/*
 * map out the pager window. return contents from VA to page storage
 * and free structures.
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
        struct pagerinfo *pgri;
        vaddr_t curkva;
        int i;

        mutex_enter(&pagermtx);
        LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
                if (pgri->pgr_kva == kva)
                        break;
        }
        KASSERT(pgri);
        if (pgri->pgr_npages != npages)
                panic("uvm_pagermapout: partial unmapping not supported");
        LIST_REMOVE(pgri, pgr_entries);
        mutex_exit(&pagermtx);

        if (pgri->pgr_read) {
                for (i = 0, curkva = pgri->pgr_kva;
                    i < pgri->pgr_npages;
                    i++, curkva += PAGE_SIZE) {
                        memcpy(pgri->pgr_pgs[i]->uanon,(void*)curkva,PAGE_SIZE);
                }
        }

        kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
        kmem_free((void*)pgri->pgr_kva, npages * PAGE_SIZE);
        kmem_free(pgri, sizeof(*pgri));
}
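
/*
 * Illustrative round trip through the emulated pager window:
 *
 *      kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_READ);
 *      (device transfers into [kva, kva + npages*PAGE_SIZE))
 *      uvm_pagermapout(kva, npages);
 *
 * For a read, uvm_pagermapout() copies the window contents back into
 * the pages; for a write, the copy already happened at mapin time.
 */
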
/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
        struct pagerinfo *pgri;
        struct vm_page *pg = NULL;
        int i;

        mutex_enter(&pagermtx);
        LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
                if (pgri->pgr_kva <= va
                    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
                        break;
        }
        if (pgri) {
                i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
                pg = pgri->pgr_pgs[i];
        }
        mutex_exit(&pagermtx);

        return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout. Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
        struct vm_page *pg;
        bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

        pg = rb_tree_find_node(&uobj->rb_tree, &off);
        if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
                mutex_enter(&uvm_pageqlock);
                TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
                TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
                mutex_exit(&uvm_pageqlock);
        }

        return pg;
}

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
        struct vm_page *pg;
        int i;

        KASSERT(npgs > 0);
        KASSERT(mutex_owned(pgs[0]->uobject->vmobjlock));

        for (i = 0; i < npgs; i++) {
                pg = pgs[i];
                if (pg == NULL)
                        continue;

                KASSERT(pg->flags & PG_BUSY);
                if (pg->flags & PG_WANTED)
                        wakeup(pg);
                if (pg->flags & PG_RELEASED)
                        uvm_pagefree(pg);
                else
                        pg->flags &= ~(PG_WANTED|PG_BUSY);
        }
}

void
uvm_estimatepageable(int *active, int *inactive)
{

        /* XXX: guessing game */
        *active = 1024;
        *inactive = 1024;
}

bool
vm_map_starved_p(struct vm_map *map)
{

        if (map->flags & VM_MAP_WANTVA)
                return true;

        return false;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

        panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

        panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
    struct vm_page **opp)
{

        return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

        panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

#ifdef DEBUGPRINT
void
uvm_object_printit(struct uvm_object *uobj, bool full,
    void (*pr)(const char *, ...))
{

        pr("VM OBJECT at %p, refs %d", uobj, uobj->uo_refs);
}
#endif

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

        return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
    vm_prot_t prot, bool set_max)
{

        return EOPNOTSUPP;
}

/*
 * UVM km
 */
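
/*
 * uvm_km_alloc() has two personalities: module_map allocations are
 * backed by anonymous host mmap (so they can be mapped executable
 * and, on amd64, placed low enough for -mcmodel=kernel code), while
 * everything else is plain host malloc.
 */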
vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
        void *rv, *desired = NULL;
        int alignbit, error;

#ifdef __x86_64__
        /*
         * On amd64, allocate all module memory from the lowest 2GB.
         * This is because NetBSD kernel modules are compiled
         * with -mcmodel=kernel and reserve only 4 bytes for
         * offsets. If we load code compiled with -mcmodel=kernel
         * anywhere except the lowest or highest 2GB, it will not
         * work. Since userspace does not have access to the highest
         * 2GB, use the lowest 2GB.
         *
         * Note: this assumes the rump kernel resides in
         * the lowest 2GB as well.
         *
         * Note2: yes, it's a quick hack, but since this is the only
         * place where we care about the map we're allocating from,
         * just use a simple "if" instead of coming up with a fancy
         * generic solution.
         */
        if (map == module_map) {
                desired = (void *)(0x80000000 - size);
        }
#endif

        if (__predict_false(map == module_map)) {
                alignbit = 0;
                if (align) {
                        alignbit = ffs(align)-1;
                }
                error = rumpuser_anonmmap(desired, size, alignbit,
                    flags & UVM_KMF_EXEC, &rv);
        } else {
                error = rumpuser_malloc(size, align, &rv);
        }

        if (error) {
                if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
                        return 0;
                else
                        panic("uvm_km_alloc failed");
        }

        if (flags & UVM_KMF_ZERO)
                memset(rv, 0, size);

        return (vaddr_t)rv;
}

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

        if (__predict_false(map == module_map))
                rumpuser_unmap((void *)vaddr, size);
        else
                rumpuser_free((void *)vaddr, size);
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
    vsize_t size, int pageable, bool fixed, struct vm_map *submap)
{

        return (struct vm_map *)417416;
}

int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
    vmem_addr_t *addr)
{
        vaddr_t va;
        va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
            (flags & VM_SLEEP), "kmalloc");

        if (va) {
                *addr = va;
                return 0;
        } else {
                return ENOMEM;
        }
}

void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{

        rump_hyperfree((void *)addr, size);
}

/*
 * VM space locking routines. We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

        return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
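
/*
 * Illustrative remote-client flow for a write (data travels from the
 * client process to the driver):
 *
 *      vmapbuf(bp, len);       allocates a bounce buffer, copyin()s data
 *      (driver consumes bp->b_data)
 *      vunmapbuf(bp, len);     frees the bounce buffer
 *
 * For a read the data is instead copied out to the client process in
 * vunmapbuf().
 */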
int
vmapbuf(struct buf *bp, vsize_t len)
{
        int error = 0;

        bp->b_saveaddr = bp->b_data;

        /* remote case */
        if (!RUMP_LOCALPROC_P(curproc)) {
                bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
                if (BUF_ISWRITE(bp)) {
                        error = copyin(bp->b_saveaddr, bp->b_data, len);
                        if (error) {
                                rump_hyperfree(bp->b_data, len);
                                bp->b_data = bp->b_saveaddr;
                                bp->b_saveaddr = 0;
                        }
                }
        }

        return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

        /* remote case */
        if (!RUMP_LOCALPROC_P(bp->b_proc)) {
                if (BUF_ISREAD(bp)) {
                        bp->b_error = copyout_proc(bp->b_proc,
                            bp->b_data, bp->b_saveaddr, len);
                }
                rump_hyperfree(bp->b_data, len);
        }

        bp->b_data = bp->b_saveaddr;
        bp->b_saveaddr = 0;
}

void
uvmspace_addref(struct vmspace *vm)
{

        /*
         * No dynamically allocated vmspaces exist.
         */
}

void
uvmspace_free(struct vmspace *vm)
{

        /* nothing for now */
}

/*
 * page life cycle stuff. it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

        /* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

        /* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

        /* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

        /* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

        /* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

        return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

        return 0;
}

/*
 * Routines related to the Page Baroness.
 */
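
/*
 * uvm_wait() blocks the caller on oomwait until the pagedaemon has
 * made a reclamation pass (or, via uvm_pageout_done(), until enough
 * outstanding pageouts complete); the pagedaemon itself must never
 * end up here.
 */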
void
uvm_wait(const char *msg)
{

        if (__predict_false(curlwp == uvm.pagedaemon_lwp))
                panic("pagedaemon out of memory");
        if (__predict_false(rump_threads == 0))
                panic("pagedaemon missing (RUMP_THREADS = 0)");

        mutex_enter(&pdaemonmtx);
        pdaemon_waiters++;
        cv_signal(&pdaemoncv);
        cv_wait(&oomwait, &pdaemonmtx);
        mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_start(int npages)
{

        mutex_enter(&pdaemonmtx);
        uvmexp.paging += npages;
        mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

        if (!npages)
                return;

        mutex_enter(&pdaemonmtx);
        KASSERT(uvmexp.paging >= npages);
        uvmexp.paging -= npages;

        if (pdaemon_waiters) {
                pdaemon_waiters = 0;
                cv_broadcast(&oomwait);
        }
        mutex_exit(&pdaemonmtx);
}

static bool
processpage(struct vm_page *pg, bool *lockrunning)
{
        struct uvm_object *uobj;

        uobj = pg->uobject;
        if (mutex_tryenter(uobj->vmobjlock)) {
                if ((pg->flags & PG_BUSY) == 0) {
                        mutex_exit(&uvm_pageqlock);
                        uobj->pgops->pgo_put(uobj, pg->offset,
                            pg->offset + PAGE_SIZE,
                            PGO_CLEANIT|PGO_FREE);
                        KASSERT(!mutex_owned(uobj->vmobjlock));
                        return true;
                } else {
                        mutex_exit(uobj->vmobjlock);
                }
        } else if (*lockrunning == false && ncpu > 1) {
                CPU_INFO_ITERATOR cii;
                struct cpu_info *ci;
                struct lwp *l;

                l = mutex_owner(uobj->vmobjlock);
                for (CPU_INFO_FOREACH(cii, ci)) {
                        if (ci->ci_curlwp == l) {
                                *lockrunning = true;
                                break;
                        }
                }
        }

        return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
        struct vm_page *pg;
        struct pool *pp, *pp_first;
        int cleaned, skip, skipped;
        bool succ;
        bool lockrunning;

        mutex_enter(&pdaemonmtx);
        for (;;) {
                if (!NEED_PAGEDAEMON()) {
                        kernel_map->flags &= ~VM_MAP_WANTVA;
                }

                if (pdaemon_waiters) {
                        pdaemon_waiters = 0;
                        cv_broadcast(&oomwait);
                }

                cv_wait(&pdaemoncv, &pdaemonmtx);
                uvmexp.pdwoke++;

                /* tell the world that we are hungry */
                kernel_map->flags |= VM_MAP_WANTVA;
                mutex_exit(&pdaemonmtx);

                /*
                 * step one: reclaim the page cache. this should give
                 * us the biggest earnings since whole pages are released
                 * into backing memory.
                 */
                pool_cache_reclaim(&pagecache);
                if (!NEED_PAGEDAEMON()) {
                        mutex_enter(&pdaemonmtx);
                        continue;
                }

                /*
                 * Ok, so that didn't help. Next, try to hunt memory
                 * by pushing out vnode pages. The pages might contain
                 * useful cached data, but we need the memory.
                 */
                cleaned = 0;
                skip = 0;
                lockrunning = false;
 again:
                mutex_enter(&uvm_pageqlock);
                while (cleaned < PAGEDAEMON_OBJCHUNK) {
                        skipped = 0;
                        TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

                                /*
                                 * skip over pages we _might_ have tried
                                 * to handle earlier. they might not be
                                 * exactly the same ones, but I'm not too
                                 * concerned.
                                 */
                                while (skipped++ < skip)
                                        continue;

                                if (processpage(pg, &lockrunning)) {
                                        cleaned++;
                                        goto again;
                                }

                                skip++;
                        }
                        break;
                }
                mutex_exit(&uvm_pageqlock);

                /*
                 * Ok, someone is running with an object lock held.
                 * We want to yield the host CPU to make sure the
                 * thread is not parked on the host. Since sched_yield()
                 * doesn't appear to do anything on NetBSD, nanosleep
                 * for the smallest possible time and hope we're back in
                 * the game soon.
                 */
                if (cleaned == 0 && lockrunning) {
                        rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL, 0, 1);

                        lockrunning = false;
                        skip = 0;

                        /* and here we go again */
                        goto again;
                }

                /*
                 * And of course we need to reclaim the page cache
                 * again to actually release memory.
                 */
                pool_cache_reclaim(&pagecache);
                if (!NEED_PAGEDAEMON()) {
                        mutex_enter(&pdaemonmtx);
                        continue;
                }

                /*
                 * And then drain the pools. Wipe them out ... all of them.
                 */
                for (pp_first = NULL;;) {
                        if (rump_vfs_drainbufs)
                                rump_vfs_drainbufs(10 /* XXX: estimate! */);

                        succ = pool_drain(&pp);
                        if (succ || pp == pp_first)
                                break;

                        if (pp_first == NULL)
                                pp_first = pp;
                }

                /*
                 * Need to use PYEC on our bag of tricks.
                 * Unfortunately, the wife just borrowed it.
                 */

                mutex_enter(&pdaemonmtx);
                if (!succ && cleaned == 0 && pdaemon_waiters &&
                    uvmexp.paging == 0) {
                        rumpuser_dprintf("pagedaemoness: failed to reclaim "
                            "memory ... sleeping (deadlock?)\n");
                        cv_timedwait(&pdaemoncv, &pdaemonmtx, hz);
                }
        }

        panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

        /*
         * Wake up the diabolical pagedaemon director if we are over
         * 90% of the memory limit. This is a complete and utter
         * stetson-harrison decision which you are allowed to finetune.
         * Don't bother locking. If we have some unflushed caches,
         * other waker-uppers will deal with the issue.
         */
        if (NEED_PAGEDAEMON()) {
                cv_signal(&pdaemoncv);
        }
}

void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
        unsigned long newmem;
        void *rv;
        int error;

        uvm_kick_pdaemon(); /* ouch */

        /* first we must be within the limit */
 limitagain:
        if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
                newmem = atomic_add_long_nv(&curphysmem, howmuch);
                if (newmem > rump_physmemlimit) {
                        newmem = atomic_add_long_nv(&curphysmem, -howmuch);
                        if (!waitok) {
                                return NULL;
                        }
                        uvm_wait(wmsg);
                        goto limitagain;
                }
        }

        /* second, we must get something from the backend */
 again:
        error = rumpuser_malloc(howmuch, alignment, &rv);
        if (__predict_false(error && waitok)) {
                uvm_wait(wmsg);
                goto again;
        }

        return rv;
}

void
rump_hyperfree(void *what, size_t size)
{

        if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
                atomic_add_long(&curphysmem, -size);
        }
        rumpuser_free(what, size);
}

void
uvm_swap_shutdown(struct lwp *lwp)
{
}