/*	$NetBSD: vm.c,v 1.120 2011/10/31 13:23:55 yamt Exp $	*/

/*
 * Copyright (c) 2007-2011 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page.  phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.120 2011/10/31 13:23:55 yamt Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>

#include <machine/pmap.h>

#include <rump/rumpuser.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

kmutex_t uvm_pageqlock;
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
int *uvmexp_pagesize = &uvmexp.pagesize;
int *uvmexp_pagemask = &uvmexp.pagemask;
int *uvmexp_pageshift = &uvmexp.pageshift;
#endif

struct vm_map rump_vmmap;
static struct vm_map_kernel kmem_map_store;
struct vm_map *kmem_map = &kmem_map_store.vmk_map;

static struct vm_map_kernel kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store.vmk_map;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long curphysmem;
static unsigned long dddlim;		/* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
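
/*
 * For example, with RUMP_MEMLIMIT=16m uvm_init() below sets
 * rump_physmemlimit to 16*1024*1024 and dddlim to
 * 9*(rump_physmemlimit/10), i.e. roughly 14.4MB, so NEED_PAGEDAEMON()
 * starts evaluating true once the hypervisor allocations tracked in
 * curphysmem exceed that watermark.
 */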

/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))

/*
 * Keep a list of least recently used pages.  Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of the queue every time a lookup for it is done.  If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

static int
pg_compare_key(void *ctx, const void *n, const void *key)
{
	voff_t a = ((const struct vm_page *)n)->offset;
	voff_t b = *(const voff_t *)key;

	if (a < b)
		return -1;
	else if (a > b)
		return 1;
	else
		return 0;
}

static int
pg_compare_nodes(void *ctx, const void *n1, const void *n2)
{

	return pg_compare_key(ctx, n1, &((const struct vm_page *)n2)->offset);
}

const rb_tree_ops_t uvm_page_tree_ops = {
	.rbto_compare_nodes = pg_compare_nodes,
	.rbto_compare_key = pg_compare_key,
	.rbto_node_offset = offsetof(struct vm_page, rb_node),
	.rbto_context = NULL
};

/*
 * vm pages
 */

static int
pgctor(void *arg, void *obj, int flags)
{
	struct vm_page *pg = obj;

	memset(pg, 0, sizeof(*pg));
	pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
	return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
	struct vm_page *pg = obj;

	rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;

/*
 * Called with the object locked.  We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
	int flags, int strat, int free_list)
{
	struct vm_page *pg;

	KASSERT(uobj && mutex_owned(uobj->vmobjlock));
	KASSERT(anon == NULL);

	pg = pool_cache_get(&pagecache, PR_NOWAIT);
	if (__predict_false(pg == NULL)) {
		return NULL;
	}

	pg->offset = off;
	pg->uobject = uobj;

	pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
	if (flags & UVM_PGA_ZERO) {
		uvm_pagezero(pg);
	}

	TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue);
	(void)rb_tree_insert_node(&uobj->rb_tree, pg);

	/*
	 * Don't put anons on the LRU page queue.  We can't flush them
	 * (there's no concept of swap in a rump kernel), so no reason
	 * to bother with them.
	 */
	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		atomic_inc_uint(&vmpage_onqueue);
		mutex_enter(&uvm_pageqlock);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&uvm_pageqlock);
	}

	uobj->uo_npages++;

	return pg;
}
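
/*
 * Illustrative caller sketch (not code from this file): the object
 * lock is held across the call and the caller later clears PG_BUSY
 * once the page contents are valid, along the lines of
 *
 *	mutex_enter(uobj->vmobjlock);
 *	pg = uvm_pagealloc_strat(uobj, off, NULL, UVM_PGA_ZERO,
 *	    UVM_PGA_STRAT_NORMAL, 0);
 *	if (pg != NULL) {
 *		... fill the page via pg->uanon, its backing storage ...
 *		pg->flags &= ~(PG_BUSY|PG_FAKE);
 *	}
 *	mutex_exit(uobj->vmobjlock);
 */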

/*
 * Release a page.
 *
 * Called with the vm object and the page queue locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;

	KASSERT(mutex_owned(&uvm_pageqlock));
	KASSERT(mutex_owned(uobj->vmobjlock));

	if (pg->flags & PG_WANTED)
		wakeup(pg);

	TAILQ_REMOVE(&uobj->memq, pg, listq.queue);

	uobj->uo_npages--;
	rb_tree_remove_node(&uobj->rb_tree, pg);

	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		atomic_dec_uint(&vmpage_onqueue);
	}

	pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

	pg->flags &= ~PG_CLEAN;
	memset((void *)pg->uanon, 0, PAGE_SIZE);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;

void
uvm_init(void)
{
	char buf[64];
	int error;

	if (rumpuser_getenv("RUMP_MEMLIMIT", buf, sizeof(buf), &error) == 0) {
		unsigned long tmp;
		char *ep;
		int mult;

		tmp = strtoul(buf, &ep, 10);
		if (strlen(ep) > 1)
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

		/* mini-dehumanize-number */
		mult = 1;
		switch (*ep) {
		case 'k':
			mult = 1024;
			break;
		case 'm':
			mult = 1024*1024;
			break;
		case 'g':
			mult = 1024*1024*1024;
			break;
		case 0:
			break;
		default:
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
		}
		rump_physmemlimit = tmp * mult;

		if (rump_physmemlimit / mult != tmp)
			panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);
		/* it's not like we'd get far with, say, 1 byte, but ... */
		if (rump_physmemlimit == 0)
			panic("uvm_init: no memory");

#define HUMANIZE_BYTES 9
		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
		dddlim = 9 * (rump_physmemlimit / 10);
	} else {
		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
	}
	aprint_verbose("total memory = %s\n", buf);

	TAILQ_INIT(&vmpage_lruqueue);

	uvmexp.free = 1024*1024; /* XXX: arbitrary & not updated */

#ifndef __uvmexp_pagesize
	uvmexp.pagesize = PAGE_SIZE;
	uvmexp.pagemask = PAGE_MASK;
	uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
	uvmexp.pageshift = FAKE_PAGE_SHIFT;
	uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
	uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

	mutex_init(&pagermtx, MUTEX_DEFAULT, 0);
	mutex_init(&uvm_pageqlock, MUTEX_DEFAULT, 0);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, 0);

	mutex_init(&pdaemonmtx, MUTEX_DEFAULT, 0);
	cv_init(&pdaemoncv, "pdaemon");
	cv_init(&oomwait, "oomwait");

	kernel_map->pmap = pmap_kernel();
	callback_head_init(&kernel_map_store.vmk_reclaim_callback, IPL_VM);
	kmem_map->pmap = pmap_kernel();
	callback_head_init(&kmem_map_store.vmk_reclaim_callback, IPL_VM);

	pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
	    "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);
}
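
/*
 * Usage sketch (an assumption about typical hosting, not code from
 * this file): a host program can bound the rump kernel's memory
 * before bootstrapping it, e.g.
 *
 *	setenv("RUMP_MEMLIMIT", "16m", 1);
 *	rump_init();
 *
 * in which case uvm_init() above parses the value as 16*1024*1024
 * bytes and arms the pagedaemon threshold at 90% of it.
 */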

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax)
{

	vm->vm_map.pmap = pmap_kernel();
	vm->vm_refcnt = 1;
}

void
uvm_pagewire(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

	/* nada */
}

/*
 * The uvm reclaim hook is not currently necessary because it is
 * used only by ZFS and implements exactly the same functionality
 * as the kva reclaim hook which we already run in the pagedaemon
 * (rump vm does not have a concept of uvm_map(), so we cannot
 * reclaim kva when a mapping operation fails due to insufficient
 * available kva).
 */
void
uvm_reclaim_hook_add(struct uvm_reclaim_hook *hook_entry)
{

}
__strong_alias(uvm_reclaim_hook_del,uvm_reclaim_hook_add);

/* where's your schmonz now? */
#define PUNLIMIT(a)	\
    p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

	PUNLIMIT(RLIMIT_STACK);
	PUNLIMIT(RLIMIT_DATA);
	PUNLIMIT(RLIMIT_RSS);
	PUNLIMIT(RLIMIT_AS);
	/* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 * We probably should grow some more assertables to make sure we're
 * not satisfying anything we shouldn't be satisfying.
 */
int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
	vm_prot_t maxprot, int flags, void *handle, voff_t off, vsize_t locklim)
{
	void *uaddr;
	int error;

	if (prot != (VM_PROT_READ | VM_PROT_WRITE))
		panic("uvm_mmap() variant unsupported");
	if (flags != (MAP_PRIVATE | MAP_ANON))
		panic("uvm_mmap() variant unsupported");

	/* no reason in particular, but cf. uvm_default_mapaddr() */
	if (*addr != 0)
		panic("uvm_mmap() variant unsupported");

	if (RUMP_LOCALPROC_P(curproc)) {
		uaddr = rumpuser_anonmmap(NULL, size, 0, 0, &error);
	} else {
		error = rumpuser_sp_anonmmap(curproc->p_vmspace->vm_map.pmap,
		    size, &uaddr);
	}
	if (uaddr == NULL)
		return error;

	*addr = (vaddr_t)uaddr;
	return 0;
}

struct pagerinfo {
	vaddr_t pgr_kva;
	int pgr_npages;
	struct vm_page **pgr_pgs;
	bool pgr_read;

	LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine.  Instead of mapping, we allocate memory
 * and copy page contents there.  Not optimal or even strictly
 * correct (the caller might modify the page contents after mapping
 * them in), but what the heck.  Assumes UVMPAGER_MAPIN_WAITOK.
 */
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	/* allocate structures */
	pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
	pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
	pgri->pgr_npages = npages;
	pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
	pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

	/* copy contents to "mapped" memory */
	for (i = 0, curkva = pgri->pgr_kva;
	    i < npages;
	    i++, curkva += PAGE_SIZE) {
		/*
		 * We need to copy the previous contents of the pages to
		 * the window even if we are reading from the
		 * device, since the device might not fill the contents of
		 * the full mapped range and we will end up corrupting
		 * data when we unmap the window.
		 */
		memcpy((void *)curkva, pgs[i]->uanon, PAGE_SIZE);
		pgri->pgr_pgs[i] = pgs[i];
	}

	mutex_enter(&pagermtx);
	LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
	mutex_exit(&pagermtx);

	return pgri->pgr_kva;
}
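
/*
 * Sketch of how the two halves pair up (illustrative, not code from
 * this file): a pager typically does
 *
 *	kva = uvm_pagermapin(pgs, npages,
 *	    UVMPAGER_MAPIN_READ | UVMPAGER_MAPIN_WAITOK);
 *	... perform I/O against the linear window at kva ...
 *	uvm_pagermapout(kva, npages);
 *
 * With the copying implementation here, data written into the window
 * reaches the actual pages only at uvm_pagermapout() time (and only
 * for "read" mappings), which is also why partial unmaps are not
 * supported below.
 */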

/*
 * map out the pager window.  return contents from VA to page storage
 * and free structures.
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva == kva)
			break;
	}
	KASSERT(pgri);
	if (pgri->pgr_npages != npages)
		panic("uvm_pagermapout: partial unmapping not supported");
	LIST_REMOVE(pgri, pgr_entries);
	mutex_exit(&pagermtx);

	if (pgri->pgr_read) {
		for (i = 0, curkva = pgri->pgr_kva;
		    i < pgri->pgr_npages;
		    i++, curkva += PAGE_SIZE) {
			memcpy(pgri->pgr_pgs[i]->uanon,
			    (void *)curkva, PAGE_SIZE);
		}
	}

	kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
	kmem_free((void *)pgri->pgr_kva, npages * PAGE_SIZE);
	kmem_free(pgri, sizeof(*pgri));
}

/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
	struct pagerinfo *pgri;
	struct vm_page *pg = NULL;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva <= va
		    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
			break;
	}
	if (pgri) {
		i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
		pg = pgri->pgr_pgs[i];
	}
	mutex_exit(&pagermtx);

	return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout.  Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;
	bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

	pg = rb_tree_find_node(&uobj->rb_tree, &off);
	if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
		mutex_enter(&uvm_pageqlock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&uvm_pageqlock);
	}

	return pg;
}
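
/*
 * Net effect of the above (illustrative): after
 *
 *	pg = uvm_pagelookup(uobj, off);
 *
 * for a vnode page looked up by anything other than the pagedaemon,
 * pg sits at the tail of vmpage_lruqueue and is therefore the last
 * page the pagedaemon will consider for pageout.
 */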

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i;

	KASSERT(npgs > 0);
	KASSERT(mutex_owned(pgs[0]->uobject->vmobjlock));

	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL)
			continue;

		KASSERT(pg->flags & PG_BUSY);
		if (pg->flags & PG_WANTED)
			wakeup(pg);
		if (pg->flags & PG_RELEASED)
			uvm_pagefree(pg);
		else
			pg->flags &= ~(PG_WANTED|PG_BUSY);
	}
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	/* XXX: guessing game */
	*active = 1024;
	*inactive = 1024;
}

struct vm_map_kernel *
vm_map_to_kernel(struct vm_map *map)
{

	return (struct vm_map_kernel *)map;
}

bool
vm_map_starved_p(struct vm_map *map)
{

	if (map->flags & VM_MAP_WANTVA)
		return true;

	return false;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

	panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
	struct vm_page **opp)
{

	return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

#ifdef DEBUGPRINT
void
uvm_object_printit(struct uvm_object *uobj, bool full,
	void (*pr)(const char *, ...))
{

	pr("VM OBJECT at %p, refs %d", uobj, uobj->uo_refs);
}
#endif

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
	vm_prot_t prot, bool set_max)
{

	return EOPNOTSUPP;
}

/*
 * UVM km
 */

vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
	void *rv, *desired = NULL;
	int alignbit, error;

#ifdef __x86_64__
	/*
	 * On amd64, allocate all module memory from the lowest 2GB.
	 * This is because NetBSD kernel modules are compiled
	 * with -mcmodel=kernel and reserve only 4 bytes for
	 * offsets.  If we load code compiled with -mcmodel=kernel
	 * anywhere except the lowest or highest 2GB, it will not
	 * work.  Since userspace does not have access to the highest
	 * 2GB, use the lowest 2GB.
	 *
	 * Note: this assumes the rump kernel resides in
	 * the lowest 2GB as well.
	 *
	 * Note2: yes, it's a quick hack, but since this is the only
	 * place where we care about the map we're allocating from,
	 * just use a simple "if" instead of coming up with a fancy
	 * generic solution.
	 */
	extern struct vm_map *module_map;
	if (map == module_map) {
		desired = (void *)(0x80000000 - size);
	}
#endif

	alignbit = 0;
	if (align) {
		alignbit = ffs(align)-1;
	}

	rv = rumpuser_anonmmap(desired, size, alignbit, flags & UVM_KMF_EXEC,
	    &error);
	if (rv == NULL) {
		if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
			return 0;
		else
			panic("uvm_km_alloc failed");
	}

	if (flags & UVM_KMF_ZERO)
		memset(rv, 0, size);

	return (vaddr_t)rv;
}
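
/*
 * Note on the alignment convention above (for illustration): the
 * hypercall takes the alignment as a bit count rather than a byte
 * count, so e.g. a request with align = 0x10000 (64kB) is passed
 * down as alignbit = ffs(align) - 1 = 16.
 */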

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

	rumpuser_unmap((void *)vaddr, size);
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
	vsize_t size, int pageable, bool fixed, struct vm_map_kernel *submap)
{

	return (struct vm_map *)417416;
}

vaddr_t
uvm_km_alloc_poolpage(struct vm_map *map, bool waitok)
{

	return (vaddr_t)rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    waitok, "kmalloc");
}

void
uvm_km_free_poolpage(struct vm_map *map, vaddr_t addr)
{

	rump_hyperfree((void *)addr, PAGE_SIZE);
}

vaddr_t
uvm_km_alloc_poolpage_cache(struct vm_map *map, bool waitok)
{

	return uvm_km_alloc_poolpage(map, waitok);
}

void
uvm_km_free_poolpage_cache(struct vm_map *map, vaddr_t vaddr)
{

	uvm_km_free_poolpage(map, vaddr);
}

void
uvm_km_va_drain(struct vm_map *map, uvm_flag_t flags)
{

	/* we may eventually want some model for available memory */
}

/*
 * VM space locking routines.  We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

	return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
int
vmapbuf(struct buf *bp, vsize_t len)
{
	int error = 0;

	bp->b_saveaddr = bp->b_data;

	/* remote case */
	if (!RUMP_LOCALPROC_P(curproc)) {
		bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
		if (BUF_ISWRITE(bp)) {
			error = copyin(bp->b_saveaddr, bp->b_data, len);
			if (error) {
				rump_hyperfree(bp->b_data, len);
				bp->b_data = bp->b_saveaddr;
				bp->b_saveaddr = 0;
			}
		}
	}

	return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

	/* remote case */
	if (!RUMP_LOCALPROC_P(bp->b_proc)) {
		if (BUF_ISREAD(bp)) {
			bp->b_error = copyout_proc(bp->b_proc,
			    bp->b_data, bp->b_saveaddr, len);
		}
		rump_hyperfree(bp->b_data, len);
	}

	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = 0;
}
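
/*
 * Illustrative data flow for a remote client process (descriptive,
 * not code): for a write, vmapbuf() above bounces the user data into
 * a hypervisor buffer with copyin(), the driver then operates on
 * bp->b_data, and vunmapbuf() releases the bounce buffer; for a read
 * the data instead travels back to the client via copyout_proc() in
 * vunmapbuf().
 */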

void
uvmspace_addref(struct vmspace *vm)
{

	/*
	 * No dynamically allocated vmspaces exist.
	 */
}

void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}

/*
 * page life cycle stuff.  it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

	return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return 0;
}

/*
 * Routines related to the Page Baroness.
 */

void
uvm_wait(const char *msg)
{

	if (__predict_false(curlwp == uvm.pagedaemon_lwp))
		panic("pagedaemon out of memory");
	if (__predict_false(rump_threads == 0))
		panic("pagedaemon missing (RUMP_THREADS = 0)");

	mutex_enter(&pdaemonmtx);
	pdaemon_waiters++;
	cv_signal(&pdaemoncv);
	cv_wait(&oomwait, &pdaemonmtx);
	mutex_exit(&pdaemonmtx);
}
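
/*
 * The handshake above works as follows (descriptive): a starving
 * allocator bumps pdaemon_waiters, pokes pdaemoncv and sleeps on
 * oomwait; the pagedaemon loop (or uvm_pageout_done() below)
 * broadcasts oomwait once it has had a chance to release memory,
 * at which point the allocator retries.
 */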

void
uvm_pageout_start(int npages)
{

	mutex_enter(&pdaemonmtx);
	uvmexp.paging += npages;
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

	if (!npages)
		return;

	mutex_enter(&pdaemonmtx);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	if (pdaemon_waiters) {
		pdaemon_waiters = 0;
		cv_broadcast(&oomwait);
	}
	mutex_exit(&pdaemonmtx);
}

static bool
processpage(struct vm_page *pg, bool *lockrunning)
{
	struct uvm_object *uobj;

	uobj = pg->uobject;
	if (mutex_tryenter(uobj->vmobjlock)) {
		if ((pg->flags & PG_BUSY) == 0) {
			mutex_exit(&uvm_pageqlock);
			uobj->pgops->pgo_put(uobj, pg->offset,
			    pg->offset + PAGE_SIZE,
			    PGO_CLEANIT|PGO_FREE);
			KASSERT(!mutex_owned(uobj->vmobjlock));
			return true;
		} else {
			mutex_exit(uobj->vmobjlock);
		}
	} else if (*lockrunning == false && ncpu > 1) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		struct lwp *l;

		l = mutex_owner(uobj->vmobjlock);
		for (CPU_INFO_FOREACH(cii, ci)) {
			if (ci->ci_curlwp == l) {
				*lockrunning = true;
				break;
			}
		}
	}

	return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
	struct vm_page *pg;
	struct pool *pp, *pp_first;
	uint64_t where;
	int cleaned, skip, skipped;
	int waspaging;
	bool succ;
	bool lockrunning;

	mutex_enter(&pdaemonmtx);
	for (;;) {
		if (!NEED_PAGEDAEMON()) {
			kernel_map->flags &= ~VM_MAP_WANTVA;
			kmem_map->flags &= ~VM_MAP_WANTVA;
		}

		if (pdaemon_waiters) {
			pdaemon_waiters = 0;
			cv_broadcast(&oomwait);
		}

		cv_wait(&pdaemoncv, &pdaemonmtx);
		uvmexp.pdwoke++;
		waspaging = uvmexp.paging;

		/* tell the world that we are hungry */
		kernel_map->flags |= VM_MAP_WANTVA;
		kmem_map->flags |= VM_MAP_WANTVA;
		mutex_exit(&pdaemonmtx);

		/*
		 * step one: reclaim the page cache.  this should give
		 * us the biggest earnings since whole pages are released
		 * into backing memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Ok, so that didn't help.  Next, try to hunt memory
		 * by pushing out vnode pages.  The pages might contain
		 * useful cached data, but we need the memory.
		 */
		cleaned = 0;
		skip = 0;
		lockrunning = false;
 again:
		mutex_enter(&uvm_pageqlock);
		while (cleaned < PAGEDAEMON_OBJCHUNK) {
			skipped = 0;
			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

				/*
				 * skip over pages we _might_ have tried
				 * to handle earlier.  they might not be
				 * exactly the same ones, but I'm not too
				 * concerned.
				 */
				while (skipped++ < skip)
					continue;

				if (processpage(pg, &lockrunning)) {
					cleaned++;
					goto again;
				}

				skip++;
			}
			break;
		}
		mutex_exit(&uvm_pageqlock);

		/*
		 * Ok, someone is running with an object lock held.
		 * We want to yield the host CPU to make sure the
		 * thread is not parked on the host.  Since sched_yield()
		 * doesn't appear to do anything on NetBSD, nanosleep
		 * for the smallest possible time and hope we're back in
		 * the game soon.
		 */
		if (cleaned == 0 && lockrunning) {
			uint64_t sec, nsec;

			sec = 0;
			nsec = 1;
			rumpuser_nanosleep(&sec, &nsec, NULL);

			lockrunning = false;
			skip = 0;

			/* and here we go again */
			goto again;
		}

		/*
		 * And of course we need to reclaim the page cache
		 * again to actually release memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Still not there?  sleeves come off right about now.
		 * First: do reclaim on kernel/kmem map.
		 */
		callback_run_roundrobin(&kernel_map_store.vmk_reclaim_callback,
		    NULL);
		callback_run_roundrobin(&kmem_map_store.vmk_reclaim_callback,
		    NULL);

		/*
		 * And then drain the pools.  Wipe them out ... all of them.
		 */

		pool_drain_start(&pp_first, &where);
		pp = pp_first;
		for (;;) {
			rump_vfs_drainbufs(10 /* XXX: estimate better */);
			succ = pool_drain_end(pp, where);
			if (succ)
				break;
			pool_drain_start(&pp, &where);
			if (pp == pp_first) {
				succ = pool_drain_end(pp, where);
				break;
			}
		}

		/*
		 * Need to use PYEC on our bag of tricks.
		 * Unfortunately, the wife just borrowed it.
		 */

		mutex_enter(&pdaemonmtx);
		if (!succ && cleaned == 0 && pdaemon_waiters &&
		    uvmexp.paging == 0) {
			rumpuser_dprintf("pagedaemoness: failed to reclaim "
			    "memory ... sleeping (deadlock?)\n");
			cv_timedwait(&pdaemoncv, &pdaemonmtx, hz);
		}
	}

	panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

	/*
	 * Wake up the diabolical pagedaemon director if we are over
	 * 90% of the memory limit.  This is a complete and utter
	 * stetson-harrison decision which you are allowed to finetune.
	 * Don't bother locking.  If we have some unflushed caches,
	 * other waker-uppers will deal with the issue.
	 */
	if (NEED_PAGEDAEMON()) {
		cv_signal(&pdaemoncv);
	}
}
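
/*
 * Worked example of the limit accounting below (illustrative): with
 * rump_physmemlimit at 16MB and curphysmem at 15MB, a 2MB request
 * first pushes curphysmem to 17MB, notices the overshoot, backs the
 * addition out again, and then either fails (!waitok) or blocks in
 * uvm_wait() until the pagedaemon has freed something.
 */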

void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
	unsigned long newmem;
	void *rv;

	uvm_kick_pdaemon(); /* ouch */

	/* first we must be within the limit */
 limitagain:
	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		newmem = atomic_add_long_nv(&curphysmem, howmuch);
		if (newmem > rump_physmemlimit) {
			newmem = atomic_add_long_nv(&curphysmem, -howmuch);
			if (!waitok) {
				return NULL;
			}
			uvm_wait(wmsg);
			goto limitagain;
		}
	}

	/* second, we must get something from the backend */
 again:
	rv = rumpuser_malloc(howmuch, alignment);
	if (__predict_false(rv == NULL && waitok)) {
		uvm_wait(wmsg);
		goto again;
	}

	return rv;
}

void
rump_hyperfree(void *what, size_t size)
{

	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		atomic_add_long(&curphysmem, -size);
	}
	rumpuser_free(what);
}
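
/*
 * Note on the accounting above (descriptive): rump_hyperfree() trusts
 * the caller-supplied size, so a buffer must be freed with the same
 * size it was allocated with (cf. pgctor()/pgdtor(), which both use
 * PAGE_SIZE), otherwise curphysmem drifts and the pagedaemon
 * watermark loses its meaning.
 */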