/*	$NetBSD: vm.c,v 1.149 2014/02/18 06:18:13 pooka Exp $	*/

/*
 * Copyright (c) 2007-2011 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page.  phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
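 *
 * (For reference: the backing storage is allocated with rump_hypermalloc()
 * in pgctor() and released in pgdtor(); uvm_pagezero() and the pager
 * map-in/out routines below access page contents via this pg->uanon
 * pointer.)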
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.149 2014/02/18 06:18:13 pooka Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>

#include <machine/pmap.h>

#include <rump/rumpuser.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

kmutex_t uvm_pageqlock;
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
const int * const uvmexp_pagesize = &uvmexp.pagesize;
const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif

struct vm_map rump_vmmap;

static struct vm_map kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store;

static struct vm_map module_map_store;
extern struct vm_map *module_map;

vmem_t *kmem_arena;
vmem_t *kmem_va_arena;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long pdlimit = RUMPMEM_UNLIMITED; /* page daemon memlimit */
static unsigned long curphysmem;
static unsigned long dddlim;		/* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
    (rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)

/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))

/*
 * Keep a list of least recently used pages.  Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of queue every time a lookup for it is done.  If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
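 *
 * (uvm_pagelookup() implements the "back of queue" part: each
 * successful lookup re-inserts the page at the tail of vmpage_lruqueue,
 * so the head of the queue holds the coldest pages.)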
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

static int
pg_compare_key(void *ctx, const void *n, const void *key)
{
	voff_t a = ((const struct vm_page *)n)->offset;
	voff_t b = *(const voff_t *)key;

	if (a < b)
		return -1;
	else if (a > b)
		return 1;
	else
		return 0;
}

static int
pg_compare_nodes(void *ctx, const void *n1, const void *n2)
{

	return pg_compare_key(ctx, n1, &((const struct vm_page *)n2)->offset);
}

const rb_tree_ops_t uvm_page_tree_ops = {
	.rbto_compare_nodes = pg_compare_nodes,
	.rbto_compare_key = pg_compare_key,
	.rbto_node_offset = offsetof(struct vm_page, rb_node),
	.rbto_context = NULL
};

/*
 * vm pages
 */

static int
pgctor(void *arg, void *obj, int flags)
{
	struct vm_page *pg = obj;

	memset(pg, 0, sizeof(*pg));
	pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
	return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
	struct vm_page *pg = obj;

	rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;

/*
 * Called with the object locked.  We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
    int flags, int strat, int free_list)
{
	struct vm_page *pg;

	KASSERT(uobj && mutex_owned(uobj->vmobjlock));
	KASSERT(anon == NULL);

	pg = pool_cache_get(&pagecache, PR_NOWAIT);
	if (__predict_false(pg == NULL)) {
		return NULL;
	}

	pg->offset = off;
	pg->uobject = uobj;

	pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
	if (flags & UVM_PGA_ZERO) {
		uvm_pagezero(pg);
	}

	TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue);
	(void)rb_tree_insert_node(&uobj->rb_tree, pg);

	/*
	 * Don't put anons on the LRU page queue.  We can't flush them
	 * (there's no concept of swap in a rump kernel), so no reason
	 * to bother with them.
	 */
	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		atomic_inc_uint(&vmpage_onqueue);
		mutex_enter(&uvm_pageqlock);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&uvm_pageqlock);
	}

	uobj->uo_npages++;

	return pg;
}

/*
 * Release a page.
 *
 * Called with the vm object locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;

	KASSERT(mutex_owned(&uvm_pageqlock));
	KASSERT(mutex_owned(uobj->vmobjlock));

	if (pg->flags & PG_WANTED)
		wakeup(pg);

	TAILQ_REMOVE(&uobj->memq, pg, listq.queue);

	uobj->uo_npages--;
	rb_tree_remove_node(&uobj->rb_tree, pg);

	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		atomic_dec_uint(&vmpage_onqueue);
	}

	pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

	pg->flags &= ~PG_CLEAN;
	memset((void *)pg->uanon, 0, PAGE_SIZE);
}

/*
 * uvm_page_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
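 * typical use is in assertions, e.g. KASSERT(uvm_page_locked_p(pg)).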
 */

bool
uvm_page_locked_p(struct vm_page *pg)
{

	return mutex_owned(pg->uobject->vmobjlock);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;

void
uvm_init(void)
{
	char buf[64];

	if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
		unsigned long tmp;
		char *ep;
		int mult;

		tmp = strtoul(buf, &ep, 10);
		if (strlen(ep) > 1)
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

		/* mini-dehumanize-number */
		mult = 1;
		switch (*ep) {
		case 'k':
			mult = 1024;
			break;
		case 'm':
			mult = 1024*1024;
			break;
		case 'g':
			mult = 1024*1024*1024;
			break;
		case 0:
			break;
		default:
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
		}
		rump_physmemlimit = tmp * mult;

		if (rump_physmemlimit / mult != tmp)
			panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);
		/* it's not like we'd get far with, say, 1 byte, but ... */
		if (rump_physmemlimit < 1024*1024)
			printf("uvm_init: WARNING: <1MB RAM limit, "
			    "hope you know what you're doing\n");

		/* reserve some memory for the pager */
		pdlimit = rump_physmemlimit;
		rump_physmemlimit -= 2*MAXPHYS;

#define HUMANIZE_BYTES 9
		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
		dddlim = 9 * (rump_physmemlimit / 10);
	} else {
		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
	}
	aprint_verbose("total memory = %s\n", buf);

	TAILQ_INIT(&vmpage_lruqueue);

	uvmexp.free = 1024*1024; /* XXX: arbitrary & not updated */

#ifndef __uvmexp_pagesize
	uvmexp.pagesize = PAGE_SIZE;
	uvmexp.pagemask = PAGE_MASK;
	uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
	uvmexp.pageshift = FAKE_PAGE_SHIFT;
	uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
	uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

	mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_pageqlock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&pdaemoncv, "pdaemon");
	cv_init(&oomwait, "oomwait");

	module_map = &module_map_store;

	kernel_map->pmap = pmap_kernel();

	pool_subsystem_init();

	kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
	    NULL, NULL, NULL,
	    0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	vmem_subsystem_init(kmem_arena);

	kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
	    vmem_alloc, vmem_free, kmem_arena,
	    8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
	    "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);
}

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
    bool topdown)
{

	vm->vm_map.pmap = pmap_kernel();
	vm->vm_refcnt = 1;
}

void
uvm_pagewire(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

	/* nada */
}

/* where's your schmonz now? */
#define PUNLIMIT(a)	\
    p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

	PUNLIMIT(RLIMIT_STACK);
	PUNLIMIT(RLIMIT_DATA);
	PUNLIMIT(RLIMIT_RSS);
	PUNLIMIT(RLIMIT_AS);
	/* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 * We probably should grow some more assertables to make sure we're
 * not satisfying anything we shouldn't be satisfying.
 */
int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, void *handle, voff_t off, vsize_t locklim)
{
	void *uaddr;
	int error;

	if (prot != (VM_PROT_READ | VM_PROT_WRITE))
		panic("uvm_mmap() variant unsupported");
	if (flags != (MAP_PRIVATE | MAP_ANON))
		panic("uvm_mmap() variant unsupported");

	/* no reason in particular, but cf. uvm_default_mapaddr() */
	if (*addr != 0)
		panic("uvm_mmap() variant unsupported");

	if (RUMP_LOCALPROC_P(curproc)) {
		error = rumpuser_anonmmap(NULL, size, 0, 0, &uaddr);
	} else {
		error = rumpuser_sp_anonmmap(curproc->p_vmspace->vm_map.pmap,
		    size, &uaddr);
	}
	if (error)
		return error;

	*addr = (vaddr_t)uaddr;
	return 0;
}

struct pagerinfo {
	vaddr_t pgr_kva;
	int pgr_npages;
	struct vm_page **pgr_pgs;
	bool pgr_read;

	LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine.  Instead of mapping, we allocate memory
 * and copy page contents there.  Not optimal or even strictly
 * correct (the caller might modify the page contents after mapping
 * them in), but what the heck.  Assumes UVMPAGER_MAPIN_WAITOK.
 */
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	/* allocate structures */
	pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
	pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
	pgri->pgr_npages = npages;
	pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
	pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

	/* copy contents to "mapped" memory */
	for (i = 0, curkva = pgri->pgr_kva;
	    i < npages;
	    i++, curkva += PAGE_SIZE) {
		/*
		 * We need to copy the previous contents of the pages to
		 * the window even if we are reading from the
		 * device, since the device might not fill the contents of
		 * the full mapped range and we will end up corrupting
		 * data when we unmap the window.
		 */
		memcpy((void*)curkva, pgs[i]->uanon, PAGE_SIZE);
		pgri->pgr_pgs[i] = pgs[i];
	}

	mutex_enter(&pagermtx);
	LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
	mutex_exit(&pagermtx);

	return pgri->pgr_kva;
}

/*
 * map out the pager window.  return contents from VA to page storage
 * and free structures.
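 *
 * The kva argument must be a value previously returned by
 * uvm_pagermapin().  For windows mapped with UVMPAGER_MAPIN_READ the
 * contents are copied back from the window to the pages before the
 * window is freed.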
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva == kva)
			break;
	}
	KASSERT(pgri);
	if (pgri->pgr_npages != npages)
		panic("uvm_pagermapout: partial unmapping not supported");
	LIST_REMOVE(pgri, pgr_entries);
	mutex_exit(&pagermtx);

	if (pgri->pgr_read) {
		for (i = 0, curkva = pgri->pgr_kva;
		    i < pgri->pgr_npages;
		    i++, curkva += PAGE_SIZE) {
			memcpy(pgri->pgr_pgs[i]->uanon,(void*)curkva,PAGE_SIZE);
		}
	}

	kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
	kmem_free((void*)pgri->pgr_kva, npages * PAGE_SIZE);
	kmem_free(pgri, sizeof(*pgri));
}

/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
	struct pagerinfo *pgri;
	struct vm_page *pg = NULL;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva <= va
		    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
			break;
	}
	if (pgri) {
		i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
		pg = pgri->pgr_pgs[i];
	}
	mutex_exit(&pagermtx);

	return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout.  Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;
	bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

	pg = rb_tree_find_node(&uobj->rb_tree, &off);
	if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
		mutex_enter(&uvm_pageqlock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&uvm_pageqlock);
	}

	return pg;
}

void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i;

	KASSERT(npgs > 0);
	KASSERT(mutex_owned(pgs[0]->uobject->vmobjlock));

	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL)
			continue;

		KASSERT(pg->flags & PG_BUSY);
		if (pg->flags & PG_WANTED)
			wakeup(pg);
		if (pg->flags & PG_RELEASED)
			uvm_pagefree(pg);
		else
			pg->flags &= ~(PG_WANTED|PG_BUSY);
	}
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	/* XXX: guessing game */
	*active = 1024;
	*inactive = 1024;
}

bool
vm_map_starved_p(struct vm_map *map)
{

	if (map->flags & VM_MAP_WANTVA)
		return true;

	return false;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

	panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
    struct vm_page **opp)
{

	return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

#ifdef DEBUGPRINT
void
uvm_object_printit(struct uvm_object *uobj, bool full,
    void (*pr)(const char *, ...))
{

	pr("VM OBJECT at %p, refs %d", uobj, uobj->uo_refs);
}
#endif

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
    vm_prot_t prot, bool set_max)
{

	return EOPNOTSUPP;
}

/*
 * UVM km
 */

vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
	void *rv, *desired = NULL;
	int alignbit, error;

#ifdef __x86_64__
	/*
	 * On amd64, allocate all module memory from the lowest 2GB.
	 * This is because NetBSD kernel modules are compiled
	 * with -mcmodel=kernel and reserve only 4 bytes for
	 * offsets.  If we load code compiled with -mcmodel=kernel
	 * anywhere except the lowest or highest 2GB, it will not
	 * work.  Since userspace does not have access to the highest
	 * 2GB, use the lowest 2GB.
	 *
	 * Note: this assumes the rump kernel resides in
	 * the lowest 2GB as well.
	 *
	 * Note2: yes, it's a quick hack, but since this is the only
	 * place where we care about the map we're allocating from,
	 * just use a simple "if" instead of coming up with a fancy
	 * generic solution.
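	 *
	 * (The "4 bytes" are sign-extended 32-bit displacements, which
	 * can only reach the lowest and the highest 2GB of the address
	 * space; hence the low-2GB hint computed below.)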
	 */
	if (map == module_map) {
		desired = (void *)(0x80000000 - size);
	}
#endif

	if (__predict_false(map == module_map)) {
		alignbit = 0;
		if (align) {
			alignbit = ffs(align)-1;
		}
		error = rumpuser_anonmmap(desired, size, alignbit,
		    flags & UVM_KMF_EXEC, &rv);
	} else {
		error = rumpuser_malloc(size, align, &rv);
	}

	if (error) {
		if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
			return 0;
		else
			panic("uvm_km_alloc failed");
	}

	if (flags & UVM_KMF_ZERO)
		memset(rv, 0, size);

	return (vaddr_t)rv;
}

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

	if (__predict_false(map == module_map))
		rumpuser_unmap((void *)vaddr, size);
	else
		rumpuser_free((void *)vaddr, size);
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
    vsize_t size, int pageable, bool fixed, struct vm_map *submap)
{

	return (struct vm_map *)417416;
}

int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
    vmem_addr_t *addr)
{
	vaddr_t va;
	va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
	    (flags & VM_SLEEP), "kmalloc");

	if (va) {
		*addr = va;
		return 0;
	} else {
		return ENOMEM;
	}
}

void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{

	rump_hyperfree((void *)addr, size);
}

/*
 * VM space locking routines.  We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

	return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
int
vmapbuf(struct buf *bp, vsize_t len)
{
	int error = 0;

	bp->b_saveaddr = bp->b_data;

	/* remote case */
	if (!RUMP_LOCALPROC_P(curproc)) {
		bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
		if (BUF_ISWRITE(bp)) {
			error = copyin(bp->b_saveaddr, bp->b_data, len);
			if (error) {
				rump_hyperfree(bp->b_data, len);
				bp->b_data = bp->b_saveaddr;
				bp->b_saveaddr = 0;
			}
		}
	}

	return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

	/* remote case */
	if (!RUMP_LOCALPROC_P(bp->b_proc)) {
		if (BUF_ISREAD(bp)) {
			bp->b_error = copyout_proc(bp->b_proc,
			    bp->b_data, bp->b_saveaddr, len);
		}
		rump_hyperfree(bp->b_data, len);
	}

	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = 0;
}

void
uvmspace_addref(struct vmspace *vm)
{

	/*
	 * No dynamically allocated vmspaces exist.
	 */
}

void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}

/*
 * page life cycle stuff.  it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

	return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return 0;
}

/*
 * Routines related to the Page Baroness.
 */

void
uvm_wait(const char *msg)
{

	if (__predict_false(rump_threads == 0))
		panic("pagedaemon missing (RUMP_THREADS = 0)");

	if (curlwp == uvm.pagedaemon_lwp) {
		/* is it possible for us to later get memory? */
		if (!uvmexp.paging)
			panic("pagedaemon out of memory");
	}

	mutex_enter(&pdaemonmtx);
	pdaemon_waiters++;
	cv_signal(&pdaemoncv);
	cv_wait(&oomwait, &pdaemonmtx);
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_start(int npages)
{

	mutex_enter(&pdaemonmtx);
	uvmexp.paging += npages;
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

	if (!npages)
		return;

	mutex_enter(&pdaemonmtx);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	if (pdaemon_waiters) {
		pdaemon_waiters = 0;
		cv_broadcast(&oomwait);
	}
	mutex_exit(&pdaemonmtx);
}

static bool
processpage(struct vm_page *pg, bool *lockrunning)
{
	struct uvm_object *uobj;

	uobj = pg->uobject;
	if (mutex_tryenter(uobj->vmobjlock)) {
		if ((pg->flags & PG_BUSY) == 0) {
			mutex_exit(&uvm_pageqlock);
			uobj->pgops->pgo_put(uobj, pg->offset,
			    pg->offset + PAGE_SIZE,
			    PGO_CLEANIT|PGO_FREE);
			KASSERT(!mutex_owned(uobj->vmobjlock));
			return true;
		} else {
			mutex_exit(uobj->vmobjlock);
		}
	} else if (*lockrunning == false && ncpu > 1) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		struct lwp *l;

		l = mutex_owner(uobj->vmobjlock);
		for (CPU_INFO_FOREACH(cii, ci)) {
			if (ci->ci_curlwp == l) {
				*lockrunning = true;
				break;
			}
		}
	}

	return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
	struct vm_page *pg;
	struct pool *pp, *pp_first;
	int cleaned, skip, skipped;
	bool succ;
	bool lockrunning;

	mutex_enter(&pdaemonmtx);
	for (;;) {
		if (!NEED_PAGEDAEMON()) {
			kernel_map->flags &= ~VM_MAP_WANTVA;
		}

		if (pdaemon_waiters) {
			pdaemon_waiters = 0;
			cv_broadcast(&oomwait);
		}

		cv_wait(&pdaemoncv, &pdaemonmtx);
		uvmexp.pdwoke++;

		/* tell the world that we are hungry */
		kernel_map->flags |= VM_MAP_WANTVA;
		mutex_exit(&pdaemonmtx);

		/*
		 * step one: reclaim the page cache.  this should give
		 * us the biggest earnings since whole pages are released
		 * into backing memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Ok, so that didn't help.  Next, try to hunt memory
		 * by pushing out vnode pages.  The pages might contain
		 * useful cached data, but we need the memory.
		 */
		cleaned = 0;
		skip = 0;
		lockrunning = false;
 again:
		mutex_enter(&uvm_pageqlock);
		while (cleaned < PAGEDAEMON_OBJCHUNK) {
			skipped = 0;
			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

				/*
				 * skip over pages we _might_ have tried
				 * to handle earlier.  they might not be
				 * exactly the same ones, but I'm not too
				 * concerned.
				 */
				while (skipped++ < skip)
					continue;

				if (processpage(pg, &lockrunning)) {
					cleaned++;
					goto again;
				}

				skip++;
			}
			break;
		}
		mutex_exit(&uvm_pageqlock);

		/*
		 * Ok, someone is running with an object lock held.
		 * We want to yield the host CPU to make sure the
		 * thread is not parked on the host.  Since sched_yield()
		 * doesn't appear to do anything on NetBSD, nanosleep
		 * for the smallest possible time and hope we're back in
		 * the game soon.
		 */
		if (cleaned == 0 && lockrunning) {
			rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL, 0, 1);

			lockrunning = false;
			skip = 0;

			/* and here we go again */
			goto again;
		}

		/*
		 * And of course we need to reclaim the page cache
		 * again to actually release memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * And then drain the pools.  Wipe them out ... all of them.
		 */
		for (pp_first = NULL;;) {
			if (rump_vfs_drainbufs)
				rump_vfs_drainbufs(10 /* XXX: estimate! */);

			succ = pool_drain(&pp);
			if (succ || pp == pp_first)
				break;

			if (pp_first == NULL)
				pp_first = pp;
		}

		/*
		 * Need to use PYEC on our bag of tricks.
		 * Unfortunately, the wife just borrowed it.
		 */

		mutex_enter(&pdaemonmtx);
		if (!succ && cleaned == 0 && pdaemon_waiters &&
		    uvmexp.paging == 0) {
			rumpuser_dprintf("pagedaemoness: failed to reclaim "
			    "memory ... sleeping (deadlock?)\n");
			cv_timedwait(&pdaemoncv, &pdaemonmtx, hz);
		}
	}

	panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

	/*
	 * Wake up the diabolical pagedaemon director if we are over
	 * 90% of the memory limit.  This is a complete and utter
	 * stetson-harrison decision which you are allowed to finetune.
	 * Don't bother locking.  If we have some unflushed caches,
	 * other waker-uppers will deal with the issue.
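	 *
	 * (The 90% figure is dddlim, computed in uvm_init() as
	 * 9 * (rump_physmemlimit / 10) and checked by NEED_PAGEDAEMON().)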
	 */
	if (NEED_PAGEDAEMON()) {
		cv_signal(&pdaemoncv);
	}
}

void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
	unsigned long newmem;
	void *rv;
	int error;

	uvm_kick_pdaemon(); /* ouch */

	/* first we must be within the limit */
 limitagain:
	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		newmem = atomic_add_long_nv(&curphysmem, howmuch);
		if ((newmem > rump_physmemlimit) &&
		    (curlwp != uvm.pagedaemon_lwp || newmem > pdlimit)) {
			newmem = atomic_add_long_nv(&curphysmem, -howmuch);
			if (!waitok) {
				return NULL;
			}
			uvm_wait(wmsg);
			goto limitagain;
		}
	}

	/* second, we must get something from the backend */
 again:
	error = rumpuser_malloc(howmuch, alignment, &rv);
	if (__predict_false(error && waitok)) {
		uvm_wait(wmsg);
		goto again;
	}

	return rv;
}

void
rump_hyperfree(void *what, size_t size)
{

	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		atomic_add_long(&curphysmem, -size);
	}
	rumpuser_free(what, size);
}