/*	$NetBSD: vm.c,v 1.156 2014/04/25 13:20:45 pooka Exp $	*/

/*
 * Copyright (c) 2007-2011 Antti Kantee.  All Rights Reserved.
 *
 * Development of this software was supported by
 * The Finnish Cultural Foundation and the Research Foundation of
 * The Helsinki University of Technology.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS
 * OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
 * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
 * DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
 * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

/*
 * Virtual memory emulation routines.
 */

/*
 * XXX: we abuse pg->uanon for the virtual address of the storage
 * for each page.  phys_addr would fit the job description better,
 * except that it will create unnecessary lossage on some platforms
 * due to not being a pointer type.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: vm.c,v 1.156 2014/04/25 13:20:45 pooka Exp $");

#include <sys/param.h>
#include <sys/atomic.h>
#include <sys/buf.h>
#include <sys/kernel.h>
#include <sys/kmem.h>
#include <sys/vmem.h>
#include <sys/mman.h>
#include <sys/null.h>
#include <sys/vnode.h>

#include <machine/pmap.h>

#include <rump/rumpuser.h>

#include <uvm/uvm.h>
#include <uvm/uvm_ddb.h>
#include <uvm/uvm_pdpolicy.h>
#include <uvm/uvm_prot.h>
#include <uvm/uvm_readahead.h>

#include "rump_private.h"
#include "rump_vfs_private.h"

kmutex_t uvm_pageqlock;		/* non-free page lock */
kmutex_t uvm_fpageqlock;	/* free page lock, non-gpl license */
kmutex_t uvm_swap_data_lock;

struct uvmexp uvmexp;
struct uvm uvm;

#ifdef __uvmexp_pagesize
const int * const uvmexp_pagesize = &uvmexp.pagesize;
const int * const uvmexp_pagemask = &uvmexp.pagemask;
const int * const uvmexp_pageshift = &uvmexp.pageshift;
#endif

struct vm_map rump_vmmap;

static struct vm_map kernel_map_store;
struct vm_map *kernel_map = &kernel_map_store;

static struct vm_map module_map_store;
extern struct vm_map *module_map;

vmem_t *kmem_arena;
vmem_t *kmem_va_arena;

static unsigned int pdaemon_waiters;
static kmutex_t pdaemonmtx;
static kcondvar_t pdaemoncv, oomwait;

unsigned long rump_physmemlimit = RUMPMEM_UNLIMITED;
static unsigned long pdlimit = RUMPMEM_UNLIMITED; /* page daemon memlimit */
static unsigned long curphysmem;
static unsigned long dddlim;		/* 90% of memory limit used */
#define NEED_PAGEDAEMON() \
	(rump_physmemlimit != RUMPMEM_UNLIMITED && curphysmem > dddlim)
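
/*
 * Example: with the POSIX hypercall implementation the limit above is
 * typically given via the host environment before rump_init(), e.g.
 * RUMP_MEMLIMIT=16m.  uvm_init() parses an optional k/m/g suffix,
 * sets pdlimit to the full value, reserves 2*MAXPHYS of it for the
 * pagedaemon and puts dddlim at the 90% mark; with a 16m limit and a
 * 64kB MAXPHYS the pagedaemon thus kicks in around 14.3MB.
 * (Illustrative numbers only; MAXPHYS is machine-dependent.)
 */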

/*
 * Try to free two pages worth of pages from objects.
 * If this successfully frees a full page cache page, we'll
 * free the released page plus PAGE_SIZE/sizeof(vm_page).
 */
#define PAGEDAEMON_OBJCHUNK (2*PAGE_SIZE / sizeof(struct vm_page))

/*
 * Keep a list of least recently used pages.  Since the only way a
 * rump kernel can "access" a page is via lookup, we put the page
 * at the back of queue every time a lookup for it is done.  If the
 * page is in front of this global queue and we're short of memory,
 * it's a candidate for pageout.
 */
static struct pglist vmpage_lruqueue;
static unsigned vmpage_onqueue;

static int
pg_compare_key(void *ctx, const void *n, const void *key)
{
	voff_t a = ((const struct vm_page *)n)->offset;
	voff_t b = *(const voff_t *)key;

	if (a < b)
		return -1;
	else if (a > b)
		return 1;
	else
		return 0;
}

static int
pg_compare_nodes(void *ctx, const void *n1, const void *n2)
{

	return pg_compare_key(ctx, n1, &((const struct vm_page *)n2)->offset);
}

const rb_tree_ops_t uvm_page_tree_ops = {
	.rbto_compare_nodes = pg_compare_nodes,
	.rbto_compare_key = pg_compare_key,
	.rbto_node_offset = offsetof(struct vm_page, rb_node),
	.rbto_context = NULL
};

/*
 * vm pages
 */

static int
pgctor(void *arg, void *obj, int flags)
{
	struct vm_page *pg = obj;

	memset(pg, 0, sizeof(*pg));
	pg->uanon = rump_hypermalloc(PAGE_SIZE, PAGE_SIZE,
	    (flags & PR_WAITOK) == PR_WAITOK, "pgalloc");
	return pg->uanon == NULL;
}

static void
pgdtor(void *arg, void *obj)
{
	struct vm_page *pg = obj;

	rump_hyperfree(pg->uanon, PAGE_SIZE);
}

static struct pool_cache pagecache;

/*
 * Called with the object locked.  We don't support anons.
 */
struct vm_page *
uvm_pagealloc_strat(struct uvm_object *uobj, voff_t off, struct vm_anon *anon,
    int flags, int strat, int free_list)
{
	struct vm_page *pg;

	KASSERT(uobj && mutex_owned(uobj->vmobjlock));
	KASSERT(anon == NULL);

	pg = pool_cache_get(&pagecache, PR_NOWAIT);
	if (__predict_false(pg == NULL)) {
		return NULL;
	}

	pg->offset = off;
	pg->uobject = uobj;

	pg->flags = PG_CLEAN|PG_BUSY|PG_FAKE;
	if (flags & UVM_PGA_ZERO) {
		uvm_pagezero(pg);
	}

	TAILQ_INSERT_TAIL(&uobj->memq, pg, listq.queue);
	(void)rb_tree_insert_node(&uobj->rb_tree, pg);

	/*
	 * Don't put anons on the LRU page queue.  We can't flush them
	 * (there's no concept of swap in a rump kernel), so no reason
	 * to bother with them.
	 */
	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		atomic_inc_uint(&vmpage_onqueue);
		mutex_enter(&uvm_pageqlock);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&uvm_pageqlock);
	}

	uobj->uo_npages++;

	return pg;
}

/*
 * Release a page.
 *
 * Called with the vm object locked.
 */
void
uvm_pagefree(struct vm_page *pg)
{
	struct uvm_object *uobj = pg->uobject;

	KASSERT(mutex_owned(&uvm_pageqlock));
	KASSERT(mutex_owned(uobj->vmobjlock));

	if (pg->flags & PG_WANTED)
		wakeup(pg);

	TAILQ_REMOVE(&uobj->memq, pg, listq.queue);

	uobj->uo_npages--;
	rb_tree_remove_node(&uobj->rb_tree, pg);

	if (!UVM_OBJ_IS_AOBJ(uobj)) {
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		atomic_dec_uint(&vmpage_onqueue);
	}

	pool_cache_put(&pagecache, pg);
}

void
uvm_pagezero(struct vm_page *pg)
{

	pg->flags &= ~PG_CLEAN;
	memset((void *)pg->uanon, 0, PAGE_SIZE);
}
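
/*
 * Illustrative only: a minimal caller of the routines above holds the
 * object lock for the allocation and additionally uvm_pageqlock for
 * the free, matching the KASSERTs.  (Names below are placeholders.)
 *
 *	mutex_enter(uobj->vmobjlock);
 *	pg = uvm_pagealloc(uobj, off, NULL, UVM_PGA_ZERO);
 *	... access the backing store via pg->uanon ...
 *	mutex_enter(&uvm_pageqlock);
 *	uvm_pagefree(pg);
 *	mutex_exit(&uvm_pageqlock);
 *	mutex_exit(uobj->vmobjlock);
 */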

/*
 * uvm_page_locked_p: return true if object associated with page is
 * locked.  this is a weak check for runtime assertions only.
 */

bool
uvm_page_locked_p(struct vm_page *pg)
{

	return mutex_owned(pg->uobject->vmobjlock);
}

/*
 * Misc routines
 */

static kmutex_t pagermtx;

void
uvm_init(void)
{
	char buf[64];

	if (rumpuser_getparam("RUMP_MEMLIMIT", buf, sizeof(buf)) == 0) {
		unsigned long tmp;
		char *ep;
		int mult;

		tmp = strtoul(buf, &ep, 10);
		if (strlen(ep) > 1)
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);

		/* mini-dehumanize-number */
		mult = 1;
		switch (*ep) {
		case 'k':
			mult = 1024;
			break;
		case 'm':
			mult = 1024*1024;
			break;
		case 'g':
			mult = 1024*1024*1024;
			break;
		case 0:
			break;
		default:
			panic("uvm_init: invalid RUMP_MEMLIMIT: %s", buf);
		}
		rump_physmemlimit = tmp * mult;

		if (rump_physmemlimit / mult != tmp)
			panic("uvm_init: RUMP_MEMLIMIT overflow: %s", buf);
		/* it's not like we'd get far with, say, 1 byte, but ... */
		if (rump_physmemlimit < 1024*1024)
			printf("uvm_init: WARNING: <1MB RAM limit, "
			    "hope you know what you're doing\n");

		/* reserve some memory for the pager */
		pdlimit = rump_physmemlimit;
		rump_physmemlimit -= 2*MAXPHYS;

#define HUMANIZE_BYTES 9
		CTASSERT(sizeof(buf) >= HUMANIZE_BYTES);
		format_bytes(buf, HUMANIZE_BYTES, rump_physmemlimit);
#undef HUMANIZE_BYTES
		dddlim = 9 * (rump_physmemlimit / 10);
	} else {
		strlcpy(buf, "unlimited (host limit)", sizeof(buf));
	}
	aprint_verbose("total memory = %s\n", buf);

	TAILQ_INIT(&vmpage_lruqueue);

	uvmexp.free = 1024*1024; /* XXX: arbitrary & not updated */

#ifndef __uvmexp_pagesize
	uvmexp.pagesize = PAGE_SIZE;
	uvmexp.pagemask = PAGE_MASK;
	uvmexp.pageshift = PAGE_SHIFT;
#else
#define FAKE_PAGE_SHIFT 12
	uvmexp.pageshift = FAKE_PAGE_SHIFT;
	uvmexp.pagesize = 1<<FAKE_PAGE_SHIFT;
	uvmexp.pagemask = (1<<FAKE_PAGE_SHIFT)-1;
#undef FAKE_PAGE_SHIFT
#endif

	mutex_init(&pagermtx, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_pageqlock, MUTEX_DEFAULT, IPL_NONE);
	mutex_init(&uvm_swap_data_lock, MUTEX_DEFAULT, IPL_NONE);

	/* just to appease linkage */
	mutex_init(&uvm_fpageqlock, MUTEX_SPIN, IPL_VM);

	mutex_init(&pdaemonmtx, MUTEX_DEFAULT, IPL_NONE);
	cv_init(&pdaemoncv, "pdaemon");
	cv_init(&oomwait, "oomwait");

	module_map = &module_map_store;

	kernel_map->pmap = pmap_kernel();

	pool_subsystem_init();

	kmem_arena = vmem_create("kmem", 0, 1024*1024, PAGE_SIZE,
	    NULL, NULL, NULL,
	    0, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	vmem_subsystem_init(kmem_arena);

	kmem_va_arena = vmem_create("kva", 0, 0, PAGE_SIZE,
	    vmem_alloc, vmem_free, kmem_arena,
	    8 * PAGE_SIZE, VM_NOSLEEP | VM_BOOTSTRAP, IPL_VM);

	pool_cache_bootstrap(&pagecache, sizeof(struct vm_page), 0, 0, 0,
	    "page$", NULL, IPL_NONE, pgctor, pgdtor, NULL);
}

void
uvmspace_init(struct vmspace *vm, struct pmap *pmap, vaddr_t vmin, vaddr_t vmax,
    bool topdown)
{

	vm->vm_map.pmap = pmap_kernel();
	vm->vm_refcnt = 1;
}

void
uvm_pagewire(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageunwire(struct vm_page *pg)
{

	/* nada */
}

/* where's your schmonz now? */
#define PUNLIMIT(a)	\
p->p_rlimit[a].rlim_cur = p->p_rlimit[a].rlim_max = RLIM_INFINITY;
void
uvm_init_limits(struct proc *p)
{

#ifndef DFLSSIZ
#define DFLSSIZ (16*1024*1024)
#endif
	p->p_rlimit[RLIMIT_STACK].rlim_cur = DFLSSIZ;
	p->p_rlimit[RLIMIT_STACK].rlim_max = MAXSSIZ;
	PUNLIMIT(RLIMIT_DATA);
	PUNLIMIT(RLIMIT_RSS);
	PUNLIMIT(RLIMIT_AS);
	/* nice, cascade */
}
#undef PUNLIMIT

/*
 * This satisfies the "disgusting mmap hack" used by proplib.
 * We probably should grow some more assertables to make sure we're
 * not satisfying anything we shouldn't be satisfying.
 */
int
uvm_mmap(struct vm_map *map, vaddr_t *addr, vsize_t size, vm_prot_t prot,
    vm_prot_t maxprot, int flags, void *handle, voff_t off, vsize_t locklim)
{
	void *uaddr;
	int error;

	if (prot != (VM_PROT_READ | VM_PROT_WRITE))
		panic("uvm_mmap() variant unsupported");
	if (flags != (MAP_PRIVATE | MAP_ANON))
		panic("uvm_mmap() variant unsupported");

	/* no reason in particular, but cf. uvm_default_mapaddr() */
	if (*addr != 0)
		panic("uvm_mmap() variant unsupported");

	if (RUMP_LOCALPROC_P(curproc)) {
		error = rumpuser_anonmmap(NULL, size, 0, 0, &uaddr);
	} else {
		error = rumpuser_sp_anonmmap(curproc->p_vmspace->vm_map.pmap,
		    size, &uaddr);
	}
	if (error)
		return error;

	*addr = (vaddr_t)uaddr;
	return 0;
}
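
/*
 * Illustrative only: the single call shape the above accepts is an
 * anonymous, private, read/write mapping with no address hint,
 * roughly what proplib's mmap hack ends up doing:
 *
 *	vaddr_t va = 0;
 *	error = uvm_mmap(&p->p_vmspace->vm_map, &va, round_page(len),
 *	    VM_PROT_READ | VM_PROT_WRITE, VM_PROT_READ | VM_PROT_WRITE,
 *	    MAP_PRIVATE | MAP_ANON, NULL, 0, 0);
 *
 * Any other combination hits one of the panics above.
 */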

struct pagerinfo {
	vaddr_t pgr_kva;
	int pgr_npages;
	struct vm_page **pgr_pgs;
	bool pgr_read;

	LIST_ENTRY(pagerinfo) pgr_entries;
};
static LIST_HEAD(, pagerinfo) pagerlist = LIST_HEAD_INITIALIZER(pagerlist);

/*
 * Pager "map" in routine.  Instead of mapping, we allocate memory
 * and copy page contents there.  Not optimal or even strictly
 * correct (the caller might modify the page contents after mapping
 * them in), but what the heck.  Assumes UVMPAGER_MAPIN_WAITOK.
 */
vaddr_t
uvm_pagermapin(struct vm_page **pgs, int npages, int flags)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	/* allocate structures */
	pgri = kmem_alloc(sizeof(*pgri), KM_SLEEP);
	pgri->pgr_kva = (vaddr_t)kmem_alloc(npages * PAGE_SIZE, KM_SLEEP);
	pgri->pgr_npages = npages;
	pgri->pgr_pgs = kmem_alloc(sizeof(struct vm_page *) * npages, KM_SLEEP);
	pgri->pgr_read = (flags & UVMPAGER_MAPIN_READ) != 0;

	/* copy contents to "mapped" memory */
	for (i = 0, curkva = pgri->pgr_kva;
	    i < npages;
	    i++, curkva += PAGE_SIZE) {
		/*
		 * We need to copy the previous contents of the pages to
		 * the window even if we are reading from the
		 * device, since the device might not fill the contents of
		 * the full mapped range and we will end up corrupting
		 * data when we unmap the window.
		 */
		memcpy((void *)curkva, pgs[i]->uanon, PAGE_SIZE);
		pgri->pgr_pgs[i] = pgs[i];
	}

	mutex_enter(&pagermtx);
	LIST_INSERT_HEAD(&pagerlist, pgri, pgr_entries);
	mutex_exit(&pagermtx);

	return pgri->pgr_kva;
}

/*
 * map out the pager window.  return contents from VA to page storage
 * and free structures.
 *
 * Note: does not currently support partial frees
 */
void
uvm_pagermapout(vaddr_t kva, int npages)
{
	struct pagerinfo *pgri;
	vaddr_t curkva;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva == kva)
			break;
	}
	KASSERT(pgri);
	if (pgri->pgr_npages != npages)
		panic("uvm_pagermapout: partial unmapping not supported");
	LIST_REMOVE(pgri, pgr_entries);
	mutex_exit(&pagermtx);

	if (pgri->pgr_read) {
		for (i = 0, curkva = pgri->pgr_kva;
		    i < pgri->pgr_npages;
		    i++, curkva += PAGE_SIZE) {
			memcpy(pgri->pgr_pgs[i]->uanon,
			    (void *)curkva, PAGE_SIZE);
		}
	}

	kmem_free(pgri->pgr_pgs, npages * sizeof(struct vm_page *));
	kmem_free((void *)pgri->pgr_kva, npages * PAGE_SIZE);
	kmem_free(pgri, sizeof(*pgri));
}
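
/*
 * Illustrative only: a pager backend brackets its I/O with the two
 * routines above, roughly
 *
 *	kva = uvm_pagermapin(pgs, npages, UVMPAGER_MAPIN_WAITOK | dir);
 *	... do I/O on the linear buffer at kva ...
 *	uvm_pagermapout(kva, npages);
 *
 * where "dir" is UVMPAGER_MAPIN_READ when the device fills the buffer;
 * in that case the mapout step copies the buffer back into the pages,
 * otherwise the copy made at mapin time is simply thrown away.
 */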

/*
 * convert va in pager window to page structure.
 * XXX: how expensive is this (global lock, list traversal)?
 */
struct vm_page *
uvm_pageratop(vaddr_t va)
{
	struct pagerinfo *pgri;
	struct vm_page *pg = NULL;
	int i;

	mutex_enter(&pagermtx);
	LIST_FOREACH(pgri, &pagerlist, pgr_entries) {
		if (pgri->pgr_kva <= va
		    && va < pgri->pgr_kva + pgri->pgr_npages*PAGE_SIZE)
			break;
	}
	if (pgri) {
		i = (va - pgri->pgr_kva) >> PAGE_SHIFT;
		pg = pgri->pgr_pgs[i];
	}
	mutex_exit(&pagermtx);

	return pg;
}

/*
 * Called with the vm object locked.
 *
 * Put vnode object pages at the end of the access queue to indicate
 * they have been recently accessed and should not be immediate
 * candidates for pageout.  Do not do this for lookups done by
 * the pagedaemon to mimic pmap_kentered mappings which don't track
 * access information.
 */
struct vm_page *
uvm_pagelookup(struct uvm_object *uobj, voff_t off)
{
	struct vm_page *pg;
	bool ispagedaemon = curlwp == uvm.pagedaemon_lwp;

	pg = rb_tree_find_node(&uobj->rb_tree, &off);
	if (pg && !UVM_OBJ_IS_AOBJ(pg->uobject) && !ispagedaemon) {
		mutex_enter(&uvm_pageqlock);
		TAILQ_REMOVE(&vmpage_lruqueue, pg, pageq.queue);
		TAILQ_INSERT_TAIL(&vmpage_lruqueue, pg, pageq.queue);
		mutex_exit(&uvm_pageqlock);
	}

	return pg;
}
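
/*
 * Unbusy a set of pages: wake up anyone sleeping on PG_WANTED, clear
 * PG_BUSY and free pages which were flagged PG_RELEASED while busy.
 * Called with the owning object locked.
 */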
void
uvm_page_unbusy(struct vm_page **pgs, int npgs)
{
	struct vm_page *pg;
	int i;

	KASSERT(npgs > 0);
	KASSERT(mutex_owned(pgs[0]->uobject->vmobjlock));

	for (i = 0; i < npgs; i++) {
		pg = pgs[i];
		if (pg == NULL)
			continue;

		KASSERT(pg->flags & PG_BUSY);
		if (pg->flags & PG_WANTED)
			wakeup(pg);
		if (pg->flags & PG_RELEASED)
			uvm_pagefree(pg);
		else
			pg->flags &= ~(PG_WANTED|PG_BUSY);
	}
}

void
uvm_estimatepageable(int *active, int *inactive)
{

	/* XXX: guessing game */
	*active = 1024;
	*inactive = 1024;
}

bool
vm_map_starved_p(struct vm_map *map)
{

	if (map->flags & VM_MAP_WANTVA)
		return true;

	return false;
}

int
uvm_loan(struct vm_map *map, vaddr_t start, vsize_t len, void *v, int flags)
{

	panic("%s: unimplemented", __func__);
}

void
uvm_unloan(void *v, int npages, int flags)
{

	panic("%s: unimplemented", __func__);
}

int
uvm_loanuobjpages(struct uvm_object *uobj, voff_t pgoff, int orignpages,
    struct vm_page **opp)
{

	return EBUSY;
}

struct vm_page *
uvm_loanbreak(struct vm_page *pg)
{

	panic("%s: unimplemented", __func__);
}

void
ubc_purge(struct uvm_object *uobj)
{

}

vaddr_t
uvm_default_mapaddr(struct proc *p, vaddr_t base, vsize_t sz)
{

	return 0;
}

int
uvm_map_protect(struct vm_map *map, vaddr_t start, vaddr_t end,
    vm_prot_t prot, bool set_max)
{

	return EOPNOTSUPP;
}

/*
 * UVM km
 */
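
/*
 * Kernel memory is backed directly by host memory: module_map
 * allocations go through anonymous host mappings (so that they can be
 * made executable and placed where modules require), everything else
 * is plain rumpuser_malloc().
 */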

vaddr_t
uvm_km_alloc(struct vm_map *map, vsize_t size, vsize_t align, uvm_flag_t flags)
{
	void *rv, *desired = NULL;
	int alignbit, error;

#ifdef __x86_64__
	/*
	 * On amd64, allocate all module memory from the lowest 2GB.
	 * This is because NetBSD kernel modules are compiled
	 * with -mcmodel=kernel and reserve only 4 bytes for
	 * offsets.  If we load code compiled with -mcmodel=kernel
	 * anywhere except the lowest or highest 2GB, it will not
	 * work.  Since userspace does not have access to the highest
	 * 2GB, use the lowest 2GB.
	 *
	 * Note: this assumes the rump kernel resides in
	 * the lowest 2GB as well.
	 *
	 * Note2: yes, it's a quick hack, but since this is the only
	 * place where we care about the map we're allocating from,
	 * just use a simple "if" instead of coming up with a fancy
	 * generic solution.
	 */
	if (map == module_map) {
		desired = (void *)(0x80000000 - size);
	}
#endif

	if (__predict_false(map == module_map)) {
		alignbit = 0;
		if (align) {
			alignbit = ffs(align)-1;
		}
		error = rumpuser_anonmmap(desired, size, alignbit,
		    flags & UVM_KMF_EXEC, &rv);
	} else {
		error = rumpuser_malloc(size, align, &rv);
	}

	if (error) {
		if (flags & (UVM_KMF_CANFAIL | UVM_KMF_NOWAIT))
			return 0;
		else
			panic("uvm_km_alloc failed");
	}

	if (flags & UVM_KMF_ZERO)
		memset(rv, 0, size);

	return (vaddr_t)rv;
}

void
uvm_km_free(struct vm_map *map, vaddr_t vaddr, vsize_t size, uvm_flag_t flags)
{

	if (__predict_false(map == module_map))
		rumpuser_unmap((void *)vaddr, size);
	else
		rumpuser_free((void *)vaddr, size);
}

struct vm_map *
uvm_km_suballoc(struct vm_map *map, vaddr_t *minaddr, vaddr_t *maxaddr,
    vsize_t size, int pageable, bool fixed, struct vm_map *submap)
{

	return (struct vm_map *)417416;
}

int
uvm_km_kmem_alloc(vmem_t *vm, vmem_size_t size, vm_flag_t flags,
    vmem_addr_t *addr)
{
	vaddr_t va;
	va = (vaddr_t)rump_hypermalloc(size, PAGE_SIZE,
	    (flags & VM_SLEEP), "kmalloc");

	if (va) {
		*addr = va;
		return 0;
	} else {
		return ENOMEM;
	}
}

void
uvm_km_kmem_free(vmem_t *vm, vmem_addr_t addr, vmem_size_t size)
{

	rump_hyperfree((void *)addr, size);
}

/*
 * VM space locking routines.  We don't really have to do anything,
 * since the pages are always "wired" (both local and remote processes).
 */
int
uvm_vslock(struct vmspace *vs, void *addr, size_t len, vm_prot_t access)
{

	return 0;
}

void
uvm_vsunlock(struct vmspace *vs, void *addr, size_t len)
{

}

/*
 * For the local case the buffer mappers don't need to do anything.
 * For the remote case we need to reserve space and copy data in or
 * out, depending on B_READ/B_WRITE.
 */
int
vmapbuf(struct buf *bp, vsize_t len)
{
	int error = 0;

	bp->b_saveaddr = bp->b_data;

	/* remote case */
	if (!RUMP_LOCALPROC_P(curproc)) {
		bp->b_data = rump_hypermalloc(len, 0, true, "vmapbuf");
		if (BUF_ISWRITE(bp)) {
			error = copyin(bp->b_saveaddr, bp->b_data, len);
			if (error) {
				rump_hyperfree(bp->b_data, len);
				bp->b_data = bp->b_saveaddr;
				bp->b_saveaddr = 0;
			}
		}
	}

	return error;
}

void
vunmapbuf(struct buf *bp, vsize_t len)
{

	/* remote case */
	if (!RUMP_LOCALPROC_P(bp->b_proc)) {
		if (BUF_ISREAD(bp)) {
			bp->b_error = copyout_proc(bp->b_proc,
			    bp->b_data, bp->b_saveaddr, len);
		}
		rump_hyperfree(bp->b_data, len);
	}

	bp->b_data = bp->b_saveaddr;
	bp->b_saveaddr = 0;
}

void
uvmspace_addref(struct vmspace *vm)
{

	/*
	 * No dynamically allocated vmspaces exist.
	 */
}

void
uvmspace_free(struct vmspace *vm)
{

	/* nothing for now */
}

/*
 * page life cycle stuff.  it really doesn't exist, so just stubs.
 */

void
uvm_pageactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedeactivate(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pagedequeue(struct vm_page *pg)
{

	/* nada */
}

void
uvm_pageenqueue(struct vm_page *pg)
{

	/* nada */
}

void
uvmpdpol_anfree(struct vm_anon *an)
{

	/* nada */
}

/*
 * Physical address accessors.
 */

struct vm_page *
uvm_phys_to_vm_page(paddr_t pa)
{

	return NULL;
}

paddr_t
uvm_vm_page_to_phys(const struct vm_page *pg)
{

	return 0;
}

vaddr_t
uvm_uarea_alloc(void)
{

	/* non-zero */
	return (vaddr_t)11;
}

void
uvm_uarea_free(vaddr_t uarea)
{

	/* nata, so creamy */
}

/*
 * Routines related to the Page Baroness.
 */

void
uvm_wait(const char *msg)
{

	if (__predict_false(rump_threads == 0))
		panic("pagedaemon missing (RUMP_THREADS = 0)");

	if (curlwp == uvm.pagedaemon_lwp) {
		/* is it possible for us to later get memory? */
		if (!uvmexp.paging)
			panic("pagedaemon out of memory");
	}

	mutex_enter(&pdaemonmtx);
	pdaemon_waiters++;
	cv_signal(&pdaemoncv);
	cv_wait(&oomwait, &pdaemonmtx);
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_start(int npages)
{

	mutex_enter(&pdaemonmtx);
	uvmexp.paging += npages;
	mutex_exit(&pdaemonmtx);
}

void
uvm_pageout_done(int npages)
{

	if (!npages)
		return;

	mutex_enter(&pdaemonmtx);
	KASSERT(uvmexp.paging >= npages);
	uvmexp.paging -= npages;

	if (pdaemon_waiters) {
		pdaemon_waiters = 0;
		cv_broadcast(&oomwait);
	}
	mutex_exit(&pdaemonmtx);
}
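
/*
 * Try to flush and free a single page via its object's pgo_put.
 * If the object lock is contended, check whether the current lock
 * owner is running on some CPU and report that through *lockrunning
 * so the caller can decide whether yielding the host CPU makes sense.
 * Returns true if the page was handed to the pager, in which case
 * uvm_pageqlock has been dropped.
 */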
static bool
processpage(struct vm_page *pg, bool *lockrunning)
{
	struct uvm_object *uobj;

	uobj = pg->uobject;
	if (mutex_tryenter(uobj->vmobjlock)) {
		if ((pg->flags & PG_BUSY) == 0) {
			mutex_exit(&uvm_pageqlock);
			uobj->pgops->pgo_put(uobj, pg->offset,
			    pg->offset + PAGE_SIZE,
			    PGO_CLEANIT|PGO_FREE);
			KASSERT(!mutex_owned(uobj->vmobjlock));
			return true;
		} else {
			mutex_exit(uobj->vmobjlock);
		}
	} else if (*lockrunning == false && ncpu > 1) {
		CPU_INFO_ITERATOR cii;
		struct cpu_info *ci;
		struct lwp *l;

		l = mutex_owner(uobj->vmobjlock);
		for (CPU_INFO_FOREACH(cii, ci)) {
			if (ci->ci_curlwp == l) {
				*lockrunning = true;
				break;
			}
		}
	}

	return false;
}

/*
 * The Diabolical pageDaemon Director (DDD).
 *
 * This routine can always use better heuristics.
 */
void
uvm_pageout(void *arg)
{
	struct vm_page *pg;
	struct pool *pp, *pp_first;
	int cleaned, skip, skipped;
	bool succ;
	bool lockrunning;

	mutex_enter(&pdaemonmtx);
	for (;;) {
		if (!NEED_PAGEDAEMON()) {
			kernel_map->flags &= ~VM_MAP_WANTVA;
		}

		if (pdaemon_waiters) {
			pdaemon_waiters = 0;
			cv_broadcast(&oomwait);
		}

		cv_wait(&pdaemoncv, &pdaemonmtx);
		uvmexp.pdwoke++;

		/* tell the world that we are hungry */
		kernel_map->flags |= VM_MAP_WANTVA;
		mutex_exit(&pdaemonmtx);

		/*
		 * step one: reclaim the page cache.  this should give
		 * us the biggest earnings since whole pages are released
		 * into backing memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * Ok, so that didn't help.  Next, try to hunt memory
		 * by pushing out vnode pages.  The pages might contain
		 * useful cached data, but we need the memory.
		 */
		cleaned = 0;
		skip = 0;
		lockrunning = false;
 again:
		mutex_enter(&uvm_pageqlock);
		while (cleaned < PAGEDAEMON_OBJCHUNK) {
			skipped = 0;
			TAILQ_FOREACH(pg, &vmpage_lruqueue, pageq.queue) {

				/*
				 * skip over pages we _might_ have tried
				 * to handle earlier.  they might not be
				 * exactly the same ones, but I'm not too
				 * concerned.
				 */
				while (skipped++ < skip)
					continue;

				if (processpage(pg, &lockrunning)) {
					cleaned++;
					goto again;
				}

				skip++;
			}
			break;
		}
		mutex_exit(&uvm_pageqlock);

		/*
		 * Ok, someone is running with an object lock held.
		 * We want to yield the host CPU to make sure the
		 * thread is not parked on the host.  Since sched_yield()
		 * doesn't appear to do anything on NetBSD, nanosleep
		 * for the smallest possible time and hope we're back in
		 * the game soon.
		 */
		if (cleaned == 0 && lockrunning) {
			rumpuser_clock_sleep(RUMPUSER_CLOCK_RELWALL, 0, 1);

			lockrunning = false;
			skip = 0;

			/* and here we go again */
			goto again;
		}

		/*
		 * And of course we need to reclaim the page cache
		 * again to actually release memory.
		 */
		pool_cache_reclaim(&pagecache);
		if (!NEED_PAGEDAEMON()) {
			mutex_enter(&pdaemonmtx);
			continue;
		}

		/*
		 * And then drain the pools.  Wipe them out ... all of them.
		 */
		for (pp_first = NULL;;) {
			rump_vfs_drainbufs(10 /* XXX: estimate! */);

			succ = pool_drain(&pp);
			if (succ || pp == pp_first)
				break;

			if (pp_first == NULL)
				pp_first = pp;
		}

		/*
		 * Need to use PYEC on our bag of tricks.
		 * Unfortunately, the wife just borrowed it.
		 */

		mutex_enter(&pdaemonmtx);
		if (!succ && cleaned == 0 && pdaemon_waiters &&
		    uvmexp.paging == 0) {
			rumpuser_dprintf("pagedaemoness: failed to reclaim "
			    "memory ... sleeping (deadlock?)\n");
			cv_timedwait(&pdaemoncv, &pdaemonmtx, hz);
		}
	}

	panic("you can swap out any time you like, but you can never leave");
}

void
uvm_kick_pdaemon()
{

	/*
	 * Wake up the diabolical pagedaemon director if we are over
	 * 90% of the memory limit.  This is a complete and utter
	 * stetson-harrison decision which you are allowed to finetune.
	 * Don't bother locking.  If we have some unflushed caches,
	 * other waker-uppers will deal with the issue.
	 */
	if (NEED_PAGEDAEMON()) {
		cv_signal(&pdaemoncv);
	}
}
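
/*
 * Allocate memory from the host, first charging it against the
 * configured memory limit.  The pagedaemon is allowed to allocate up
 * to pdlimit, i.e. it may dip into the 2*MAXPHYS reserve set aside in
 * uvm_init(), so that it can make progress while everyone else is
 * stuck in uvm_wait().
 */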
void *
rump_hypermalloc(size_t howmuch, int alignment, bool waitok, const char *wmsg)
{
	const unsigned long thelimit =
	    curlwp == uvm.pagedaemon_lwp ? pdlimit : rump_physmemlimit;
	unsigned long newmem;
	void *rv;
	int error;

	uvm_kick_pdaemon(); /* ouch */

	/* first we must be within the limit */
 limitagain:
	if (thelimit != RUMPMEM_UNLIMITED) {
		newmem = atomic_add_long_nv(&curphysmem, howmuch);
		if (newmem > thelimit) {
			newmem = atomic_add_long_nv(&curphysmem, -howmuch);
			if (!waitok) {
				return NULL;
			}
			uvm_wait(wmsg);
			goto limitagain;
		}
	}

	/* second, we must get something from the backend */
 again:
	error = rumpuser_malloc(howmuch, alignment, &rv);
	if (__predict_false(error && waitok)) {
		uvm_wait(wmsg);
		goto again;
	}

	return rv;
}

void
rump_hyperfree(void *what, size_t size)
{

	if (rump_physmemlimit != RUMPMEM_UNLIMITED) {
		atomic_add_long(&curphysmem, -size);
	}
	rumpuser_free(what, size);
}