/*
 * (MPSAFE)
 *
 * Copyright (c) 1997, 1998 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 *
 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
 * $DragonFly: src/sys/vm/vm_zone.c,v 1.28 2008/01/23 17:35:48 nth Exp $
 */

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/spinlock2.h>

static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header");

#define ZONE_ERROR_INVALID	0
#define ZONE_ERROR_NOTFREE	1
#define ZONE_ERROR_ALREADYFREE	2

#define ZONE_ROUNDING	32

#define ZENTRY_FREE	0x12342378

static void *zget(vm_zone_t z);

/*
 * Return an item from the specified zone.  This function is non-blocking for
 * ZONE_INTERRUPT zones.
 *
 * No requirements.
 */
void *
zalloc(vm_zone_t z)
{
	globaldata_t gd = mycpu;
	void *item;

#ifdef INVARIANTS
	if (z == NULL)
		zerror(ZONE_ERROR_INVALID);
#endif
	/*
	 * Avoid spinlock contention by allocating from a per-cpu queue
	 */
	if (z->zfreecnt_pcpu[gd->gd_cpuid] > 0) {
		crit_enter_gd(gd);
		if (z->zfreecnt_pcpu[gd->gd_cpuid] > 0) {
			item = z->zitems_pcpu[gd->gd_cpuid];
#ifdef INVARIANTS
			KASSERT(item != NULL,
				("zitems_pcpu unexpectedly NULL"));
			if (((void **)item)[1] != (void *)ZENTRY_FREE)
				zerror(ZONE_ERROR_NOTFREE);
			((void **)item)[1] = 0;
#endif
			z->zitems_pcpu[gd->gd_cpuid] = ((void **)item)[0];
			--z->zfreecnt_pcpu[gd->gd_cpuid];
			z->znalloc++;
			crit_exit_gd(gd);
			return item;
		}
		crit_exit_gd(gd);
	}

	/*
	 * Per-zone spinlock for the remainder.
	 */
	spin_lock(&z->zlock);
	if (z->zfreecnt > z->zfreemin) {
		item = z->zitems;
#ifdef INVARIANTS
		KASSERT(item != NULL, ("zitems unexpectedly NULL"));
		if (((void **)item)[1] != (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **)item)[1] = 0;
#endif
		z->zitems = ((void **)item)[0];
		z->zfreecnt--;
		z->znalloc++;
		spin_unlock(&z->zlock);
	} else {
		spin_unlock(&z->zlock);
		item = zget(z);
		/*
		 * PANICFAIL allows the caller to assume that the zalloc()
		 * will always succeed.  If it doesn't, we panic here.
		 */
		if (item == NULL && (z->zflags & ZONE_PANICFAIL))
			panic("zalloc(%s) failed", z->zname);
	}
	return item;
}
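
/*
 * Illustrative sketch (not part of the original code): how a free item is
 * encoded on a zone freelist, which is what the INVARIANTS checks in
 * zalloc()/zfree() above rely on.  The allocator reuses the first two
 * pointer-sized words of each free item; the name next_free_item below is
 * hypothetical.
 *
 *	void **slot = (void **)item;
 *	slot[0] = next_free_item;	  <- singly-linked freelist pointer
 *	slot[1] = (void *)ZENTRY_FREE;	  <- INVARIANTS-only "is free" magic
 *
 * This is why callers must treat the first two longwords of an item as
 * scratch space between allocations (see the type-stability note further
 * down in this file).
 */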

/*
 * Free an item to the specified zone.
 *
 * No requirements.
 */
void
zfree(vm_zone_t z, void *item)
{
	globaldata_t gd = mycpu;
	int zmax;

	/*
	 * Avoid spinlock contention by freeing into a per-cpu queue
	 */
	if ((zmax = z->zmax) != 0)
		zmax = zmax / ncpus / 16;
	if (zmax < 64)
		zmax = 64;

	if (z->zfreecnt_pcpu[gd->gd_cpuid] < zmax) {
		crit_enter_gd(gd);
		((void **)item)[0] = z->zitems_pcpu[gd->gd_cpuid];
#ifdef INVARIANTS
		if (((void **)item)[1] == (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_ALREADYFREE);
		((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
		z->zitems_pcpu[gd->gd_cpuid] = item;
		++z->zfreecnt_pcpu[gd->gd_cpuid];
		crit_exit_gd(gd);
		return;
	}

	/*
	 * Per-zone spinlock for the remainder.
	 */
	spin_lock(&z->zlock);
	((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
	if (((void **)item)[1] == (void *)ZENTRY_FREE)
		zerror(ZONE_ERROR_ALREADYFREE);
	((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
	z->zitems = item;
	z->zfreecnt++;
	spin_unlock(&z->zlock);
}

/*
 * This file implements a very simple zone allocator.  It is used in lieu
 * of the malloc allocator where that is needed or more optimal.
 *
 * Note that the initial implementation included page coloring, which
 * yielded absolutely no improvement (in fact, a performance degradation),
 * so it was removed.
 *
 * Note also that the zones are type stable.  The only restriction is that
 * the first two longwords of a data structure may be modified by the
 * allocator between allocations.  Any data that must remain stable between
 * allocations must reside after the first two longwords.
 *
 * zinitna, zinit and zbootinit are the initialization routines.
 * zalloc and zfree are the allocation/free routines.
 */

LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist);
static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
static int zone_kmem_pages, zone_kern_pages, zone_kmem_kvaspace;
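
/*
 * Illustrative usage sketch (not part of the original code): a typical
 * consumer creates a zone once at initialization time and then allocates
 * and frees fixed-size items from it.  The names foo_zone, struct foo and
 * foo_init are hypothetical.
 *
 *	static vm_zone_t foo_zone;
 *
 *	static void
 *	foo_init(void)
 *	{
 *		foo_zone = zinit("FOO", sizeof(struct foo), 0,
 *				 ZONE_PANICFAIL, 1);
 *	}
 *
 *	struct foo *fp = zalloc(foo_zone);
 *	...
 *	zfree(foo_zone, fp);
 */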

/*
 * Create a zone, but don't allocate the zone structure.  If the
 * zone had been previously created by the zone boot code, initialize
 * various parts of the zone code.
 *
 * If waits are not allowed during allocation (e.g. during interrupt
 * code), allocate the kernel virtual space up front and allocate
 * pages only as they are needed.
 *
 * Arguments:
 *	z		pointer to zone structure.
 *	obj		pointer to VM object (opt).
 *	name		name of zone.
 *	size		size of zone entries.
 *	nentries	number of zone entries allocated (ZONE_INTERRUPT only).
 *	flags		ZONE_INTERRUPT -- items can be allocated at interrupt time.
 *	zalloc		number of pages allocated when memory is needed.
 *
 * Note that when using ZONE_INTERRUPT, the size of the zone is limited
 * by the nentries argument.  The amount of allocatable memory is
 * unlimited if ZONE_INTERRUPT is not set.
 *
 * No requirements.
 */
int
zinitna(vm_zone_t z, vm_object_t obj, char *name, int size,
	int nentries, int flags, int zalloc)
{
	int totsize;

	/*
	 * Only zones created with zinit() are destroyable.
	 */
	if (z->zflags & ZONE_DESTROYABLE)
		panic("zinitna: can't create destroyable zone");

	/*
	 * NOTE: We can only adjust zsize if we previously did not
	 *	 use zbootinit().
	 */
	if ((z->zflags & ZONE_BOOT) == 0) {
		z->zsize = (size + ZONE_ROUNDING - 1) & ~(ZONE_ROUNDING - 1);
		spin_init(&z->zlock);
		z->zfreecnt = 0;
		z->ztotal = 0;
		z->zmax = 0;
		z->zname = name;
		z->znalloc = 0;
		z->zitems = NULL;

		lwkt_gettoken(&vm_token);
		LIST_INSERT_HEAD(&zlist, z, zlink);
		lwkt_reltoken(&vm_token);

		bzero(z->zitems_pcpu, sizeof(z->zitems_pcpu));
		bzero(z->zfreecnt_pcpu, sizeof(z->zfreecnt_pcpu));
	}

	z->zkmvec = NULL;
	z->zkmcur = z->zkmmax = 0;
	z->zflags |= flags;

	/*
	 * If we cannot wait, allocate KVA space up front, and we will fill
	 * in pages as needed.  This is particularly required when creating
	 * an allocation space for map entries in kernel_map, because we
	 * do not want to go into a recursion deadlock with
	 * vm_map_entry_reserve().
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		totsize = round_page(z->zsize * nentries);
		zone_kmem_kvaspace += totsize;

		z->zkva = kmem_alloc_pageable(&kernel_map, totsize);
		if (z->zkva == 0) {
			LIST_REMOVE(z, zlink);
			return 0;
		}

		z->zpagemax = totsize / PAGE_SIZE;
		if (obj == NULL) {
			z->zobj = vm_object_allocate(OBJT_DEFAULT, z->zpagemax);
		} else {
			z->zobj = obj;
			_vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj);
		}
		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
		z->zmax += nentries;
	} else {
		z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
		z->zmax = 0;
	}

	if (z->zsize > PAGE_SIZE)
		z->zfreemin = 1;
	else
		z->zfreemin = PAGE_SIZE / z->zsize;

	z->zpagecount = 0;
	if (zalloc)
		z->zalloc = zalloc;
	else
		z->zalloc = 1;

	/*
	 * Populate the interrupt zone at creation time rather than
	 * on first allocation, as this is a potentially long operation.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		void *buf;

		buf = zget(z);
		zfree(z, buf);
	}

	return 1;
}

/*
 * Same as zinitna(), except the zone data structure is allocated
 * automatically by kmalloc().  This routine should normally be used, except
 * in certain tricky startup conditions in the VM system -- then
 * zbootinit and zinitna can be used.  Zinit is the standard zone
 * initialization call.
 *
 * No requirements.
 */
vm_zone_t
zinit(char *name, int size, int nentries, int flags, int zalloc)
{
	vm_zone_t z;

	z = (vm_zone_t)kmalloc(sizeof(struct vm_zone), M_ZONE, M_NOWAIT);
	if (z == NULL)
		return NULL;

	z->zflags = 0;
	if (zinitna(z, NULL, name, size, nentries,
		    flags & ~ZONE_DESTROYABLE, zalloc) == 0) {
		kfree(z, M_ZONE);
		return NULL;
	}

	if (flags & ZONE_DESTROYABLE)
		z->zflags |= ZONE_DESTROYABLE;

	return z;
}
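
/*
 * Illustrative sketch (not part of the original code): creating an
 * interrupt-safe zone with zinitna().  ZONE_INTERRUPT zones pre-reserve
 * their KVA up front and are capped at nentries items.  The zone structure,
 * VM object and item count shown here (bar_zone_store, bar_obj_store,
 * NBAR_ENTRIES) are hypothetical, statically-allocated instances whose
 * zeroed zflags satisfy zinitna()'s ZONE_DESTROYABLE check.
 *
 *	static struct vm_zone	bar_zone_store;
 *	static struct vm_object	bar_obj_store;
 *
 *	zinitna(&bar_zone_store, &bar_obj_store, "BAR",
 *		sizeof(struct bar), NBAR_ENTRIES, ZONE_INTERRUPT, 1);
 */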

/*
 * Initialize a zone before the system is fully up.  This routine should
 * only be called before full VM startup.
 *
 * Called from the low level boot code only.
 */
void
zbootinit(vm_zone_t z, char *name, int size, void *item, int nitems)
{
	int i;

	bzero(z->zitems_pcpu, sizeof(z->zitems_pcpu));
	bzero(z->zfreecnt_pcpu, sizeof(z->zfreecnt_pcpu));

	z->zname = name;
	z->zsize = size;
	z->zpagemax = 0;
	z->zobj = NULL;
	z->zflags = ZONE_BOOT;
	z->zfreemin = 0;
	z->zallocflag = 0;
	z->zpagecount = 0;
	z->zalloc = 0;
	z->znalloc = 0;
	spin_init(&z->zlock);

	bzero(item, nitems * z->zsize);
	z->zitems = NULL;
	for (i = 0; i < nitems; i++) {
		((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
		((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
		z->zitems = item;
		item = (uint8_t *)item + z->zsize;
	}
	z->zfreecnt = nitems;
	z->zmax = nitems;
	z->ztotal = nitems;

	lwkt_gettoken(&vm_token);
	LIST_INSERT_HEAD(&zlist, z, zlink);
	lwkt_reltoken(&vm_token);
}

/*
 * Release all resources owned by a zone created with zinit().
 *
 * No requirements.
 */
void
zdestroy(vm_zone_t z)
{
	int i;

	if (z == NULL)
		panic("zdestroy: null zone");
	if ((z->zflags & ZONE_DESTROYABLE) == 0)
		panic("zdestroy: undestroyable zone");

	lwkt_gettoken(&vm_token);
	LIST_REMOVE(z, zlink);
	lwkt_reltoken(&vm_token);

	/*
	 * Release virtual mappings, physical memory, and update sysctl stats.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Pages mapped via pmap_kenter() must be removed from the
		 * kernel_pmap before calling kmem_free() to avoid issues
		 * with kernel_pmap.pm_stats.resident_count.
		 */
		pmap_qremove(z->zkva, z->zpagemax);

		/*
		 * Free the mapping.
		 */
		kmem_free(&kernel_map, z->zkva, z->zpagemax * PAGE_SIZE);
		atomic_subtract_int(&zone_kmem_kvaspace, z->zpagemax * PAGE_SIZE);

		/*
		 * Free the backing object and physical pages.
		 */
		vm_object_deallocate(z->zobj);
		atomic_subtract_int(&zone_kmem_pages, z->zpagecount);
	} else {
		for (i = 0; i < z->zkmcur; i++) {
			kmem_free(&kernel_map, z->zkmvec[i],
				  z->zalloc * PAGE_SIZE);
			atomic_subtract_int(&zone_kern_pages, z->zalloc);
		}
		if (z->zkmvec != NULL)
			kfree(z->zkmvec, M_ZONE);
	}

	spin_uninit(&z->zlock);
	kfree(z, M_ZONE);
}

/*
 * void *zalloc(vm_zone_t zone) --
 *	Returns an item from a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 *
 * void zfree(vm_zone_t zone, void *item) --
 *	Frees an item back to a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 */
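
/*
 * Illustrative sketch (not part of the original code): early-boot zones are
 * seeded with a statically reserved array of items via zbootinit(), before
 * the VM system can back a zone with pages.  The names baz_zone_store,
 * baz_boot_items and NBAZ_BOOT are hypothetical.
 *
 *	static struct vm_zone	baz_zone_store;
 *	static struct baz	baz_boot_items[NBAZ_BOOT];
 *
 *	zbootinit(&baz_zone_store, "BAZ", sizeof(struct baz),
 *		  baz_boot_items, NBAZ_BOOT);
 *
 * Such a zone can later be passed to zinitna(), which leaves zsize alone
 * because ZONE_BOOT is set.
 */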

/*
 * Internal zone routine.  Not to be called from external (non vm_zone) code.
 *
 * No requirements.
 */
static void *
zget(vm_zone_t z)
{
	int i;
	vm_page_t m;
	int nitems, nbytes;
	int savezpc;
	void *item;

	if (z == NULL)
		panic("zget: null zone");

	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Interrupt zones do not mess with the kernel_map, they
		 * simply populate an existing mapping.
		 */
		vm_object_hold(z->zobj);
		savezpc = z->zpagecount;
		nbytes = z->zpagecount * PAGE_SIZE;
		nbytes -= nbytes % z->zsize;
		item = (char *)z->zkva + nbytes;
		for (i = 0; ((i < z->zalloc) && (z->zpagecount < z->zpagemax));
		     i++) {
			vm_offset_t zkva;

			m = vm_page_alloc(z->zobj, z->zpagecount,
					  z->zallocflag);
			/* note: z might be modified due to blocking */
			if (m == NULL)
				break;

			/*
			 * Unbusy the page so it can be freed in zdestroy().
			 * Make sure it is not on any queue so it cannot be
			 * recycled out from under us.
			 */
			KKASSERT(m->queue == PQ_NONE);
			vm_page_flag_clear(m, PG_BUSY);

			zkva = z->zkva + z->zpagecount * PAGE_SIZE;
			pmap_kenter(zkva, VM_PAGE_TO_PHYS(m));	/* YYY */
			bzero((void *)zkva, PAGE_SIZE);
			KKASSERT(savezpc == z->zpagecount);
			++savezpc;
			z->zpagecount++;
			zone_kmem_pages++;
			vmstats.v_wire_count++;
		}
		nitems = ((z->zpagecount * PAGE_SIZE) - nbytes) / z->zsize;
		vm_object_drop(z->zobj);
	} else if (z->zflags & ZONE_SPECIAL) {
		/*
		 * The special zone is the one used for vm_map_entry_t's.
		 * We have to avoid an infinite recursion in
		 * vm_map_entry_reserve() by using vm_map_entry_kreserve()
		 * instead.  The map entries are pre-reserved by the kernel
		 * by vm_map_entry_reserve_cpu_init().
		 */
		nbytes = z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes, KM_KRESERVE);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			zone_kern_pages += z->zalloc;	/* not MP-safe XXX */
			bzero(item, nbytes);
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	} else {
		/*
		 * Otherwise allocate KVA from the kernel_map.
		 */
		nbytes = z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes, 0);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			zone_kern_pages += z->zalloc;	/* not MP-safe XXX */
			bzero(item, nbytes);

			if (z->zflags & ZONE_DESTROYABLE) {
				if (z->zkmcur == z->zkmmax) {
					z->zkmmax =
						z->zkmmax == 0 ? 1 : z->zkmmax * 2;
					z->zkmvec = krealloc(z->zkmvec,
					    z->zkmmax * sizeof(z->zkmvec[0]),
					    M_ZONE, M_WAITOK);
				}
				z->zkmvec[z->zkmcur++] = (vm_offset_t)item;
			}
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	}

	spin_lock(&z->zlock);
	z->ztotal += nitems;
	/*
	 * Save one for immediate allocation
	 */
	if (nitems != 0) {
		nitems -= 1;
		for (i = 0; i < nitems; i++) {
			((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
			((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
			z->zitems = item;
			item = (uint8_t *)item + z->zsize;
		}
		z->zfreecnt += nitems;
		z->znalloc++;
	} else if (z->zfreecnt > 0) {
		item = z->zitems;
		z->zitems = ((void **)item)[0];
#ifdef INVARIANTS
		if (((void **)item)[1] != (void *)ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **)item)[1] = 0;
#endif
		z->zfreecnt--;
		z->znalloc++;
	} else {
		item = NULL;
	}
	spin_unlock(&z->zlock);

	/*
	 * A special zone may have used a kernel-reserved vm_map_entry.  If
	 * so we have to be sure to recover our reserve so we don't run out.
	 * We will panic if we run out.
	 */
	if (z->zflags & ZONE_SPECIAL)
		vm_map_entry_reserve(0);

	return item;
}
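
/*
 * Illustrative arithmetic (not part of the original code): for the plain
 * kernel_map path of zget() above, with a hypothetical PAGE_SIZE of 4096,
 * zsize of 128 and zalloc of 1, one pass allocates a single page and carves
 * it into 4096 / 128 = 32 items.  One item is kept for the caller and the
 * remaining 31 are chained onto z->zitems, so zfreecnt grows by 31 while
 * ztotal grows by 32.
 */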

/*
 * No requirements.
 */
static int
sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	vm_zone_t curzone;
	char tmpbuf[128];
	char tmpname[14];

	ksnprintf(tmpbuf, sizeof(tmpbuf),
	    "\nITEM            SIZE     LIMIT    USED    FREE  REQUESTS\n");
	error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf));
	if (error)
		return (error);

	lwkt_gettoken(&vm_token);
	LIST_FOREACH(curzone, &zlist, zlink) {
		int i;
		int len;
		int offset;

		len = strlen(curzone->zname);
		if (len >= (sizeof(tmpname) - 1))
			len = (sizeof(tmpname) - 1);
		for (i = 0; i < sizeof(tmpname) - 1; i++)
			tmpname[i] = ' ';
		tmpname[i] = 0;
		memcpy(tmpname, curzone->zname, len);
		tmpname[len] = ':';
		offset = 0;
		if (curzone == LIST_FIRST(&zlist)) {
			offset = 1;
			tmpbuf[0] = '\n';
		}

		ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset,
			"%s %6.6u, %8.8u, %6.6u, %6.6u, %8.8u\n",
			tmpname, curzone->zsize, curzone->zmax,
			(curzone->ztotal - curzone->zfreecnt),
			curzone->zfreecnt, curzone->znalloc);

		len = strlen((char *)tmpbuf);
		if (LIST_NEXT(curzone, zlink) == NULL)
			tmpbuf[len - 1] = 0;

		error = SYSCTL_OUT(req, tmpbuf, len);

		if (error)
			break;
	}
	lwkt_reltoken(&vm_token);
	return (error);
}

#if defined(INVARIANTS)

/*
 * Debugging only.
 */
void
zerror(int error)
{
	char *msg;

	switch (error) {
	case ZONE_ERROR_INVALID:
		msg = "zone: invalid zone";
		break;
	case ZONE_ERROR_NOTFREE:
		msg = "zone: entry not free";
		break;
	case ZONE_ERROR_ALREADYFREE:
		msg = "zone: freeing free entry";
		break;
	default:
		msg = "zone: invalid error";
		break;
	}
	panic(msg);
}
#endif

SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING|CTLFLAG_RD, \
	NULL, 0, sysctl_vm_zone, "A", "Zone Info");

SYSCTL_INT(_vm, OID_AUTO, zone_kmem_pages,
	CTLFLAG_RD, &zone_kmem_pages, 0, "Number of interrupt safe pages allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kmem_kvaspace,
	CTLFLAG_RD, &zone_kmem_kvaspace, 0, "KVA space allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kern_pages,
	CTLFLAG_RD, &zone_kern_pages, 0, "Number of non-interrupt safe pages allocated by zone");
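
/*
 * Illustrative usage (not part of the original code): the statistics
 * exported by the sysctl declarations above can be inspected from
 * userland, e.g.:
 *
 *	sysctl vm.zone
 *	sysctl vm.zone_kmem_pages vm.zone_kmem_kvaspace vm.zone_kern_pages
 */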