/*
 * (MPSAFE)
 *
 * Copyright (c) 1997, 1998 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice immediately at the beginning of the file, without modification,
 *    this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *    John S. Dyson.
 *
 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
 * $DragonFly: src/sys/vm/vm_zone.c,v 1.28 2008/01/23 17:35:48 nth Exp $
 */

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/spinlock2.h>

static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header");

#define ZONE_ERROR_INVALID      0
#define ZONE_ERROR_NOTFREE      1
#define ZONE_ERROR_ALREADYFREE  2

#define ZONE_ROUNDING   32

#define ZENTRY_FREE     0x12342378

static void *zget(vm_zone_t z);

/*
 * Return an item from the specified zone.  This function is non-blocking
 * for ZONE_INTERRUPT zones.
 *
 * No requirements.
 */
void *
zalloc(vm_zone_t z)
{
        void *item;

#ifdef INVARIANTS
        if (z == NULL)
                zerror(ZONE_ERROR_INVALID);
#endif
        spin_lock(&z->zlock);
        if (z->zfreecnt > z->zfreemin) {
                item = z->zitems;
#ifdef INVARIANTS
                KASSERT(item != NULL, ("zitems unexpectedly NULL"));
                if (((void **)item)[1] != (void *)ZENTRY_FREE)
                        zerror(ZONE_ERROR_NOTFREE);
                ((void **)item)[1] = 0;
#endif
                z->zitems = ((void **)item)[0];
                z->zfreecnt--;
                z->znalloc++;
                spin_unlock(&z->zlock);
        } else {
                spin_unlock(&z->zlock);
                item = zget(z);
                /*
                 * PANICFAIL allows the caller to assume that the zalloc()
                 * will always succeed.  If it doesn't, we panic here.
                 */
                if (item == NULL && (z->zflags & ZONE_PANICFAIL))
                        panic("zalloc(%s) failed", z->zname);
        }
        return item;
}

/*
 * Free an item to the specified zone.
 *
 * No requirements.
 */
void
zfree(vm_zone_t z, void *item)
{
        spin_lock(&z->zlock);
        ((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
        if (((void **)item)[1] == (void *)ZENTRY_FREE)
                zerror(ZONE_ERROR_ALREADYFREE);
        ((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
        z->zitems = item;
        z->zfreecnt++;
        spin_unlock(&z->zlock);
}

/*
 * This file comprises a very simple zone allocator.  It is used
 * in lieu of the malloc allocator where that is needed or more optimal.
 *
 * Note that the initial implementation of this had coloring, and
 * absolutely no improvement (actually a performance degradation)
 * occurred.
 *
 * Note also that the zones are type stable.  The only restriction is
 * that the first two longwords of a data structure can be changed
 * between allocations.  Any data that must be stable between allocations
 * must reside after the first two longwords.
 *
 * zinitna, zinit and zbootinit are the initialization routines.
 * zalloc and zfree are the allocation/free routines.
 */
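
/*
 * Illustrative usage sketch (hypothetical names; not part of this file).
 * A subsystem creates its zone once and then allocates and frees items
 * from it:
 *
 *      struct mydata {
 *              void    *md_reserved[2];        (clobbered while free)
 *              int     md_value;
 *      };
 *      static vm_zone_t mydata_zone;
 *
 *      mydata_zone = zinit("MYDATA", sizeof(struct mydata), 0, 0, 1);
 *
 *      struct mydata *md = zalloc(mydata_zone);
 *      ...
 *      zfree(mydata_zone, md);
 *
 * The md_reserved words illustrate the type-stability caveat above:
 * while an item sits on the freelist its first two pointer-sized words
 * hold the freelist link and (under INVARIANTS) the ZENTRY_FREE magic,
 * so only data after those words survives across allocations.
 */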

LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist);

static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
static int zone_kmem_pages, zone_kern_pages, zone_kmem_kvaspace;

/*
 * Create a zone, but don't allocate the zone structure.  If the
 * zone had been previously created by the zone boot code, initialize
 * various parts of the zone code.
 *
 * If waits are not allowed during allocation (e.g. during interrupt
 * code), pre-allocate the kernel virtual space and allocate only the
 * pages when needed.
 *
 * Arguments:
 *      z               pointer to zone structure.
 *      obj             pointer to VM object (opt).
 *      name            name of zone.
 *      size            size of zone entries.
 *      nentries        number of zone entries allocated (only ZONE_INTERRUPT.)
 *      flags           ZONE_INTERRUPT -- items can be allocated at
 *                      interrupt time.
 *      zalloc          number of pages allocated when memory is needed.
 *
 * Note that when using ZONE_INTERRUPT, the size of the zone is limited
 * by the nentries argument.  The amount of memory allocatable is
 * unlimited if ZONE_INTERRUPT is not set.
 *
 * No requirements.
 */
int
zinitna(vm_zone_t z, vm_object_t obj, char *name, int size,
        int nentries, int flags, int zalloc)
{
        int totsize;

        /*
         * Only zones created with zinit() are destroyable.
         */
        if (z->zflags & ZONE_DESTROYABLE)
                panic("zinitna: can't create destroyable zone");

        /*
         * NOTE: We can only adjust zsize if we previously did not
         *       use zbootinit().
         */
        if ((z->zflags & ZONE_BOOT) == 0) {
                z->zsize = (size + ZONE_ROUNDING - 1) & ~(ZONE_ROUNDING - 1);
                spin_init(&z->zlock);
                z->zfreecnt = 0;
                z->ztotal = 0;
                z->zmax = 0;
                z->zname = name;
                z->znalloc = 0;
                z->zitems = NULL;

                lwkt_gettoken(&vm_token);
                LIST_INSERT_HEAD(&zlist, z, zlink);
                lwkt_reltoken(&vm_token);
        }

        z->zkmvec = NULL;
        z->zkmcur = z->zkmmax = 0;
        z->zflags |= flags;

        /*
         * If we cannot wait, allocate KVA space up front, and we will fill
         * in pages as needed.  This is particularly required when creating
         * an allocation space for map entries in kernel_map, because we
         * do not want to go into a recursion deadlock with
         * vm_map_entry_reserve().
         */
        if (z->zflags & ZONE_INTERRUPT) {
                totsize = round_page(z->zsize * nentries);
                zone_kmem_kvaspace += totsize;

                z->zkva = kmem_alloc_pageable(&kernel_map, totsize);
                if (z->zkva == 0) {
                        LIST_REMOVE(z, zlink);
                        return 0;
                }

                z->zpagemax = totsize / PAGE_SIZE;
                if (obj == NULL) {
                        z->zobj = vm_object_allocate(OBJT_DEFAULT, z->zpagemax);
                } else {
                        z->zobj = obj;
                        _vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj);
                }
                z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
                z->zmax += nentries;
        } else {
                z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
                z->zmax = 0;
        }

        if (z->zsize > PAGE_SIZE)
                z->zfreemin = 1;
        else
                z->zfreemin = PAGE_SIZE / z->zsize;

        z->zpagecount = 0;
        if (zalloc)
                z->zalloc = zalloc;
        else
                z->zalloc = 1;

        /*
         * Populate the interrupt zone at creation time rather than
         * on first allocation, as this is a potentially long operation.
         */
        if (z->zflags & ZONE_INTERRUPT) {
                void *buf;

                buf = zget(z);
                zfree(z, buf);
        }

        return 1;
}
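
/*
 * Illustrative sketch of creating an interrupt-safe zone (hypothetical
 * names; not part of this file).  ZONE_INTERRUPT zones reserve all of
 * their KVA up front, so the caller typically provides a static zone
 * structure and calls zinitna() directly rather than having zinit()
 * kmalloc() one:
 *
 *      static struct vm_zone isr_zone_store;
 *
 *      isr_zone_store.zflags = 0;
 *      if (zinitna(&isr_zone_store, NULL, "ISREVENT",
 *                  sizeof(struct isr_event), 4096, ZONE_INTERRUPT, 1) == 0)
 *              panic("cannot create isr_event zone");
 *
 * The nentries argument (4096 here) bounds the zone: at most
 * round_page(zsize * nentries) bytes of KVA are reserved and the zone
 * can never hold more items than that, but allocations from it will
 * not block.
 */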

/*
 * The same as zinitna(), except the zone data structure is allocated
 * automatically by kmalloc().  This routine should normally be used,
 * except in certain tricky startup conditions in the VM system -- then
 * zbootinit and zinitna can be used.  zinit is the standard zone
 * initialization call.
 *
 * No requirements.
 */
vm_zone_t
zinit(char *name, int size, int nentries, int flags, int zalloc)
{
        vm_zone_t z;

        z = (vm_zone_t)kmalloc(sizeof(struct vm_zone), M_ZONE, M_NOWAIT);
        if (z == NULL)
                return NULL;

        z->zflags = 0;
        if (zinitna(z, NULL, name, size, nentries,
                    flags & ~ZONE_DESTROYABLE, zalloc) == 0) {
                kfree(z, M_ZONE);
                return NULL;
        }

        if (flags & ZONE_DESTROYABLE)
                z->zflags |= ZONE_DESTROYABLE;

        return z;
}

/*
 * Initialize a zone before the system is fully up.  This routine should
 * only be called before full VM startup.
 *
 * Called from the low level boot code only.
 */
void
zbootinit(vm_zone_t z, char *name, int size, void *item, int nitems)
{
        int i;

        z->zname = name;
        z->zsize = size;
        z->zpagemax = 0;
        z->zobj = NULL;
        z->zflags = ZONE_BOOT;
        z->zfreemin = 0;
        z->zallocflag = 0;
        z->zpagecount = 0;
        z->zalloc = 0;
        z->znalloc = 0;
        spin_init(&z->zlock);

        bzero(item, nitems * z->zsize);
        z->zitems = NULL;
        for (i = 0; i < nitems; i++) {
                ((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
                ((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
                z->zitems = item;
                item = (uint8_t *)item + z->zsize;
        }
        z->zfreecnt = nitems;
        z->zmax = nitems;
        z->ztotal = nitems;

        lwkt_gettoken(&vm_token);
        LIST_INSERT_HEAD(&zlist, z, zlink);
        lwkt_reltoken(&vm_token);
}

/*
 * Release all resources owned by a zone created with zinit().
 *
 * No requirements.
 */
void
zdestroy(vm_zone_t z)
{
        int i;

        if (z == NULL)
                panic("zdestroy: null zone");
        if ((z->zflags & ZONE_DESTROYABLE) == 0)
                panic("zdestroy: undestroyable zone");

        lwkt_gettoken(&vm_token);
        LIST_REMOVE(z, zlink);
        lwkt_reltoken(&vm_token);

        /*
         * Release virtual mappings, physical memory and update sysctl stats.
         */
        if (z->zflags & ZONE_INTERRUPT) {
                /*
                 * Pages mapped via pmap_kenter() must be removed from the
                 * kernel_pmap() before calling kmem_free() to avoid issues
                 * with kernel_pmap.pm_stats.resident_count.
                 */
                pmap_qremove(z->zkva, z->zpagemax);

                /*
                 * Free the mapping.
                 */
                kmem_free(&kernel_map, z->zkva, z->zpagemax * PAGE_SIZE);
                atomic_subtract_int(&zone_kmem_kvaspace,
                                    z->zpagemax * PAGE_SIZE);

                /*
                 * Free the backing object and physical pages.
                 */
                vm_object_deallocate(z->zobj);
                atomic_subtract_int(&zone_kmem_pages, z->zpagecount);
        } else {
                for (i = 0; i < z->zkmcur; i++) {
                        kmem_free(&kernel_map, z->zkmvec[i],
                                  z->zalloc * PAGE_SIZE);
                        atomic_subtract_int(&zone_kern_pages, z->zalloc);
                }
                if (z->zkmvec != NULL)
                        kfree(z->zkmvec, M_ZONE);
        }

        spin_uninit(&z->zlock);
        kfree(z, M_ZONE);
}
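
/*
 * Illustrative lifecycle of a destroyable zone (hypothetical names; not
 * part of this file).  Only zones created by zinit() with
 * ZONE_DESTROYABLE set may be torn down; the flag makes zget() record
 * each KVA allocation in zkmvec so zdestroy() can hand everything back:
 *
 *      vm_zone_t scratch_zone;
 *
 *      scratch_zone = zinit("SCRATCH", sizeof(struct scratch_item), 0,
 *                           ZONE_DESTROYABLE, 4);
 *      if (scratch_zone != NULL) {
 *              item = zalloc(scratch_zone);
 *              ...
 *              zfree(scratch_zone, item);
 *              zdestroy(scratch_zone);
 *      }
 */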

/*
 * void *zalloc(vm_zone_t zone) --
 *      Returns an item from a specified zone.  May not be called from a
 *      FAST interrupt or IPI function.
 *
 * void zfree(vm_zone_t zone, void *item) --
 *      Frees an item back to a specified zone.  May not be called from a
 *      FAST interrupt or IPI function.
 */

/*
 * Internal zone routine.  Not to be called from external (non vm_zone) code.
 *
 * No requirements.
 */
static void *
zget(vm_zone_t z)
{
        int i;
        vm_page_t m;
        int nitems, nbytes;
        int savezpc;
        void *item;

        if (z == NULL)
                panic("zget: null zone");

        if (z->zflags & ZONE_INTERRUPT) {
                /*
                 * Interrupt zones do not mess with the kernel_map, they
                 * simply populate an existing mapping.
                 */
                lwkt_gettoken(&vm_token);
                savezpc = z->zpagecount;
                nbytes = z->zpagecount * PAGE_SIZE;
                nbytes -= nbytes % z->zsize;
                item = (char *)z->zkva + nbytes;
                for (i = 0; ((i < z->zalloc) && (z->zpagecount < z->zpagemax));
                     i++) {
                        vm_offset_t zkva;

                        m = vm_page_alloc(z->zobj, z->zpagecount,
                                          z->zallocflag);
                        /* note: z might be modified due to blocking */
                        if (m == NULL)
                                break;

                        /*
                         * Unbusy the page so it can be freed in zdestroy().
                         * Make sure it is not on any queue and so cannot be
                         * recycled under our feet.
                         */
                        KKASSERT(m->queue == PQ_NONE);
                        vm_page_flag_clear(m, PG_BUSY);

                        zkva = z->zkva + z->zpagecount * PAGE_SIZE;
                        pmap_kenter(zkva, VM_PAGE_TO_PHYS(m));  /* YYY */
                        bzero((void *)zkva, PAGE_SIZE);
                        KKASSERT(savezpc == z->zpagecount);
                        ++savezpc;
                        z->zpagecount++;
                        zone_kmem_pages++;
                        vmstats.v_wire_count++;
                }
                nitems = ((z->zpagecount * PAGE_SIZE) - nbytes) / z->zsize;
                lwkt_reltoken(&vm_token);
        } else if (z->zflags & ZONE_SPECIAL) {
                /*
                 * The special zone is the one used for vm_map_entry_t's.
                 * We have to avoid an infinite recursion in
                 * vm_map_entry_reserve() by using vm_map_entry_kreserve()
                 * instead.  The map entries are pre-reserved by the kernel
                 * by vm_map_entry_reserve_cpu_init().
                 */
                nbytes = z->zalloc * PAGE_SIZE;

                item = (void *)kmem_alloc3(&kernel_map, nbytes, KM_KRESERVE);

                /* note: z might be modified due to blocking */
                if (item != NULL) {
                        zone_kern_pages += z->zalloc;   /* not MP-safe XXX */
                        bzero(item, nbytes);
                } else {
                        nbytes = 0;
                }
                nitems = nbytes / z->zsize;
        } else {
                /*
                 * Otherwise allocate KVA from the kernel_map.
                 */
                nbytes = z->zalloc * PAGE_SIZE;

                item = (void *)kmem_alloc3(&kernel_map, nbytes, 0);

                /* note: z might be modified due to blocking */
                if (item != NULL) {
                        zone_kern_pages += z->zalloc;   /* not MP-safe XXX */
                        bzero(item, nbytes);

                        if (z->zflags & ZONE_DESTROYABLE) {
                                if (z->zkmcur == z->zkmmax) {
                                        z->zkmmax =
                                            z->zkmmax == 0 ? 1 : z->zkmmax * 2;
                                        z->zkmvec = krealloc(z->zkmvec,
                                            z->zkmmax * sizeof(z->zkmvec[0]),
                                            M_ZONE, M_WAITOK);
                                }
                                z->zkmvec[z->zkmcur++] = (vm_offset_t)item;
                        }
                } else {
                        nbytes = 0;
                }
                nitems = nbytes / z->zsize;
        }

        spin_lock(&z->zlock);
        z->ztotal += nitems;
        /*
         * Save one for immediate allocation.
         */
        if (nitems != 0) {
                nitems -= 1;
                for (i = 0; i < nitems; i++) {
                        ((void **)item)[0] = z->zitems;
#ifdef INVARIANTS
                        ((void **)item)[1] = (void *)ZENTRY_FREE;
#endif
                        z->zitems = item;
                        item = (uint8_t *)item + z->zsize;
                }
                z->zfreecnt += nitems;
                z->znalloc++;
        } else if (z->zfreecnt > 0) {
                item = z->zitems;
                z->zitems = ((void **)item)[0];
#ifdef INVARIANTS
                if (((void **)item)[1] != (void *)ZENTRY_FREE)
                        zerror(ZONE_ERROR_NOTFREE);
                ((void **)item)[1] = 0;
#endif
                z->zfreecnt--;
                z->znalloc++;
        } else {
                item = NULL;
        }
        spin_unlock(&z->zlock);

        /*
         * A special zone may have used a kernel-reserved vm_map_entry.  If
         * so we have to be sure to recover our reserve so we don't run out.
         * We will panic if we run out.
         */
        if (z->zflags & ZONE_SPECIAL)
                vm_map_entry_reserve(0);

        return item;
}
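
/*
 * Worked example of the refill arithmetic above (illustrative numbers
 * only, assuming a 4 KB page).  For an ordinary zone with zsize = 128
 * (already a multiple of ZONE_ROUNDING) and zalloc = 1:
 *
 *      nbytes = 1 * PAGE_SIZE = 4096
 *      nitems = 4096 / 128    = 32
 *
 * One item is held back and returned directly to the caller, so 31
 * items are threaded onto the freelist and zfreecnt grows by 31.
 */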

/*
 * No requirements.
 */
static int
sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
{
        int error = 0;
        vm_zone_t curzone;
        char tmpbuf[128];
        char tmpname[14];

        ksnprintf(tmpbuf, sizeof(tmpbuf),
            "\nITEM            SIZE     LIMIT    USED    FREE  REQUESTS\n");
        error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf));
        if (error)
                return (error);

        lwkt_gettoken(&vm_token);
        LIST_FOREACH(curzone, &zlist, zlink) {
                int i;
                int len;
                int offset;

                len = strlen(curzone->zname);
                if (len >= (sizeof(tmpname) - 1))
                        len = (sizeof(tmpname) - 1);
                for (i = 0; i < sizeof(tmpname) - 1; i++)
                        tmpname[i] = ' ';
                tmpname[i] = 0;
                memcpy(tmpname, curzone->zname, len);
                tmpname[len] = ':';
                offset = 0;
                if (curzone == LIST_FIRST(&zlist)) {
                        offset = 1;
                        tmpbuf[0] = '\n';
                }

                ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset,
                    "%s %6.6u, %8.8u, %6.6u, %6.6u, %8.8u\n",
                    tmpname, curzone->zsize, curzone->zmax,
                    (curzone->ztotal - curzone->zfreecnt),
                    curzone->zfreecnt, curzone->znalloc);

                len = strlen((char *)tmpbuf);
                if (LIST_NEXT(curzone, zlink) == NULL)
                        tmpbuf[len - 1] = 0;

                error = SYSCTL_OUT(req, tmpbuf, len);

                if (error)
                        break;
        }
        lwkt_reltoken(&vm_token);
        return (error);
}
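
/*
 * Sample of what the resulting "vm.zone" sysctl output looks like
 * (illustrative values only, taken from no particular system):
 *
 *      ITEM            SIZE     LIMIT    USED    FREE  REQUESTS
 *      MAP ENTRY:    000064, 00000000, 002385, 001135, 00042518
 *      VM OBJECT:    000224, 00000000, 001756, 000926, 00033537
 *
 * Each zone prints one line: entry size in bytes, zmax (0 meaning
 * unlimited), items in use (ztotal - zfreecnt), items on the freelist,
 * and the cumulative allocation count (znalloc).
 */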

#if defined(INVARIANTS)

/*
 * Debugging only.
 */
void
zerror(int error)
{
        char *msg;

        switch (error) {
        case ZONE_ERROR_INVALID:
                msg = "zone: invalid zone";
                break;
        case ZONE_ERROR_NOTFREE:
                msg = "zone: entry not free";
                break;
        case ZONE_ERROR_ALREADYFREE:
                msg = "zone: freeing free entry";
                break;
        default:
                msg = "zone: invalid error";
                break;
        }
        panic("%s", msg);
}

#endif

SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING | CTLFLAG_RD,
           NULL, 0, sysctl_vm_zone, "A", "Zone Info");

SYSCTL_INT(_vm, OID_AUTO, zone_kmem_pages, CTLFLAG_RD, &zone_kmem_pages, 0,
           "Number of interrupt-safe pages allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kmem_kvaspace, CTLFLAG_RD,
           &zone_kmem_kvaspace, 0, "KVA space allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kern_pages, CTLFLAG_RD, &zone_kern_pages, 0,
           "Number of non-interrupt-safe pages allocated by zone");
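
/*
 * Illustrative userland consumer of the sysctls above (not part of this
 * file; compile as an ordinary userland C program):
 *
 *      #include <sys/sysctl.h>
 *      #include <stdio.h>
 *      #include <stdlib.h>
 *
 *      int
 *      main(void)
 *      {
 *              size_t len = 0;
 *              char *buf;
 *
 *              if (sysctlbyname("vm.zone", NULL, &len, NULL, 0) < 0)
 *                      return 1;
 *              buf = malloc(len + 1);
 *              if (buf == NULL ||
 *                  sysctlbyname("vm.zone", buf, &len, NULL, 0) < 0)
 *                      return 1;
 *              buf[len] = '\0';
 *              printf("%s\n", buf);
 *              return 0;
 *      }
 */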