/*
 * Copyright (c) 1997, 1998 John S. Dyson
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *	notice immediately at the beginning of the file, without modification,
 *	this list of conditions, and the following disclaimer.
 * 2. Absolutely no warranty of function or purpose is made by the author
 *	John S. Dyson.
 *
 * $FreeBSD: src/sys/vm/vm_zone.c,v 1.30.2.6 2002/10/10 19:50:16 dillon Exp $
 * $DragonFly: src/sys/vm/vm_zone.c,v 1.28 2008/01/23 17:35:48 nth Exp $
 */

#include <sys/param.h>
#include <sys/queue.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vmmeter.h>

#include <vm/vm.h>
#include <vm/vm_object.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_kern.h>
#include <vm/vm_extern.h>
#include <vm/vm_zone.h>

#include <sys/spinlock2.h>
#include <sys/mplock2.h>

static MALLOC_DEFINE(M_ZONE, "ZONE", "Zone header");

#define	ZONE_ERROR_INVALID 0
#define	ZONE_ERROR_NOTFREE 1
#define	ZONE_ERROR_ALREADYFREE 2

#define ZONE_ROUNDING	32
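
/*
 * Example of the rounding arithmetic applied in zinitna(): a 40-byte
 * entry rounds up to (40 + 31) & ~31 == 64 bytes, so every item
 * occupies a multiple of ZONE_ROUNDING bytes.
 */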

#define	ZENTRY_FREE	0x12342378

static void *zget(vm_zone_t z);

/*
 * Return an item from the specified zone.  This function is non-blocking for
 * ZONE_INTERRUPT zones.
 */
void *
zalloc(vm_zone_t z)
{
	void *item;

#ifdef INVARIANTS
	if (z == NULL)
		zerror(ZONE_ERROR_INVALID);
#endif
	spin_lock_wr(&z->zlock);
	if (z->zfreecnt > z->zfreemin) {
		item = z->zitems;
#ifdef INVARIANTS
		KASSERT(item != NULL, ("zitems unexpectedly NULL"));
		if (((void **) item)[1] != (void *) ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **) item)[1] = 0;
#endif
		z->zitems = ((void **) item)[0];
		z->zfreecnt--;
		z->znalloc++;
		spin_unlock_wr(&z->zlock);
	} else {
		spin_unlock_wr(&z->zlock);
		item = zget(z);
		/*
		 * PANICFAIL allows the caller to assume that the zalloc()
		 * will always succeed.  If it doesn't, we panic here.
		 */
		if (item == NULL && (z->zflags & ZONE_PANICFAIL))
			panic("zalloc(%s) failed", z->zname);
	}
	return item;
}
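
/*
 * Example (illustrative sketch, not part of this file): a caller that
 * cannot tolerate allocation failure can create its zone with
 * ZONE_PANICFAIL and skip the NULL check.  "z" and "struct myitem"
 * are hypothetical.
 *
 *	z = zinit("CRITITEM", sizeof(struct myitem), 0, ZONE_PANICFAIL, 1);
 *	item = zalloc(z);	(never NULL; the zone panics instead)
 */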

/*
 * Free an item to the specified zone.
 */
void
zfree(vm_zone_t z, void *item)
{
	spin_lock_wr(&z->zlock);
	((void **) item)[0] = z->zitems;
#ifdef INVARIANTS
	if (((void **) item)[1] == (void *) ZENTRY_FREE)
		zerror(ZONE_ERROR_ALREADYFREE);
	((void **) item)[1] = (void *) ZENTRY_FREE;
#endif
	z->zitems = item;
	z->zfreecnt++;
	spin_unlock_wr(&z->zlock);
}

/*
 * This file implements a very simple zone allocator, used in lieu of
 * the malloc allocator where it is needed or more optimal.
 *
 * Note that the initial implementation had coloring, and absolutely
 * no improvement (actually a performance degradation) was observed.
 *
 * Note also that the zones are type stable.  The only restriction is
 * that the first two longwords of a data structure may be changed by
 * the allocator between allocations.  Any data that must be stable
 * between allocations must reside after the first two longwords.
 *
 * zinitna, zinit, and zbootinit are the initialization routines.
 * zalloc and zfree are the allocation/free routines.
 */
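
/*
 * Illustrative sketch (hypothetical structure, not part of this file)
 * of the type-stability rule above: the first two pointer-sized words
 * of a zone-allocated structure are used as free-list linkage and must
 * be treated as scratch space.
 *
 *	struct myitem {
 *		void	*scratch[2];	-- clobbered while the item is free
 *		int	stable_state;	-- type-stable across free/alloc
 *	};
 */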

LIST_HEAD(zlist, vm_zone) zlist = LIST_HEAD_INITIALIZER(zlist);
static int sysctl_vm_zone(SYSCTL_HANDLER_ARGS);
static int zone_kmem_pages, zone_kern_pages, zone_kmem_kvaspace;

/*
 * Create a zone, but don't allocate the zone structure.  If the
 * zone had been previously created by the zone boot code, initialize
 * various parts of the zone code.
 *
 * If waits are not allowed during allocation (e.g. during interrupt
 * code), allocate the kernel virtual space up front and allocate
 * only pages when needed.
 *
 * Arguments:
 * z		pointer to zone structure.
 * obj		pointer to VM object (opt).
 * name		name of zone.
 * size		size of zone entries.
 * nentries	number of zone entries allocated (ZONE_INTERRUPT only).
 * flags	ZONE_INTERRUPT -- items can be allocated at interrupt time.
 * zalloc	number of pages allocated when memory is needed.
 *
 * Note that when using ZONE_INTERRUPT, the size of the zone is limited
 * by the nentries argument.  If ZONE_INTERRUPT is not set, the amount
 * of allocatable memory is unlimited.
 */
int
zinitna(vm_zone_t z, vm_object_t obj, char *name, int size,
	int nentries, int flags, int zalloc)
{
	int totsize;

	/*
	 * Only zones created with zinit() are destroyable.
	 */
	if (z->zflags & ZONE_DESTROYABLE)
		panic("zinitna: can't create destroyable zone");

	/*
	 * NOTE: We can only adjust zsize if we previously did not
	 * 	 use zbootinit().
	 */
	if ((z->zflags & ZONE_BOOT) == 0) {
		z->zsize = (size + ZONE_ROUNDING - 1) & ~(ZONE_ROUNDING - 1);
		spin_init(&z->zlock);
		z->zfreecnt = 0;
		z->ztotal = 0;
		z->zmax = 0;
		z->zname = name;
		z->znalloc = 0;
		z->zitems = NULL;

		LIST_INSERT_HEAD(&zlist, z, zlink);
	}

	z->zkmvec = NULL;
	z->zkmcur = z->zkmmax = 0;
	z->zflags |= flags;

	/*
	 * If we cannot wait, allocate KVA space up front, and we will fill
	 * in pages as needed.  This is particularly required when creating
	 * an allocation space for map entries in kernel_map, because we
	 * do not want to go into a recursion deadlock with
	 * vm_map_entry_reserve().
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		totsize = round_page(z->zsize * nentries);
		zone_kmem_kvaspace += totsize;

		z->zkva = kmem_alloc_pageable(&kernel_map, totsize);
		if (z->zkva == 0) {
			LIST_REMOVE(z, zlink);
			return 0;
		}

		z->zpagemax = totsize / PAGE_SIZE;
		if (obj == NULL) {
			z->zobj = vm_object_allocate(OBJT_DEFAULT, z->zpagemax);
		} else {
			z->zobj = obj;
			_vm_object_allocate(OBJT_DEFAULT, z->zpagemax, obj);
		}
		z->zallocflag = VM_ALLOC_SYSTEM | VM_ALLOC_INTERRUPT;
		z->zmax += nentries;
	} else {
		z->zallocflag = VM_ALLOC_NORMAL | VM_ALLOC_SYSTEM;
		z->zmax = 0;
	}

	if (z->zsize > PAGE_SIZE)
		z->zfreemin = 1;
	else
		z->zfreemin = PAGE_SIZE / z->zsize;

	z->zpagecount = 0;
	if (zalloc)
		z->zalloc = zalloc;
	else
		z->zalloc = 1;

	/*
	 * Populate the interrupt zone at creation time rather than
	 * on first allocation, as this is a potentially long operation.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		void *buf;

		buf = zget(z);
		zfree(z, buf);
	}

	return 1;
}
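
/*
 * Example (illustrative; the names are hypothetical): an interrupt-safe
 * zone reserves KVA for a fixed number of entries up front.
 *
 *	static struct vm_zone intrzone;
 *
 *	zinitna(&intrzone, NULL, "INTRITEM", sizeof(struct myitem),
 *	    1024, ZONE_INTERRUPT, 1);
 */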

/*
 * The same as zinitna(), except the zone data structure is allocated
 * automatically by kmalloc().  This is the routine that should normally
 * be used; zbootinit() and zinitna() exist for certain tricky startup
 * conditions in the VM system.  zinit() is the standard zone
 * initialization call.
 */
vm_zone_t
zinit(char *name, int size, int nentries, int flags, int zalloc)
{
	vm_zone_t z;

	z = (vm_zone_t) kmalloc(sizeof (struct vm_zone), M_ZONE, M_NOWAIT);
	if (z == NULL)
		return NULL;

	z->zflags = 0;
	if (zinitna(z, NULL, name, size, nentries,
	            flags & ~ZONE_DESTROYABLE, zalloc) == 0) {
		kfree(z, M_ZONE);
		return NULL;
	}

	if (flags & ZONE_DESTROYABLE)
		z->zflags |= ZONE_DESTROYABLE;

	return z;
}
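
/*
 * Example usage (illustrative only; "myzone" and "struct myitem" are
 * hypothetical):
 *
 *	static vm_zone_t myzone;
 *
 *	myzone = zinit("MYITEM", sizeof(struct myitem), 0, 0, 1);
 *	item = zalloc(myzone);
 *	...
 *	zfree(myzone, item);
 */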

/*
 * Initialize a zone before the system is fully up.  This routine should
 * only be called before full VM startup.
 */
void
zbootinit(vm_zone_t z, char *name, int size, void *item, int nitems)
{
	int i;

	z->zname = name;
	z->zsize = size;
	z->zpagemax = 0;
	z->zobj = NULL;
	z->zflags = ZONE_BOOT;
	z->zfreemin = 0;
	z->zallocflag = 0;
	z->zpagecount = 0;
	z->zalloc = 0;
	z->znalloc = 0;
	spin_init(&z->zlock);

	bzero(item, nitems * z->zsize);
	z->zitems = NULL;
	for (i = 0; i < nitems; i++) {
		((void **) item)[0] = z->zitems;
#ifdef INVARIANTS
		((void **) item)[1] = (void *) ZENTRY_FREE;
#endif
		z->zitems = item;
		item = (uint8_t *)item + z->zsize;
	}
	z->zfreecnt = nitems;
	z->zmax = nitems;
	z->ztotal = nitems;

	LIST_INSERT_HEAD(&zlist, z, zlink);
}
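
/*
 * Example (illustrative sketch): early in boot a zone can be seeded
 * from a static array before the VM system can hand out pages.
 * "bootitems" and NBOOTITEMS are hypothetical.
 *
 *	static struct vm_zone bootzone;
 *	static struct myitem bootitems[NBOOTITEMS];
 *
 *	zbootinit(&bootzone, "BOOTITEM", sizeof(struct myitem),
 *	    bootitems, NBOOTITEMS);
 */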

/*
 * Release all resources owned by a zone created with zinit().
 */
void
zdestroy(vm_zone_t z)
{
	int i;

	if (z == NULL)
		panic("zdestroy: null zone");
	if ((z->zflags & ZONE_DESTROYABLE) == 0)
		panic("zdestroy: undestroyable zone");

	LIST_REMOVE(z, zlink);

	/*
	 * Release virtual mappings, physical memory and update sysctl stats.
	 */
	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Pages mapped via pmap_kenter() must be removed from
		 * kernel_pmap before calling kmem_free() to avoid issues
		 * with kernel_pmap.pm_stats.resident_count.
		 */
		pmap_qremove(z->zkva, z->zpagemax);

		/*
		 * Free the mapping.
		 */
		kmem_free(&kernel_map, z->zkva, z->zpagemax * PAGE_SIZE);
		atomic_subtract_int(&zone_kmem_kvaspace,
		    z->zpagemax * PAGE_SIZE);

		/*
		 * Free the backing object and physical pages.
		 */
		vm_object_deallocate(z->zobj);
		atomic_subtract_int(&zone_kmem_pages, z->zpagecount);
	} else {
		for (i = 0; i < z->zkmcur; i++) {
			kmem_free(&kernel_map, z->zkmvec[i],
			    z->zalloc * PAGE_SIZE);
			atomic_subtract_int(&zone_kern_pages, z->zalloc);
		}
		if (z->zkmvec != NULL)
			kfree(z->zkmvec, M_ZONE);
	}

	spin_uninit(&z->zlock);
	kfree(z, M_ZONE);
}
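
/*
 * Example (illustrative only): only zones created via zinit() with
 * ZONE_DESTROYABLE may be torn down again.
 *
 *	z = zinit("TEMP", sizeof(struct myitem), 0, ZONE_DESTROYABLE, 1);
 *	...
 *	zdestroy(z);
 */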

/*
 * void *zalloc(vm_zone_t zone) --
 *	Returns an item from a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 *
 * void zfree(vm_zone_t zone, void *item) --
 *	Frees an item back to a specified zone.  May not be called from a
 *	FAST interrupt or IPI function.
 */

/*
 * Internal zone routine.  Not to be called from external (non vm_zone) code.
 */
static void *
zget(vm_zone_t z)
{
	int i;
	vm_page_t m;
	int nitems, nbytes;
	int savezpc;
	void *item;

	if (z == NULL)
		panic("zget: null zone");

	if (z->zflags & ZONE_INTERRUPT) {
		/*
		 * Interrupt zones do not mess with the kernel_map, they
		 * simply populate an existing mapping.
		 */
		get_mplock();
		savezpc = z->zpagecount;
		nbytes = z->zpagecount * PAGE_SIZE;
		nbytes -= nbytes % z->zsize;
		item = (char *) z->zkva + nbytes;
		for (i = 0; ((i < z->zalloc) && (z->zpagecount < z->zpagemax));
		     i++) {
			vm_offset_t zkva;

			m = vm_page_alloc(z->zobj, z->zpagecount,
					  z->zallocflag);
			/* note: z might be modified due to blocking */
			if (m == NULL)
				break;

			/*
			 * Unbusy the page so it can be freed in zdestroy().
			 * Make sure it is not on any queue and so can not
			 * be recycled under our feet.
			 */
			KKASSERT(m->queue == PQ_NONE);
			vm_page_flag_clear(m, PG_BUSY);

			zkva = z->zkva + z->zpagecount * PAGE_SIZE;
			pmap_kenter(zkva, VM_PAGE_TO_PHYS(m)); /* YYY */
			bzero((void *)zkva, PAGE_SIZE);
			KKASSERT(savezpc == z->zpagecount);
			++savezpc;
			z->zpagecount++;
			zone_kmem_pages++;
			vmstats.v_wire_count++;
		}
		nitems = ((z->zpagecount * PAGE_SIZE) - nbytes) / z->zsize;
		rel_mplock();
	} else if (z->zflags & ZONE_SPECIAL) {
		/*
		 * The special zone is the one used for vm_map_entry_t's.
		 * We have to avoid an infinite recursion in
		 * vm_map_entry_reserve() by using vm_map_entry_kreserve()
		 * instead.  The map entries are pre-reserved by the kernel
		 * by vm_map_entry_reserve_cpu_init().
		 */
		nbytes = z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes, KM_KRESERVE);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			zone_kern_pages += z->zalloc;	/* not MP-safe XXX */
			bzero(item, nbytes);
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	} else {
		/*
		 * Otherwise allocate KVA from the kernel_map.
		 */
		nbytes = z->zalloc * PAGE_SIZE;

		item = (void *)kmem_alloc3(&kernel_map, nbytes, 0);

		/* note: z might be modified due to blocking */
		if (item != NULL) {
			zone_kern_pages += z->zalloc;	/* not MP-safe XXX */
			bzero(item, nbytes);

			if (z->zflags & ZONE_DESTROYABLE) {
				if (z->zkmcur == z->zkmmax) {
					z->zkmmax =
					    z->zkmmax == 0 ? 1 : z->zkmmax * 2;
					z->zkmvec = krealloc(z->zkmvec,
					    z->zkmmax * sizeof(z->zkmvec[0]),
					    M_ZONE, M_WAITOK);
				}
				z->zkmvec[z->zkmcur++] = (vm_offset_t)item;
			}
		} else {
			nbytes = 0;
		}
		nitems = nbytes / z->zsize;
	}

	spin_lock_wr(&z->zlock);
	z->ztotal += nitems;
	/*
	 * Save one item for immediate allocation.
	 */
	if (nitems != 0) {
		nitems -= 1;
		for (i = 0; i < nitems; i++) {
			((void **) item)[0] = z->zitems;
#ifdef INVARIANTS
			((void **) item)[1] = (void *) ZENTRY_FREE;
#endif
			z->zitems = item;
			item = (uint8_t *)item + z->zsize;
		}
		z->zfreecnt += nitems;
		z->znalloc++;
	} else if (z->zfreecnt > 0) {
		item = z->zitems;
		z->zitems = ((void **) item)[0];
#ifdef INVARIANTS
		if (((void **) item)[1] != (void *) ZENTRY_FREE)
			zerror(ZONE_ERROR_NOTFREE);
		((void **) item)[1] = 0;
#endif
		z->zfreecnt--;
		z->znalloc++;
	} else {
		item = NULL;
	}
	spin_unlock_wr(&z->zlock);

	/*
	 * A special zone may have used a kernel-reserved vm_map_entry.  If
	 * so we have to be sure to recover our reserve so we don't run out.
	 * We will panic if we run out.
	 */
	if (z->zflags & ZONE_SPECIAL)
		vm_map_entry_reserve(0);

	return item;
}

static int
sysctl_vm_zone(SYSCTL_HANDLER_ARGS)
{
	int error = 0;
	vm_zone_t curzone;
	char tmpbuf[128];
	char tmpname[14];

	ksnprintf(tmpbuf, sizeof(tmpbuf),
	    "\nITEM            SIZE     LIMIT    USED    FREE  REQUESTS\n");
	error = SYSCTL_OUT(req, tmpbuf, strlen(tmpbuf));
	if (error)
		return (error);

	LIST_FOREACH(curzone, &zlist, zlink) {
		int i;
		int len;
		int offset;

		len = strlen(curzone->zname);
		if (len >= (sizeof(tmpname) - 1))
			len = (sizeof(tmpname) - 1);
		for (i = 0; i < sizeof(tmpname) - 1; i++)
			tmpname[i] = ' ';
		tmpname[i] = 0;
		memcpy(tmpname, curzone->zname, len);
		tmpname[len] = ':';
		offset = 0;
		if (curzone == LIST_FIRST(&zlist)) {
			offset = 1;
			tmpbuf[0] = '\n';
		}

		ksnprintf(tmpbuf + offset, sizeof(tmpbuf) - offset,
			"%s %6.6u, %8.8u, %6.6u, %6.6u, %8.8u\n",
			tmpname, curzone->zsize, curzone->zmax,
			(curzone->ztotal - curzone->zfreecnt),
			curzone->zfreecnt, curzone->znalloc);

		len = strlen(tmpbuf);
		if (LIST_NEXT(curzone, zlink) == NULL)
			tmpbuf[len - 1] = 0;

		error = SYSCTL_OUT(req, tmpbuf, len);

		if (error)
			return (error);
	}
	return (0);
}
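
/*
 * Example output (illustrative; the zone name and counts are made up,
 * but the layout matches the format string above):
 *
 *	$ sysctl vm.zone
 *	ITEM            SIZE     LIMIT    USED    FREE  REQUESTS
 *	MAP ENTRY:    000064, 00000000, 000512, 000128, 00004096
 */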

#if defined(INVARIANTS)
void
zerror(int error)
{
	const char *msg;

	switch (error) {
	case ZONE_ERROR_INVALID:
		msg = "zone: invalid zone";
		break;
	case ZONE_ERROR_NOTFREE:
		msg = "zone: entry not free";
		break;
	case ZONE_ERROR_ALREADYFREE:
		msg = "zone: freeing free entry";
		break;
	default:
		msg = "zone: invalid error";
		break;
	}
	panic("%s", msg);
}
#endif

SYSCTL_OID(_vm, OID_AUTO, zone, CTLTYPE_STRING | CTLFLAG_RD,
	NULL, 0, sysctl_vm_zone, "A", "Zone Info");

SYSCTL_INT(_vm, OID_AUTO, zone_kmem_pages, CTLFLAG_RD, &zone_kmem_pages, 0,
	"Number of interrupt safe pages allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kmem_kvaspace, CTLFLAG_RD,
	&zone_kmem_kvaspace, 0, "KVA space allocated by zone");
SYSCTL_INT(_vm, OID_AUTO, zone_kern_pages, CTLFLAG_RD, &zone_kern_pages, 0,
	"Number of non-interrupt safe pages allocated by zone");