xref: /openbsd-src/sys/uvm/uvm_amap.c (revision f2da64fbbbf1b03f09f390ab01267c93dfd77c4c)
1 /*	$OpenBSD: uvm_amap.c,v 1.77 2016/09/15 02:00:18 dlg Exp $	*/
2 /*	$NetBSD: uvm_amap.c,v 1.27 2000/11/25 06:27:59 chs Exp $	*/
3 
4 /*
5  * Copyright (c) 1997 Charles D. Cranor and Washington University.
6  * All rights reserved.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
18  * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
19  * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
20  * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
21  * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
22  * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
23  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
24  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
25  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
26  * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27  */
28 
29 /*
30  * uvm_amap.c: amap operations
31  *
32  * this file contains functions that perform operations on amaps.  see
33  * uvm_amap.h for a brief explanation of the role of amaps in uvm.
34  */
35 
36 #include <sys/param.h>
37 #include <sys/systm.h>
38 #include <sys/malloc.h>
39 #include <sys/kernel.h>
40 #include <sys/pool.h>
41 #include <sys/atomic.h>
42 
43 #include <uvm/uvm.h>
44 #include <uvm/uvm_swap.h>
45 
46 /*
47  * pools for allocation of vm_amap structures.  note that in order to
48  * avoid an endless loop, the amap pool's allocator cannot allocate
49  * memory from an amap (it currently goes through the kernel uobj, so
50  * we are ok).
51  */
52 
53 struct pool uvm_amap_pool;
54 struct pool uvm_small_amap_pool[UVM_AMAP_CHUNK];
55 struct pool uvm_amap_chunk_pool;
56 
57 LIST_HEAD(, vm_amap) amap_list;
58 
59 static char amap_small_pool_names[UVM_AMAP_CHUNK][9];
60 
61 /*
62  * local functions
63  */
64 
65 static struct vm_amap *amap_alloc1(int, int, int);
66 static __inline void amap_list_insert(struct vm_amap *);
67 static __inline void amap_list_remove(struct vm_amap *);
68 
69 struct vm_amap_chunk *amap_chunk_get(struct vm_amap *, int, int, int);
70 void amap_chunk_free(struct vm_amap *, struct vm_amap_chunk *);
71 void amap_wiperange_chunk(struct vm_amap *, struct vm_amap_chunk *, int, int);
72 
73 static __inline void
74 amap_list_insert(struct vm_amap *amap)
75 {
76 	LIST_INSERT_HEAD(&amap_list, amap, am_list);
77 }
78 
79 static __inline void
80 amap_list_remove(struct vm_amap *amap)
81 {
82 	LIST_REMOVE(amap, am_list);
83 }
84 
85 /*
86  * amap_chunk_get: lookup a chunk for slot. if create is non-zero,
87  * the chunk is created if it does not yet exist.
88  *
89  * => returns the chunk on success or NULL on error
90  */
91 struct vm_amap_chunk *
92 amap_chunk_get(struct vm_amap *amap, int slot, int create, int waitf)
93 {
94 	int bucket = UVM_AMAP_BUCKET(amap, slot);
95 	int baseslot = AMAP_BASE_SLOT(slot);
96 	int n;
97 	struct vm_amap_chunk *chunk, *newchunk, *pchunk = NULL;
98 
99 	if (UVM_AMAP_SMALL(amap))
100 		return &amap->am_small;
101 
102 	for (chunk = amap->am_buckets[bucket]; chunk != NULL;
103 	    chunk = TAILQ_NEXT(chunk, ac_list)) {
104 		if (UVM_AMAP_BUCKET(amap, chunk->ac_baseslot) != bucket)
105 			break;
106 		if (chunk->ac_baseslot == baseslot)
107 			return chunk;
108 		pchunk = chunk;
109 	}
110 	if (!create)
111 		return NULL;
112 
113 	if (amap->am_nslot - baseslot >= UVM_AMAP_CHUNK)
114 		n = UVM_AMAP_CHUNK;
115 	else
116 		n = amap->am_nslot - baseslot;
117 
118 	newchunk = pool_get(&uvm_amap_chunk_pool, waitf | PR_ZERO);
119 	if (newchunk == NULL)
120 		return NULL;
121 
122 	if (pchunk == NULL) {
123 		TAILQ_INSERT_TAIL(&amap->am_chunks, newchunk, ac_list);
124 		KASSERT(amap->am_buckets[bucket] == NULL);
125 		amap->am_buckets[bucket] = newchunk;
126 	} else
127 		TAILQ_INSERT_AFTER(&amap->am_chunks, pchunk, newchunk,
128 		    ac_list);
129 
130 	amap->am_ncused++;
131 	newchunk->ac_baseslot = baseslot;
132 	newchunk->ac_nslot = n;
133 	return newchunk;
134 }
135 
136 void
137 amap_chunk_free(struct vm_amap *amap, struct vm_amap_chunk *chunk)
138 {
139 	int bucket = UVM_AMAP_BUCKET(amap, chunk->ac_baseslot);
140 	struct vm_amap_chunk *nchunk;
141 
142 	if (UVM_AMAP_SMALL(amap))
143 		return;
144 
145 	nchunk = TAILQ_NEXT(chunk, ac_list);
146 	TAILQ_REMOVE(&amap->am_chunks, chunk, ac_list);
147 	if (amap->am_buckets[bucket] == chunk) {
148 		if (nchunk != NULL &&
149 		    UVM_AMAP_BUCKET(amap, nchunk->ac_baseslot) == bucket)
150 			amap->am_buckets[bucket] = nchunk;
151 		else
152 			amap->am_buckets[bucket] = NULL;
153 
154 	}
155 	pool_put(&uvm_amap_chunk_pool, chunk);
156 	amap->am_ncused--;
157 }
158 
159 #ifdef UVM_AMAP_PPREF
160 /*
161  * what is ppref?   ppref is an _optional_ amap feature which is used
162  * to keep track of reference counts on a per-page basis.  it is enabled
163  * when UVM_AMAP_PPREF is defined.
164  *
165  * when enabled, an array of ints is allocated for the pprefs.  this
166  * array is allocated only when a partial reference is added to the
167  * map (either by unmapping part of the amap, or gaining a reference
168  * to only a part of an amap).  if the malloc of the array fails
169  * (M_NOWAIT), then we set the array pointer to PPREF_NONE to indicate
170  * that we tried to do ppref's but couldn't alloc the array so just
171  * give up (after all, this is an optional feature!).
172  *
173  * the array is divided into page sized "chunks."   for chunks of length 1,
174  * the chunk reference count plus one is stored in that chunk's slot.
175  * for chunks of length > 1 the first slot contains (the reference count
176  * plus one) * -1.    [the negative value indicates that the length is
177  * greater than one.]   the second slot of the chunk contains the length
178  * of the chunk.   here is an example:
179  *
180  * actual REFS:  2  2  2  2  3  1  1  0  0  0  4  4  0  1  1  1
181  *       ppref: -3  4  x  x  4 -2  2 -1  3  x -5  2  1 -2  3  x
182  *              <----------><-><----><-------><----><-><------->
183  * (x = don't care)
184  *
185  * this allows us to allow one int to contain the ref count for the whole
186  * chunk.    note that the "plus one" part is needed because a reference
187  * count of zero is neither positive or negative (need a way to tell
188  * if we've got one zero or a bunch of them).
189  *
190  * here are some in-line functions to help us.
191  */
192 
193 static __inline void pp_getreflen(int *, int, int *, int *);
194 static __inline void pp_setreflen(int *, int, int, int);
195 
196 /*
197  * pp_getreflen: get the reference and length for a specific offset
198  */
199 static __inline void
200 pp_getreflen(int *ppref, int offset, int *refp, int *lenp)
201 {
202 
203 	if (ppref[offset] > 0) {		/* chunk size must be 1 */
204 		*refp = ppref[offset] - 1;	/* don't forget to adjust */
205 		*lenp = 1;
206 	} else {
207 		*refp = (ppref[offset] * -1) - 1;
208 		*lenp = ppref[offset+1];
209 	}
210 }
211 
212 /*
213  * pp_setreflen: set the reference and length for a specific offset
214  */
215 static __inline void
216 pp_setreflen(int *ppref, int offset, int ref, int len)
217 {
218 	if (len == 1) {
219 		ppref[offset] = ref + 1;
220 	} else {
221 		ppref[offset] = (ref + 1) * -1;
222 		ppref[offset+1] = len;
223 	}
224 }
225 #endif
226 
227 /*
228  * amap_init: called at boot time to init global amap data structures
229  */
230 
231 void
232 amap_init(void)
233 {
234 	int i;
235 	size_t size;
236 
237 	/* Initialize the vm_amap pool. */
238 	pool_init(&uvm_amap_pool, sizeof(struct vm_amap),
239 	    0, IPL_NONE, PR_WAITOK, "amappl", NULL);
240 	pool_sethiwat(&uvm_amap_pool, 4096);
241 
242 	/* initialize small amap pools */
243 	for (i = 0; i < nitems(uvm_small_amap_pool); i++) {
244 		snprintf(amap_small_pool_names[i],
245 		    sizeof(amap_small_pool_names[0]), "amappl%d", i + 1);
246 		size = offsetof(struct vm_amap, am_small.ac_anon) +
247 		    (i + 1) * sizeof(struct vm_anon *);
248 		pool_init(&uvm_small_amap_pool[i], size, 0,
249 		    IPL_NONE, 0, amap_small_pool_names[i], NULL);
250 	}
251 
252 	pool_init(&uvm_amap_chunk_pool, sizeof(struct vm_amap_chunk) +
253 	    UVM_AMAP_CHUNK * sizeof(struct vm_anon *),
254 	    0, IPL_NONE, 0, "amapchunkpl", NULL);
255 	pool_sethiwat(&uvm_amap_chunk_pool, 4096);
256 }
257 
258 /*
259  * amap_alloc1: internal function that allocates an amap, but does not
260  *	init the overlay.
261  */
262 static inline struct vm_amap *
263 amap_alloc1(int slots, int waitf, int lazyalloc)
264 {
265 	struct vm_amap *amap;
266 	struct vm_amap_chunk *chunk, *tmp;
267 	int chunks, chunkperbucket = 1, hashshift = 0;
268 	int buckets, i, n;
269 	int pwaitf = (waitf & M_WAITOK) ? PR_WAITOK : PR_NOWAIT;
270 
271 	KASSERT(slots > 0);
272 
273 	/*
274 	 * Cast to unsigned so that rounding up cannot cause integer overflow
275 	 * if slots is large.
276 	 */
277 	chunks = roundup((unsigned int)slots, UVM_AMAP_CHUNK) / UVM_AMAP_CHUNK;
278 
279 	if (lazyalloc) {
280 		/*
281 		 * Basically, the amap is a hash map where the number of
282 		 * buckets is fixed. We select the number of buckets using the
283 		 * following strategy:
284 		 *
285 		 * 1. The maximal number of entries to search in a bucket upon
286 		 * a collision should be less than or equal to
287 		 * log2(slots / UVM_AMAP_CHUNK). This is the worst-case number
288 		 * of lookups we would have if we could chunk the amap. The
289 		 * log2(n) comes from the fact that amaps are chunked by
290 		 * splitting up their vm_map_entries and organizing those
291 		 * in a binary search tree.
292 		 *
293 		 * 2. The maximal number of entries in a bucket must be a
294 		 * power of two.
295 		 *
296 		 * The maximal number of entries per bucket is used to hash
297 		 * a slot to a bucket.
298 		 *
299 		 * In the future, this strategy could be refined to make it
300 		 * even harder/impossible that the total amount of KVA needed
301 		 * for the hash buckets of all amaps to exceed the maximal
302 		 * amount of KVA memory reserved for amaps.
303 		 */
304 		chunkperbucket = 1 << hashshift;
305 		while ((1 << chunkperbucket) * 2 <= chunks) {
306 			hashshift++;
307 			chunkperbucket = 1 << hashshift;
308 		}
309 	}
310 
311 	if (slots > UVM_AMAP_CHUNK)
312 		amap = pool_get(&uvm_amap_pool, pwaitf);
313 	else
314 		amap = pool_get(&uvm_small_amap_pool[slots - 1],
315 		    pwaitf | PR_ZERO);
316 	if (amap == NULL)
317 		return(NULL);
318 
319 	amap->am_ref = 1;
320 	amap->am_flags = 0;
321 #ifdef UVM_AMAP_PPREF
322 	amap->am_ppref = NULL;
323 #endif
324 	amap->am_nslot = slots;
325 	amap->am_nused = 0;
326 
327 	if (UVM_AMAP_SMALL(amap)) {
328 		amap->am_small.ac_nslot = slots;
329 		return (amap);
330 	}
331 
332 	amap->am_ncused = 0;
333 	TAILQ_INIT(&amap->am_chunks);
334 	amap->am_hashshift = hashshift;
335 	amap->am_buckets = NULL;
336 
337 	buckets = howmany(chunks, chunkperbucket);
338 	amap->am_buckets = mallocarray(buckets, sizeof(*amap->am_buckets),
339 	    M_UVMAMAP, waitf | (lazyalloc ? M_ZERO : 0));
340 	if (amap->am_buckets == NULL)
341 		goto fail1;
342 
343 	if (!lazyalloc) {
344 		for (i = 0; i < buckets; i++) {
345 			if (i == buckets - 1) {
346 				n = slots % UVM_AMAP_CHUNK;
347 				if (n == 0)
348 					n = UVM_AMAP_CHUNK;
349 			} else
350 				n = UVM_AMAP_CHUNK;
351 
352 			chunk = pool_get(&uvm_amap_chunk_pool,
353 			    PR_ZERO | pwaitf);
354 			if (chunk == NULL)
355 				goto fail1;
356 
357 			amap->am_buckets[i] = chunk;
358 			amap->am_ncused++;
359 			chunk->ac_baseslot = i * UVM_AMAP_CHUNK;
360 			chunk->ac_nslot = n;
361 			TAILQ_INSERT_TAIL(&amap->am_chunks, chunk, ac_list);
362 		}
363 	}
364 
365 	return(amap);
366 
367 fail1:
368 	free(amap->am_buckets, M_UVMAMAP, 0);
369 	TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp)
370 		pool_put(&uvm_amap_chunk_pool, chunk);
371 	pool_put(&uvm_amap_pool, amap);
372 	return (NULL);
373 }
374 
375 /*
376  * amap_alloc: allocate an amap to manage "sz" bytes of anonymous VM
377  *
378  * => caller should ensure sz is a multiple of PAGE_SIZE
379  * => reference count to new amap is set to one
380  */
381 struct vm_amap *
382 amap_alloc(vaddr_t sz, int waitf, int lazyalloc)
383 {
384 	struct vm_amap *amap;
385 	size_t slots;
386 
387 	AMAP_B2SLOT(slots, sz);		/* load slots */
388 	if (slots > INT_MAX)
389 		return (NULL);
390 
391 	amap = amap_alloc1(slots, waitf, lazyalloc);
392 	if (amap)
393 		amap_list_insert(amap);
394 
395 	return(amap);
396 }
397 
398 
399 /*
400  * amap_free: free an amap
401  *
402  * => the amap should have a zero reference count and be empty
403  */
404 void
405 amap_free(struct vm_amap *amap)
406 {
407 	struct vm_amap_chunk *chunk, *tmp;
408 
409 	KASSERT(amap->am_ref == 0 && amap->am_nused == 0);
410 	KASSERT((amap->am_flags & AMAP_SWAPOFF) == 0);
411 
412 #ifdef UVM_AMAP_PPREF
413 	if (amap->am_ppref && amap->am_ppref != PPREF_NONE)
414 		free(amap->am_ppref, M_UVMAMAP, 0);
415 #endif
416 
417 	if (UVM_AMAP_SMALL(amap))
418 		pool_put(&uvm_small_amap_pool[amap->am_nslot - 1], amap);
419 	else {
420 		TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, tmp)
421 		    pool_put(&uvm_amap_chunk_pool, chunk);
422 		free(amap->am_buckets, M_UVMAMAP, 0);
423 		pool_put(&uvm_amap_pool, amap);
424 	}
425 }
426 
427 /*
428  * amap_wipeout: wipeout all anon's in an amap; then free the amap!
429  *
430  * => called from amap_unref when the final reference to an amap is
431  *	discarded (i.e. when reference count == 1)
432  */
433 
434 void
435 amap_wipeout(struct vm_amap *amap)
436 {
437 	int slot;
438 	struct vm_anon *anon;
439 	struct vm_amap_chunk *chunk;
440 
441 	KASSERT(amap->am_ref == 0);
442 
443 	if (__predict_false((amap->am_flags & AMAP_SWAPOFF) != 0)) {
444 		/* amap_swap_off will call us again. */
445 		return;
446 	}
447 	amap_list_remove(amap);
448 
449 	AMAP_CHUNK_FOREACH(chunk, amap) {
450 		int i, refs, map = chunk->ac_usedmap;
451 
452 		for (i = ffs(map); i != 0; i = ffs(map)) {
453 			slot = i - 1;
454 			map ^= 1 << slot;
455 			anon = chunk->ac_anon[slot];
456 
457 			if (anon == NULL || anon->an_ref == 0)
458 				panic("amap_wipeout: corrupt amap");
459 
460 			refs = --anon->an_ref;
461 			if (refs == 0) {
462 				/*
463 				 * we had the last reference to a vm_anon.
464 				 * free it.
465 				 */
466 				uvm_anfree(anon);
467 			}
468 		}
469 	}
470 
471 	/* now we free the map */
472 	amap->am_ref = 0;	/* ... was one */
473 	amap->am_nused = 0;
474 	amap_free(amap);	/* will free amap */
475 }
476 
477 /*
478  * amap_copy: ensure that a map entry's "needs_copy" flag is false
479  *	by copying the amap if necessary.
480  *
481  * => an entry with a null amap pointer will get a new (blank) one.
482  * => if canchunk is true, then we may clip the entry into a chunk
483  * => "startva" and "endva" are used only if canchunk is true.  they are
484  *     used to limit chunking (e.g. if you have a large space that you
485  *     know you are going to need to allocate amaps for, there is no point
486  *     in allowing that to be chunked)
487  */
488 
489 void
490 amap_copy(struct vm_map *map, struct vm_map_entry *entry, int waitf,
491     boolean_t canchunk, vaddr_t startva, vaddr_t endva)
492 {
493 	struct vm_amap *amap, *srcamap;
494 	int slots, lcv, lazyalloc = 0;
495 	vaddr_t chunksize;
496 	int i, j, k, n, srcslot;
497 	struct vm_amap_chunk *chunk = NULL, *srcchunk = NULL;
498 
499 	/* is there a map to copy?   if not, create one from scratch. */
500 	if (entry->aref.ar_amap == NULL) {
501 		/*
502 		 * check to see if we have a large amap that we can
503 		 * chunk.  we align startva/endva to chunk-sized
504 		 * boundaries and then clip to them.
505 		 *
506 		 * if we cannot chunk the amap, allocate it in a way
507 		 * that makes it grow or shrink dynamically with
508 		 * the number of slots.
509 		 */
510 		if (atop(entry->end - entry->start) >= UVM_AMAP_LARGE) {
511 			if (canchunk) {
512 				/* convert slots to bytes */
513 				chunksize = UVM_AMAP_CHUNK << PAGE_SHIFT;
514 				startva = (startva / chunksize) * chunksize;
515 				endva = roundup(endva, chunksize);
516 				UVM_MAP_CLIP_START(map, entry, startva);
517 				/* watch out for endva wrap-around! */
518 				if (endva >= startva)
519 					UVM_MAP_CLIP_END(map, entry, endva);
520 			} else
521 				lazyalloc = 1;
522 		}
523 
524 		entry->aref.ar_pageoff = 0;
525 		entry->aref.ar_amap = amap_alloc(entry->end - entry->start,
526 		    waitf, lazyalloc);
527 		if (entry->aref.ar_amap != NULL)
528 			entry->etype &= ~UVM_ET_NEEDSCOPY;
529 		return;
530 	}
531 
532 	/*
533 	 * first check and see if we are the only map entry
534 	 * referencing the amap we currently have.  if so, then we can
535 	 * just take it over rather than copying it.  the value can only
536 	 * be one if we have the only reference to the amap
537 	 */
538 	if (entry->aref.ar_amap->am_ref == 1) {
539 		entry->etype &= ~UVM_ET_NEEDSCOPY;
540 		return;
541 	}
542 
543 	/* looks like we need to copy the map. */
544 	AMAP_B2SLOT(slots, entry->end - entry->start);
545 	if (!UVM_AMAP_SMALL(entry->aref.ar_amap) &&
546 	    entry->aref.ar_amap->am_hashshift != 0)
547 		lazyalloc = 1;
548 	amap = amap_alloc1(slots, waitf, lazyalloc);
549 	if (amap == NULL)
550 		return;
551 	srcamap = entry->aref.ar_amap;
552 
553 	/*
554 	 * need to double check reference count now.  the reference count
555 	 * could have changed while we were in malloc.  if the reference count
556 	 * dropped down to one we take over the old map rather than
557 	 * copying the amap.
558 	 */
559 	if (srcamap->am_ref == 1) {		/* take it over? */
560 		entry->etype &= ~UVM_ET_NEEDSCOPY;
561 		amap->am_ref--;		/* drop final reference to map */
562 		amap_free(amap);	/* dispose of new (unused) amap */
563 		return;
564 	}
565 
566 	/* we must copy it now. */
567 	for (lcv = 0; lcv < slots; lcv += n) {
568 		srcslot = entry->aref.ar_pageoff + lcv;
569 		i = UVM_AMAP_SLOTIDX(lcv);
570 		j = UVM_AMAP_SLOTIDX(srcslot);
571 		n = UVM_AMAP_CHUNK;
572 		if (i > j)
573 			n -= i;
574 		else
575 			n -= j;
576 		if (lcv + n > slots)
577 			n = slots - lcv;
578 
579 		srcchunk = amap_chunk_get(srcamap, srcslot, 0, PR_NOWAIT);
580 		if (srcchunk == NULL)
581 			continue;
582 
583 		chunk = amap_chunk_get(amap, lcv, 1, PR_NOWAIT);
584 		if (chunk == NULL) {
585 			amap->am_ref = 0;
586 			amap_wipeout(amap);
587 			return;
588 		}
589 
590 		for (k = 0; k < n; i++, j++, k++) {
591 			chunk->ac_anon[i] = srcchunk->ac_anon[j];
592 			if (chunk->ac_anon[i] == NULL)
593 				continue;
594 
595 			chunk->ac_usedmap |= (1 << i);
596 			chunk->ac_anon[i]->an_ref++;
597 			amap->am_nused++;
598 		}
599 	}
600 
601 	/*
602 	 * drop our reference to the old amap (srcamap).
603 	 * we know that the reference count on srcamap is greater than
604 	 * one (we checked above), so there is no way we could drop
605 	 * the count to zero.  [and no need to worry about freeing it]
606 	 */
607 	srcamap->am_ref--;
608 	if (srcamap->am_ref == 1 && (srcamap->am_flags & AMAP_SHARED) != 0)
609 		srcamap->am_flags &= ~AMAP_SHARED;   /* clear shared flag */
610 #ifdef UVM_AMAP_PPREF
611 	if (srcamap->am_ppref && srcamap->am_ppref != PPREF_NONE) {
612 		amap_pp_adjref(srcamap, entry->aref.ar_pageoff,
613 		    (entry->end - entry->start) >> PAGE_SHIFT, -1);
614 	}
615 #endif
616 
617 	/* install new amap. */
618 	entry->aref.ar_pageoff = 0;
619 	entry->aref.ar_amap = amap;
620 	entry->etype &= ~UVM_ET_NEEDSCOPY;
621 
622 	amap_list_insert(amap);
623 }
624 
625 /*
626  * amap_cow_now: resolve all copy-on-write faults in an amap now for fork(2)
627  *
628  *	called during fork(2) when the parent process has a wired map
629  *	entry.   in that case we want to avoid write-protecting pages
630  *	in the parent's map (e.g. like what you'd do for a COW page)
631  *	so we resolve the COW here.
632  *
633  * => assume parent's entry was wired, thus all pages are resident.
634  * => caller passes child's map/entry in to us
635  * => XXXCDC: out of memory should cause fork to fail, but there is
636  *	currently no easy way to do this (needs fix)
637  */
638 
639 void
640 amap_cow_now(struct vm_map *map, struct vm_map_entry *entry)
641 {
642 	struct vm_amap *amap = entry->aref.ar_amap;
643 	int slot;
644 	struct vm_anon *anon, *nanon;
645 	struct vm_page *pg, *npg;
646 	struct vm_amap_chunk *chunk;
647 
648 	/*
649 	 * note that if we wait, we must ReStart the "lcv" for loop because
650 	 * some other process could reorder the anon's in the
651 	 * am_anon[] array on us.
652 	 */
653 ReStart:
654 	AMAP_CHUNK_FOREACH(chunk, amap) {
655 		int i, map = chunk->ac_usedmap;
656 
657 		for (i = ffs(map); i != 0; i = ffs(map)) {
658 			slot = i - 1;
659 			map ^= 1 << slot;
660 			anon = chunk->ac_anon[slot];
661 			pg = anon->an_page;
662 
663 			/* page must be resident since parent is wired */
664 			if (pg == NULL)
665 				panic("amap_cow_now: non-resident wired page"
666 				    " in anon %p", anon);
667 
668 			/*
669 			 * if the anon ref count is one, we are safe (the child
670 			 * has exclusive access to the page).
671 			 */
672 			if (anon->an_ref <= 1)
673 				continue;
674 
675 			/*
676 			 * if the page is busy then we have to wait for
677 			 * it and then restart.
678 			 */
679 			if (pg->pg_flags & PG_BUSY) {
680 				atomic_setbits_int(&pg->pg_flags, PG_WANTED);
681 				UVM_WAIT(pg, FALSE, "cownow", 0);
682 				goto ReStart;
683 			}
684 
685 			/* ok, time to do a copy-on-write to a new anon */
686 			nanon = uvm_analloc();
687 			if (nanon) {
688 				npg = uvm_pagealloc(NULL, 0, nanon, 0);
689 			} else
690 				npg = NULL;	/* XXX: quiet gcc warning */
691 
692 			if (nanon == NULL || npg == NULL) {
693 				/* out of memory */
694 				/*
695 				 * XXXCDC: we should cause fork to fail, but
696 				 * we can't ...
697 				 */
698 				if (nanon) {
699 					uvm_anfree(nanon);
700 				}
701 				uvm_wait("cownowpage");
702 				goto ReStart;
703 			}
704 
705 			/*
706 			 * got it... now we can copy the data and replace anon
707 			 * with our new one...
708 			 */
709 			uvm_pagecopy(pg, npg);		/* old -> new */
710 			anon->an_ref--;			/* can't drop to zero */
711 			chunk->ac_anon[slot] = nanon;	/* replace */
712 
713 			/*
714 			 * drop PG_BUSY on new page ... since we have had its
715 			 * owner locked the whole time it can't be
716 			 * PG_RELEASED | PG_WANTED.
717 			 */
718 			atomic_clearbits_int(&npg->pg_flags, PG_BUSY|PG_FAKE);
719 			UVM_PAGE_OWN(npg, NULL);
720 			uvm_lock_pageq();
721 			uvm_pageactivate(npg);
722 			uvm_unlock_pageq();
723 		}
724 	}
725 }
726 
727 /*
728  * amap_splitref: split a single reference into two separate references
729  *
730  * => called from uvm_map's clip routines
731  */
732 void
733 amap_splitref(struct vm_aref *origref, struct vm_aref *splitref, vaddr_t offset)
734 {
735 	int leftslots;
736 
737 	AMAP_B2SLOT(leftslots, offset);
738 	if (leftslots == 0)
739 		panic("amap_splitref: split at zero offset");
740 
741 	/* now: we have a valid am_mapped array. */
742 	if (origref->ar_amap->am_nslot - origref->ar_pageoff - leftslots <= 0)
743 		panic("amap_splitref: map size check failed");
744 
745 #ifdef UVM_AMAP_PPREF
746         /* establish ppref before we add a duplicate reference to the amap */
747 	if (origref->ar_amap->am_ppref == NULL)
748 		amap_pp_establish(origref->ar_amap);
749 #endif
750 
751 	splitref->ar_amap = origref->ar_amap;
752 	splitref->ar_amap->am_ref++;		/* not a share reference */
753 	splitref->ar_pageoff = origref->ar_pageoff + leftslots;
754 }
755 
756 #ifdef UVM_AMAP_PPREF
757 
758 /*
759  * amap_pp_establish: add a ppref array to an amap, if possible
760  */
761 void
762 amap_pp_establish(struct vm_amap *amap)
763 {
764 
765 	amap->am_ppref = mallocarray(amap->am_nslot, sizeof(int),
766 	    M_UVMAMAP, M_NOWAIT|M_ZERO);
767 
768 	/* if we fail then we just won't use ppref for this amap */
769 	if (amap->am_ppref == NULL) {
770 		amap->am_ppref = PPREF_NONE;	/* not using it */
771 		return;
772 	}
773 
774 	/* init ppref */
775 	pp_setreflen(amap->am_ppref, 0, amap->am_ref, amap->am_nslot);
776 }
777 
778 /*
779  * amap_pp_adjref: adjust reference count to a part of an amap using the
780  * per-page reference count array.
781  *
782  * => caller must check that ppref != PPREF_NONE before calling
783  */
784 void
785 amap_pp_adjref(struct vm_amap *amap, int curslot, vsize_t slotlen, int adjval)
786 {
787  	int stopslot, *ppref, lcv, prevlcv;
788  	int ref, len, prevref, prevlen;
789 
790 	stopslot = curslot + slotlen;
791 	ppref = amap->am_ppref;
792  	prevlcv = 0;
793 
794 	/*
795  	 * first advance to the correct place in the ppref array,
796  	 * fragment if needed.
797 	 */
798 	for (lcv = 0 ; lcv < curslot ; lcv += len) {
799 		pp_getreflen(ppref, lcv, &ref, &len);
800 		if (lcv + len > curslot) {     /* goes past start? */
801 			pp_setreflen(ppref, lcv, ref, curslot - lcv);
802 			pp_setreflen(ppref, curslot, ref, len - (curslot -lcv));
803 			len = curslot - lcv;   /* new length of entry @ lcv */
804 		}
805 		prevlcv = lcv;
806 	}
807 	if (lcv != 0)
808 		pp_getreflen(ppref, prevlcv, &prevref, &prevlen);
809 	else {
810 		/* Ensure that the "prevref == ref" test below always
811 		 * fails, since we're starting from the beginning of
812 		 * the ppref array; that is, there is no previous
813 		 * chunk.
814 		 */
815 		prevref = -1;
816 		prevlen = 0;
817 	}
818 
819 	/*
820 	 * now adjust reference counts in range.  merge the first
821 	 * changed entry with the last unchanged entry if possible.
822 	 */
823 	if (lcv != curslot)
824 		panic("amap_pp_adjref: overshot target");
825 
826 	for (/* lcv already set */; lcv < stopslot ; lcv += len) {
827 		pp_getreflen(ppref, lcv, &ref, &len);
828 		if (lcv + len > stopslot) {     /* goes past end? */
829 			pp_setreflen(ppref, lcv, ref, stopslot - lcv);
830 			pp_setreflen(ppref, stopslot, ref,
831 			    len - (stopslot - lcv));
832 			len = stopslot - lcv;
833 		}
834 		ref += adjval;
835 		if (ref < 0)
836 			panic("amap_pp_adjref: negative reference count");
837 		if (lcv == prevlcv + prevlen && ref == prevref) {
838 			pp_setreflen(ppref, prevlcv, ref, prevlen + len);
839 		} else {
840 			pp_setreflen(ppref, lcv, ref, len);
841 		}
842 		if (ref == 0)
843 			amap_wiperange(amap, lcv, len);
844 	}
845 
846 }
847 
848 void
849 amap_wiperange_chunk(struct vm_amap *amap, struct vm_amap_chunk *chunk,
850     int slotoff, int slots)
851 {
852 	int curslot, i, map;
853 	int startbase, endbase;
854 	struct vm_anon *anon;
855 
856 	startbase = AMAP_BASE_SLOT(slotoff);
857 	endbase = AMAP_BASE_SLOT(slotoff + slots - 1);
858 
859 	map = chunk->ac_usedmap;
860 	if (startbase == chunk->ac_baseslot)
861 		map &= ~((1 << (slotoff - startbase)) - 1);
862 	if (endbase == chunk->ac_baseslot)
863 		map &= (1 << (slotoff + slots - endbase)) - 1;
864 
865 	for (i = ffs(map); i != 0; i = ffs(map)) {
866 		int refs;
867 
868 		curslot = i - 1;
869 		map ^= 1 << curslot;
870 		chunk->ac_usedmap ^= 1 << curslot;
871 		anon = chunk->ac_anon[curslot];
872 
873 		/* remove it from the amap */
874 		chunk->ac_anon[curslot] = NULL;
875 
876 		amap->am_nused--;
877 
878 		/* drop anon reference count */
879 		refs = --anon->an_ref;
880 		if (refs == 0) {
881 			/*
882 			 * we just eliminated the last reference to an
883 			 * anon.  free it.
884 			 */
885 			uvm_anfree(anon);
886 		}
887 	}
888 }
889 
890 /*
891  * amap_wiperange: wipe out a range of an amap
892  * [different from amap_wipeout because the amap is kept intact]
893  */
894 void
895 amap_wiperange(struct vm_amap *amap, int slotoff, int slots)
896 {
897 	int bucket, startbucket, endbucket;
898 	struct vm_amap_chunk *chunk, *nchunk;
899 
900 	startbucket = UVM_AMAP_BUCKET(amap, slotoff);
901 	endbucket = UVM_AMAP_BUCKET(amap, slotoff + slots - 1);
902 
903 	/*
904 	 * we can either traverse the amap by am_chunks or by am_buckets
905 	 * depending on which is cheaper.    decide now.
906 	 */
907 	if (UVM_AMAP_SMALL(amap))
908 		amap_wiperange_chunk(amap, &amap->am_small, slotoff, slots);
909 	else if (endbucket + 1 - startbucket >= amap->am_ncused) {
910 		TAILQ_FOREACH_SAFE(chunk, &amap->am_chunks, ac_list, nchunk) {
911 			if (chunk->ac_baseslot + chunk->ac_nslot <= slotoff)
912 				continue;
913 			if (chunk->ac_baseslot >= slotoff + slots)
914 				continue;
915 
916 			amap_wiperange_chunk(amap, chunk, slotoff, slots);
917 			if (chunk->ac_usedmap == 0)
918 				amap_chunk_free(amap, chunk);
919 		}
920 	} else {
921 		for (bucket = startbucket; bucket <= endbucket; bucket++) {
922 			for (chunk = amap->am_buckets[bucket]; chunk != NULL;
923 			    chunk = nchunk) {
924 				nchunk = TAILQ_NEXT(chunk, ac_list);
925 
926 				if (UVM_AMAP_BUCKET(amap, chunk->ac_baseslot) !=
927 				    bucket)
928 					break;
929 				if (chunk->ac_baseslot + chunk->ac_nslot <=
930 				    slotoff)
931 					continue;
932 				if (chunk->ac_baseslot >= slotoff + slots)
933 					continue;
934 
935 				amap_wiperange_chunk(amap, chunk, slotoff,
936 				    slots);
937 				if (chunk->ac_usedmap == 0)
938 					amap_chunk_free(amap, chunk);
939 			}
940 		}
941 	}
942 }
943 
944 #endif
945 
946 /*
947  * amap_swap_off: pagein anonymous pages in amaps and drop swap slots.
948  *
949  * => note that we don't always traverse all anons.
950  *    eg. amaps being wiped out, released anons.
951  * => return TRUE if failed.
952  */
953 
954 boolean_t
955 amap_swap_off(int startslot, int endslot)
956 {
957 	struct vm_amap *am;
958 	struct vm_amap *am_next;
959 	boolean_t rv = FALSE;
960 
961 	for (am = LIST_FIRST(&amap_list); am != NULL && !rv; am = am_next) {
962 		int i, map;
963 		struct vm_amap_chunk *chunk;
964 
965 again:
966 		AMAP_CHUNK_FOREACH(chunk, am) {
967 			map = chunk->ac_usedmap;
968 
969 			for (i = ffs(map); i != 0; i = ffs(map)) {
970 				int swslot;
971 				int slot = i - 1;
972 				struct vm_anon *anon;
973 
974 				map ^= 1 << slot;
975 				anon = chunk->ac_anon[slot];
976 
977 				swslot = anon->an_swslot;
978 				if (swslot < startslot || endslot <= swslot) {
979 					continue;
980 				}
981 
982 				am->am_flags |= AMAP_SWAPOFF;
983 
984 				rv = uvm_anon_pagein(anon);
985 
986 				am->am_flags &= ~AMAP_SWAPOFF;
987 				if (rv || amap_refs(am) == 0)
988 					goto nextamap;
989 				goto again;
990 			}
991 		}
992 
993 nextamap:
994 		am_next = LIST_NEXT(am, am_list);
995 		if (amap_refs(am) == 0)
996 			amap_wipeout(am);
997 	}
998 
999 	return rv;
1000 }
1001 
1002 /*
1003  * amap_lookup: look up a page in an amap
1004  */
1005 struct vm_anon *
1006 amap_lookup(struct vm_aref *aref, vaddr_t offset)
1007 {
1008 	int slot;
1009 	struct vm_amap *amap = aref->ar_amap;
1010 	struct vm_amap_chunk *chunk;
1011 
1012 	AMAP_B2SLOT(slot, offset);
1013 	slot += aref->ar_pageoff;
1014 
1015 	if (slot >= amap->am_nslot)
1016 		panic("amap_lookup: offset out of range");
1017 
1018 	chunk = amap_chunk_get(amap, slot, 0, PR_NOWAIT);
1019 	if (chunk == NULL)
1020 		return NULL;
1021 
1022 	return chunk->ac_anon[UVM_AMAP_SLOTIDX(slot)];
1023 }
1024 
1025 /*
1026  * amap_lookups: look up a range of pages in an amap
1027  *
1028  * => XXXCDC: this interface is biased toward array-based amaps.  fix.
1029  */
1030 void
1031 amap_lookups(struct vm_aref *aref, vaddr_t offset,
1032     struct vm_anon **anons, int npages)
1033 {
1034 	int i, lcv, n, slot;
1035 	struct vm_amap *amap = aref->ar_amap;
1036 	struct vm_amap_chunk *chunk = NULL;
1037 
1038 	AMAP_B2SLOT(slot, offset);
1039 	slot += aref->ar_pageoff;
1040 
1041 	if ((slot + (npages - 1)) >= amap->am_nslot)
1042 		panic("amap_lookups: offset out of range");
1043 
1044 	for (i = 0, lcv = slot; lcv < slot + npages; i += n, lcv += n) {
1045 		n = UVM_AMAP_CHUNK - UVM_AMAP_SLOTIDX(lcv);
1046 		if (lcv + n > slot + npages)
1047 			n = slot + npages - lcv;
1048 
1049 		chunk = amap_chunk_get(amap, lcv, 0, PR_NOWAIT);
1050 		if (chunk == NULL)
1051 			memset(&anons[i], 0, n * sizeof(*anons));
1052 		else
1053 			memcpy(&anons[i],
1054 			    &chunk->ac_anon[UVM_AMAP_SLOTIDX(lcv)],
1055 			    n * sizeof(*anons));
1056 	}
1057 }
1058 
1059 /*
1060  * amap_populate: ensure that the amap can store an anon for the page at
1061  * offset. This function can sleep until memory to store the anon is
1062  * available.
1063  */
1064 void
1065 amap_populate(struct vm_aref *aref, vaddr_t offset)
1066 {
1067 	int slot;
1068 	struct vm_amap *amap = aref->ar_amap;
1069 	struct vm_amap_chunk *chunk;
1070 
1071 	AMAP_B2SLOT(slot, offset);
1072 	slot += aref->ar_pageoff;
1073 
1074 	if (slot >= amap->am_nslot)
1075 		panic("amap_populate: offset out of range");
1076 
1077 	chunk = amap_chunk_get(amap, slot, 1, PR_WAITOK);
1078 	KASSERT(chunk != NULL);
1079 }
1080 
1081 /*
1082  * amap_add: add (or replace) a page to an amap
1083  *
1084  * => returns 0 if adding the page was successful or 1 when not.
1085  */
1086 int
1087 amap_add(struct vm_aref *aref, vaddr_t offset, struct vm_anon *anon,
1088     boolean_t replace)
1089 {
1090 	int slot;
1091 	struct vm_amap *amap = aref->ar_amap;
1092 	struct vm_amap_chunk *chunk;
1093 
1094 	AMAP_B2SLOT(slot, offset);
1095 	slot += aref->ar_pageoff;
1096 
1097 	if (slot >= amap->am_nslot)
1098 		panic("amap_add: offset out of range");
1099 	chunk = amap_chunk_get(amap, slot, 1, PR_NOWAIT);
1100 	if (chunk == NULL)
1101 		return 1;
1102 
1103 	slot = UVM_AMAP_SLOTIDX(slot);
1104 	if (replace) {
1105 		if (chunk->ac_anon[slot] == NULL)
1106 			panic("amap_add: replacing null anon");
1107 		if (chunk->ac_anon[slot]->an_page != NULL &&
1108 		    (amap->am_flags & AMAP_SHARED) != 0) {
1109 			pmap_page_protect(chunk->ac_anon[slot]->an_page,
1110 			    PROT_NONE);
1111 			/*
1112 			 * XXX: suppose page is supposed to be wired somewhere?
1113 			 */
1114 		}
1115 	} else {   /* !replace */
1116 		if (chunk->ac_anon[slot] != NULL)
1117 			panic("amap_add: slot in use");
1118 
1119 		chunk->ac_usedmap |= 1 << slot;
1120 		amap->am_nused++;
1121 	}
1122 	chunk->ac_anon[slot] = anon;
1123 
1124 	return 0;
1125 }
1126 
1127 /*
1128  * amap_unadd: remove a page from an amap
1129  */
1130 void
1131 amap_unadd(struct vm_aref *aref, vaddr_t offset)
1132 {
1133 	int slot;
1134 	struct vm_amap *amap = aref->ar_amap;
1135 	struct vm_amap_chunk *chunk;
1136 
1137 	AMAP_B2SLOT(slot, offset);
1138 	slot += aref->ar_pageoff;
1139 
1140 	if (slot >= amap->am_nslot)
1141 		panic("amap_unadd: offset out of range");
1142 	chunk = amap_chunk_get(amap, slot, 0, PR_NOWAIT);
1143 	if (chunk == NULL)
1144 		panic("amap_unadd: chunk for slot %d not present", slot);
1145 
1146 	slot = UVM_AMAP_SLOTIDX(slot);
1147 	if (chunk->ac_anon[slot] == NULL)
1148 		panic("amap_unadd: nothing there");
1149 
1150 	chunk->ac_anon[slot] = NULL;
1151 	chunk->ac_usedmap &= ~(1 << slot);
1152 	amap->am_nused--;
1153 
1154 	if (chunk->ac_usedmap == 0)
1155 		amap_chunk_free(amap, chunk);
1156 }
1157 
1158 /*
1159  * amap_ref: gain a reference to an amap
1160  *
1161  * => "offset" and "len" are in units of pages
1162  * => called at fork time to gain the child's reference
1163  */
1164 void
1165 amap_ref(struct vm_amap *amap, vaddr_t offset, vsize_t len, int flags)
1166 {
1167 
1168 	amap->am_ref++;
1169 	if (flags & AMAP_SHARED)
1170 		amap->am_flags |= AMAP_SHARED;
1171 #ifdef UVM_AMAP_PPREF
1172 	if (amap->am_ppref == NULL && (flags & AMAP_REFALL) == 0 &&
1173 	    len != amap->am_nslot)
1174 		amap_pp_establish(amap);
1175 	if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
1176 		if (flags & AMAP_REFALL)
1177 			amap_pp_adjref(amap, 0, amap->am_nslot, 1);
1178 		else
1179 			amap_pp_adjref(amap, offset, len, 1);
1180 	}
1181 #endif
1182 }
1183 
1184 /*
1185  * amap_unref: remove a reference to an amap
1186  *
1187  * => caller must remove all pmap-level references to this amap before
1188  *	dropping the reference
1189  * => called from uvm_unmap_detach [only]  ... note that entry is no
1190  *	longer part of a map
1191  */
1192 void
1193 amap_unref(struct vm_amap *amap, vaddr_t offset, vsize_t len, boolean_t all)
1194 {
1195 
1196 	/* if we are the last reference, free the amap and return. */
1197 	if (amap->am_ref-- == 1) {
1198 		amap_wipeout(amap);	/* drops final ref and frees */
1199 		return;
1200 	}
1201 
1202 	/* otherwise just drop the reference count(s) */
1203 	if (amap->am_ref == 1 && (amap->am_flags & AMAP_SHARED) != 0)
1204 		amap->am_flags &= ~AMAP_SHARED;	/* clear shared flag */
1205 #ifdef UVM_AMAP_PPREF
1206 	if (amap->am_ppref == NULL && all == 0 && len != amap->am_nslot)
1207 		amap_pp_establish(amap);
1208 	if (amap->am_ppref && amap->am_ppref != PPREF_NONE) {
1209 		if (all)
1210 			amap_pp_adjref(amap, 0, amap->am_nslot, -1);
1211 		else
1212 			amap_pp_adjref(amap, offset, len, -1);
1213 	}
1214 #endif
1215 }
1216