1 /*	$OpenBSD: subr_hibernate.c,v 1.109 2014/11/16 12:31:00 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <uvm/uvm.h>
32 #include <uvm/uvm_swap.h>
33 #include <machine/hibernate.h>
34 
35 /*
36  * Hibernate piglet layout information
37  *
38  * The piglet is a scratch area of memory allocated by the suspending kernel.
39  * Its phys and virt addrs are recorded in the signature block. The piglet is
40  * used to guarantee an unused area of memory that can be used by the resuming
41  * kernel for various things. The piglet is excluded during unpack operations.
42  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
43  *
44  * Offset from piglet_base	Purpose
45  * ----------------------------------------------------------------------------
46  * 0				Private page for suspend I/O write functions
47  * 1*PAGE_SIZE			I/O page used during hibernate suspend
48  * 2*PAGE_SIZE			I/O page used during hibernate suspend
49  * 3*PAGE_SIZE			copy page used during hibernate suspend
50  * 4*PAGE_SIZE			final chunk ordering list (24 pages)
51  * 28*PAGE_SIZE			RLE utility page
52  * 29*PAGE_SIZE			start of hiballoc area
53  * 109*PAGE_SIZE		end of hiballoc area (80 pages)
54  * ...				unused
55  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
56  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
57  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
58  */
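
/*
 * Illustrative sketch (not compiled; variable names are hypothetical):
 * given the layout above, the individual areas can be located from the
 * piglet base with simple offset arithmetic, e.g.
 *
 *	vaddr_t fchunk_list = piglet_va + 4 * PAGE_SIZE;	(24 pages)
 *	vaddr_t rle_page    = piglet_va + 28 * PAGE_SIZE;
 *	vaddr_t hiballoc_va = piglet_va + 29 * PAGE_SIZE;	(80 pages)
 *	vaddr_t chunk_table = piglet_va + HIBERNATE_CHUNK_SIZE;
 *	vaddr_t bounce_area = piglet_va + 2 * HIBERNATE_CHUNK_SIZE;
 *
 * The functions below (hibernate_write_chunks, hibernate_zlib_reset,
 * hibernate_process_chunk) derive their pointers with this arithmetic.
 */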
59 
60 /* Temporary vaddr ranges used during hibernate */
61 vaddr_t hibernate_temp_page;
62 vaddr_t hibernate_copy_page;
63 vaddr_t hibernate_rle_page;
64 
65 /* Hibernate info as read from disk during resume */
66 union hibernate_info disk_hib;
67 
68 /*
69  * Global copy of the pig start address. This needs to be a global as we
70  * switch stacks after computing it - it can't be stored on the stack.
71  */
72 paddr_t global_pig_start;
73 
74 /*
75  * Global copies of the piglet start addresses (PA/VA). We store these
76  * as globals to avoid having to carry them around as parameters, as the
77  * piglet is allocated early and freed late - its lifecycle extends beyond
78  * that of the hibernate info union which is calculated on suspend/resume.
79  */
80 vaddr_t global_piglet_va;
81 paddr_t global_piglet_pa;
82 
83 /* #define HIB_DEBUG */
84 #ifdef HIB_DEBUG
85 int	hib_debug = 99;
86 #define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
87 #define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
88 #else
89 #define DPRINTF(x...)
90 #define DNPRINTF(n,x...)
91 #endif
92 
93 #ifndef NO_PROPOLICE
94 extern long __guard_local;
95 #endif /* ! NO_PROPOLICE */
96 
97 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
98 int hibernate_calc_rle(paddr_t, paddr_t);
99 int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
100 	size_t *);
101 
102 #define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)
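
/*
 * With the typical 4MB HIBERNATE_CHUNK_SIZE and 4KB pages (both are MD,
 * so these numbers are only illustrative) this works out to
 *
 *	MAX_RLE = HIBERNATE_CHUNK_SIZE / PAGE_SIZE = 4MB / 4KB = 1024
 *
 * which is why hibernate_inflate_page() below rejects RLE counts above 1024.
 */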
103 
104 /*
105  * Hib alloc enforced alignment.
106  */
107 #define HIB_ALIGN		8 /* bytes alignment */
108 
109 /*
110  * sizeof builtin operation, but with alignment constraint.
111  */
112 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
113 
114 struct hiballoc_entry {
115 	size_t			hibe_use;
116 	size_t			hibe_space;
117 	RB_ENTRY(hiballoc_entry) hibe_entry;
118 };
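
/*
 * A sketch of the arena layout this header implies (illustrative only):
 * every allocation is preceded by its bookkeeping entry, so for an entry e
 * the user pointer is
 *
 *	(caddr_t)e + HIB_SIZEOF(struct hiballoc_entry)
 *
 * and memory looks like
 *
 *	[ entry ][ hibe_use bytes in use ][ hibe_space bytes free ][ entry ] ...
 *
 * hib_entry_to_addr() and hib_addr_to_entry() below are just this
 * arithmetic in both directions.
 */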
119 
120 /*
121  * Sort hibernate memory ranges by ascending PA
122  */
123 void
124 hibernate_sort_ranges(union hibernate_info *hib_info)
125 {
126 	int i, j;
127 	struct hibernate_memory_range *ranges;
128 	paddr_t base, end;
129 
130 	ranges = hib_info->ranges;
131 
132 	for (i = 1; i < hib_info->nranges; i++) {
133 		j = i;
134 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
135 			base = ranges[j].base;
136 			end = ranges[j].end;
137 			ranges[j].base = ranges[j - 1].base;
138 			ranges[j].end = ranges[j - 1].end;
139 			ranges[j - 1].base = base;
140 			ranges[j - 1].end = end;
141 			j--;
142 		}
143 	}
144 }
145 
146 /*
147  * Compare hiballoc entries based on the address they manage.
148  *
149  * Since the managed address sits at a fixed offset from its struct
150  * hiballoc_entry, we just compare the hiballoc_entry pointers.
151  */
152 static __inline int
153 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
154 {
155 	return l < r ? -1 : (l > r);
156 }
157 
158 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
159 
160 /*
161  * Given a hiballoc entry, return the address it manages.
162  */
163 static __inline void *
164 hib_entry_to_addr(struct hiballoc_entry *entry)
165 {
166 	caddr_t addr;
167 
168 	addr = (caddr_t)entry;
169 	addr += HIB_SIZEOF(struct hiballoc_entry);
170 	return addr;
171 }
172 
173 /*
174  * Given an address, find the hiballoc_entry that corresponds to it.
175  */
176 static __inline struct hiballoc_entry*
177 hib_addr_to_entry(void *addr_param)
178 {
179 	caddr_t addr;
180 
181 	addr = (caddr_t)addr_param;
182 	addr -= HIB_SIZEOF(struct hiballoc_entry);
183 	return (struct hiballoc_entry*)addr;
184 }
185 
186 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
187 
188 /*
189  * Allocate memory from the arena.
190  *
191  * Returns NULL if no memory is available.
192  */
193 void *
194 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
195 {
196 	struct hiballoc_entry *entry, *new_entry;
197 	size_t find_sz;
198 
199 	/*
200 	 * Enforce alignment of HIB_ALIGN bytes.
201 	 *
202 	 * Note that, because the entry is put in front of the allocation,
203 	 * 0-byte allocations are guaranteed a unique address.
204 	 */
205 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
206 
207 	/*
208 	 * Find an entry with hibe_space >= find_sz.
209 	 *
210 	 * If the root node is not large enough, we switch to tree traversal.
211 	 * Because all entries are made at the bottom of the free space,
212 	 * traversal from the end has a slightly better chance of yielding
213 	 * a sufficiently large space.
214 	 */
215 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
216 	entry = RB_ROOT(&arena->hib_addrs);
217 	if (entry != NULL && entry->hibe_space < find_sz) {
218 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
219 			if (entry->hibe_space >= find_sz)
220 				break;
221 		}
222 	}
223 
224 	/*
225 	 * Insufficient or too fragmented memory.
226 	 */
227 	if (entry == NULL)
228 		return NULL;
229 
230 	/*
231 	 * Create new entry in allocated space.
232 	 */
233 	new_entry = (struct hiballoc_entry*)(
234 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
235 	new_entry->hibe_space = entry->hibe_space - find_sz;
236 	new_entry->hibe_use = alloc_sz;
237 
238 	/*
239 	 * Insert entry.
240 	 */
241 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
242 		panic("hib_alloc: insert failure");
243 	entry->hibe_space = 0;
244 
245 	/* Return address managed by entry. */
246 	return hib_entry_to_addr(new_entry);
247 }
248 
249 /*
250  * Free a pointer previously allocated from this arena.
251  *
252  * If addr is NULL, this will be silently accepted.
253  */
254 void
255 hib_free(struct hiballoc_arena *arena, void *addr)
256 {
257 	struct hiballoc_entry *entry, *prev;
258 
259 	if (addr == NULL)
260 		return;
261 
262 	/*
263 	 * Derive entry from addr and check it is really in this arena.
264 	 */
265 	entry = hib_addr_to_entry(addr);
266 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
267 		panic("hib_free: freed item %p not in hib arena", addr);
268 
269 	/*
270 	 * Give the space in entry to its predecessor.
271 	 *
272 	 * If entry has no predecessor, change its used space into free space
273 	 * instead.
274 	 */
275 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
276 	if (prev != NULL &&
277 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
278 	    prev->hibe_use + prev->hibe_space) == entry) {
279 		/* Merge entry. */
280 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
281 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
282 		    entry->hibe_use + entry->hibe_space;
283 	} else {
284 		/* Flip used memory to free space. */
285 		entry->hibe_space += entry->hibe_use;
286 		entry->hibe_use = 0;
287 	}
288 }
289 
290 /*
291  * Initialize hiballoc.
292  *
293  * The allocator will manage memory at ptr, which is len bytes.
294  */
295 int
296 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
297 {
298 	struct hiballoc_entry *entry;
299 	caddr_t ptr;
300 	size_t len;
301 
302 	RB_INIT(&arena->hib_addrs);
303 
304 	/*
305 	 * Hib allocator enforces HIB_ALIGN alignment.
306 	 * Fixup ptr and len.
307 	 */
308 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
309 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
310 	len &= ~((size_t)HIB_ALIGN - 1);
311 
312 	/*
313 	 * Insufficient memory to be able to allocate and also do bookkeeping.
314 	 */
315 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
316 		return ENOMEM;
317 
318 	/*
319 	 * Create entry describing space.
320 	 */
321 	entry = (struct hiballoc_entry*)ptr;
322 	entry->hibe_use = 0;
323 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
324 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
325 
326 	return 0;
327 }
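
/*
 * Typical use of the arena API, as hibernate_zlib_reset() does further down
 * with the piglet's hiballoc pages (a hedged sketch; scratch_va and
 * scratch_len are hypothetical):
 *
 *	struct hiballoc_arena arena;
 *	void *p;
 *
 *	if (hiballoc_init(&arena, scratch_va, scratch_len) == 0) {
 *		p = hib_alloc(&arena, 128);
 *		if (p != NULL)
 *			hib_free(&arena, p);
 *	}
 */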
328 
329 /*
330  * Zero all free memory.
331  */
332 void
333 uvm_pmr_zero_everything(void)
334 {
335 	struct uvm_pmemrange	*pmr;
336 	struct vm_page		*pg;
337 	int			 i;
338 
339 	uvm_lock_fpageq();
340 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
341 		/* Zero single pages. */
342 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
343 		    != NULL) {
344 			uvm_pmr_remove(pmr, pg);
345 			uvm_pagezero(pg);
346 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
347 			uvmexp.zeropages++;
348 			uvm_pmr_insert(pmr, pg, 0);
349 		}
350 
351 		/* Zero multi page ranges. */
352 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
353 		    != NULL) {
354 			pg--; /* Size tree always has second page. */
355 			uvm_pmr_remove(pmr, pg);
356 			for (i = 0; i < pg->fpgsz; i++) {
357 				uvm_pagezero(&pg[i]);
358 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
359 				uvmexp.zeropages++;
360 			}
361 			uvm_pmr_insert(pmr, pg, 0);
362 		}
363 	}
364 	uvm_unlock_fpageq();
365 }
366 
367 /*
368  * Mark all memory as dirty.
369  *
370  * Used to inform the system that the clean memory isn't clean for some
371  * reason, for example because we just came back from hibernate.
372  */
373 void
374 uvm_pmr_dirty_everything(void)
375 {
376 	struct uvm_pmemrange	*pmr;
377 	struct vm_page		*pg;
378 	int			 i;
379 
380 	uvm_lock_fpageq();
381 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
382 		/* Dirty single pages. */
383 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
384 		    != NULL) {
385 			uvm_pmr_remove(pmr, pg);
386 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
387 			uvm_pmr_insert(pmr, pg, 0);
388 		}
389 
390 		/* Dirty multi page ranges. */
391 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
392 		    != NULL) {
393 			pg--; /* Size tree always has second page. */
394 			uvm_pmr_remove(pmr, pg);
395 			for (i = 0; i < pg->fpgsz; i++)
396 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
397 			uvm_pmr_insert(pmr, pg, 0);
398 		}
399 	}
400 
401 	uvmexp.zeropages = 0;
402 	uvm_unlock_fpageq();
403 }
404 
405 /*
406  * Allocate an area that can hold sz bytes and doesn't overlap with
407  * the piglet at piglet_pa.
408  */
409 int
410 uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
411 {
412 	struct uvm_constraint_range pig_constraint;
413 	struct kmem_pa_mode kp_pig = {
414 		.kp_constraint = &pig_constraint,
415 		.kp_maxseg = 1
416 	};
417 	vaddr_t va;
418 
419 	sz = round_page(sz);
420 
421 	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
422 	pig_constraint.ucr_high = -1;
423 
424 	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
425 	if (va == 0) {
426 		pig_constraint.ucr_low = 0;
427 		pig_constraint.ucr_high = piglet_pa - 1;
428 
429 		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
430 		if (va == 0)
431 			return ENOMEM;
432 	}
433 
434 	pmap_extract(pmap_kernel(), va, pa);
435 	return 0;
436 }
437 
438 /*
439  * Allocate a piglet area.
440  *
441  * This needs to be in DMA-safe memory.
442  * The piglet is aligned to the requested alignment.
443  *
444  * sz and align are in bytes.
445  *
446  * The call will sleep for the pagedaemon to attempt to free memory.
447  * The pagedaemon may decide it's not possible to free enough memory, causing
448  * the allocation to fail.
449  */
450 int
451 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
452 {
453 	struct kmem_pa_mode kp_piglet = {
454 		.kp_constraint = &dma_constraint,
455 		.kp_align = align,
456 		.kp_maxseg = 1
457 	};
458 
459 	/* Ensure align is a power of 2 */
460 	KASSERT((align & (align - 1)) == 0);
461 
462 	/*
463 	 * Fixup arguments: align must be at least PAGE_SIZE,
464 	 * sz will be converted to pagecount, since that is what
465 	 * pmemrange uses internally.
466 	 */
467 	if (align < PAGE_SIZE)
468 		align = PAGE_SIZE;
469 	sz = round_page(sz);
470 
471 	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
472 	if (*va == 0)
473 		return ENOMEM;
474 
475 	pmap_extract(pmap_kernel(), *va, pa);
476 	return 0;
477 }
478 
479 /*
480  * Free a piglet area.
481  */
482 void
483 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
484 {
485 	/*
486 	 * Fix parameters.
487 	 */
488 	sz = round_page(sz);
489 
490 	/*
491 	 * Free the physical and virtual memory.
492 	 */
493 	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
494 }
495 
496 /*
497  * Physmem RLE compression support.
498  *
499  * Given a physical page address, return the number of pages starting at the
500  * address that are free.  Clamps to the number of pages in
501  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
502  */
503 int
504 uvm_page_rle(paddr_t addr)
505 {
506 	struct vm_page		*pg, *pg_end;
507 	struct vm_physseg	*vmp;
508 	int			 pseg_idx, off_idx;
509 
510 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
511 	if (pseg_idx == -1)
512 		return 0;
513 
514 	vmp = &vm_physmem[pseg_idx];
515 	pg = &vmp->pgs[off_idx];
516 	if (!(pg->pg_flags & PQ_FREE))
517 		return 0;
518 
519 	/*
520 	 * Search for the first non-free page after pg.
521 	 * Note that the page may not be the first page in a free pmemrange,
522 	 * therefore pg->fpgsz cannot be used.
523 	 */
524 	for (pg_end = pg; pg_end <= vmp->lastpg &&
525 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
526 		;
527 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
528 }
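
/*
 * Example (illustrative values): if uvm_page_rle(inaddr) returns 3, then
 * inaddr and the two pages after it are free and carry no data worth
 * saving, so the suspend path records a count of 3 and skips ahead:
 *
 *	rle = uvm_page_rle(inaddr);
 *	if (rle > 0)
 *		inaddr += rle * PAGE_SIZE;
 *
 * This is what hibernate_calc_rle()/hibernate_write_rle() below do, with an
 * additional clamp at the end of the current chunk.
 */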
529 
530 /*
531  * Fills out the hibernate_info union pointed to by hib
532  * with information about this machine (swap signature block
533  * offsets, number of memory ranges, kernel in use, etc.).
534  */
535 int
536 get_hibernate_info(union hibernate_info *hib, int suspend)
537 {
538 	struct disklabel dl;
539 	char err_string[128], *dl_ret;
540 
541 #ifndef NO_PROPOLICE
542 	/* Save propolice guard */
543 	hib->guard = __guard_local;
544 #endif /* ! NO_PROPOLICE */
545 
546 	/* Determine I/O function to use */
547 	hib->io_func = get_hibernate_io_function(swdevt[0].sw_dev);
548 	if (hib->io_func == NULL)
549 		return (1);
550 
551 	/* Calculate hibernate device */
552 	hib->dev = swdevt[0].sw_dev;
553 
554 	/* Read disklabel (used to calculate signature and image offsets) */
555 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));
556 
557 	if (dl_ret) {
558 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
559 		return (1);
560 	}
561 
562 	/* Make sure we have a swap partition. */
563 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
564 	    DL_GETPSIZE(&dl.d_partitions[1]) == 0)
565 		return (1);
566 
567 	/* Make sure the signature can fit in one block */
568 	if (sizeof(union hibernate_info) > DEV_BSIZE)
569 		return (1);
570 
571 	/* Magic number */
572 	hib->magic = HIBERNATE_MAGIC;
573 
574 	/* Calculate signature block location */
575 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[1]) -
576 	    sizeof(union hibernate_info)/DEV_BSIZE;
577 
578 	/* Stash kernel version information */
579 	memset(&hib->kernel_version, 0, 128);
580 	bcopy(version, &hib->kernel_version,
581 	    min(strlen(version), sizeof(hib->kernel_version)-1));
582 
583 	if (suspend) {
584 		/* Grab the previously-allocated piglet addresses */
585 		hib->piglet_va = global_piglet_va;
586 		hib->piglet_pa = global_piglet_pa;
587 		hib->io_page = (void *)hib->piglet_va;
588 
589 		/*
590 		 * Initialization of the hibernate IO function for drivers
591 		 * that need to do prep work (such as allocating memory or
592 		 * setting up data structures that cannot safely be done
593 		 * during suspend without causing side effects). There is
594 		 * a matching HIB_DONE call performed after the write is
595 		 * completed.
596 		 */
597 		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[1]),
598 		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[1]),
599 		    HIB_INIT, hib->io_page))
600 			goto fail;
601 
602 	} else {
603 		/*
604 		 * Resuming kernels use a regular private page for the driver.
605 		 * No need to free this I/O page, as it will vanish as part of
606 		 * the resume.
607 		 */
608 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
609 		if (!hib->io_page)
610 			goto fail;
611 	}
612 
613 	if (get_hibernate_info_md(hib))
614 		goto fail;
615 
616 	return (0);
617 
618 fail:
619 	return (1);
620 }
621 
622 /*
623  * Allocate nitems*size bytes from the hiballoc area presently in use
624  */
625 void *
626 hibernate_zlib_alloc(void *unused, int nitems, int size)
627 {
628 	struct hibernate_zlib_state *hibernate_state;
629 
630 	hibernate_state =
631 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
632 
633 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
634 }
635 
636 /*
637  * Free the memory pointed to by addr in the hiballoc area presently in
638  * use
639  */
640 void
641 hibernate_zlib_free(void *unused, void *addr)
642 {
643 	struct hibernate_zlib_state *hibernate_state;
644 
645 	hibernate_state =
646 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
647 
648 	hib_free(&hibernate_state->hiballoc_arena, addr);
649 }
650 
651 /*
652  * Inflate next page of data from the image stream.
653  * The rle parameter is modified on exit to contain the number of pages to
654  * skip in the output stream (or 0 if this page was inflated into).
655  *
656  * Returns 0 if the stream contains additional data, or 1 if the stream is
657  * finished.
658  */
659 int
660 hibernate_inflate_page(int *rle)
661 {
662 	struct hibernate_zlib_state *hibernate_state;
663 	int i;
664 
665 	hibernate_state =
666 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
667 
668 	/* Set up the stream for RLE code inflate */
669 	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
670 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
671 
672 	/* Inflate RLE code */
673 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
674 	if (i != Z_OK && i != Z_STREAM_END) {
675 		/*
676 		 * XXX - this will likely reboot/hang most machines
677 		 *       since the console output buffer will be unmapped,
678 		 *       but there's not much else we can do here.
679 		 */
680 		panic("rle inflate stream error");
681 	}
682 
683 	if (hibernate_state->hib_stream.avail_out != 0) {
684 		/*
685 		 * XXX - this will likely reboot/hang most machines
686 		 *       since the console output buffer will be unmapped,
687 		 *       but there's not much else we can do here.
688 		 */
689 		panic("rle short inflate error");
690 	}
691 
692 	if (*rle < 0 || *rle > 1024) {
693 		/*
694 		 * XXX - this will likely reboot/hang most machines
695 		 *       since the console output buffer will be unmapped,
696 		 *       but there's not much else we can do here.
697 		 */
698 		panic("invalid rle count");
699 	}
700 
701 	if (i == Z_STREAM_END)
702 		return (1);
703 
704 	if (*rle != 0)
705 		return (0);
706 
707 	/* Set up the stream for page inflate */
708 	hibernate_state->hib_stream.next_out =
709 		(unsigned char *)HIBERNATE_INFLATE_PAGE;
710 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
711 
712 	/* Process next block of data */
713 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
714 	if (i != Z_OK && i != Z_STREAM_END) {
715 		/*
716 		 * XXX - this will likely reboot/hang most machines
717 		 *       since the console output buffer will be unmapped,
718 		 *       but there's not much else we can do here.
719 		 */
720 		panic("inflate error");
721 	}
722 
723 	/* We should always have extracted a full page ... */
724 	if (hibernate_state->hib_stream.avail_out != 0) {
725 		/*
726 		 * XXX - this will likely reboot/hang most machines
727 		 *       since the console output buffer will be unmapped,
728 		 *       but there's not much else we can do here.
729 		 */
730 		panic("incomplete page");
731 	}
732 
733 	return (i == Z_STREAM_END);
734 }
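
/*
 * Conceptually, the compressed stream consumed above is a sequence of
 * records of the following shape (a sketch of the format, not a struct
 * used by the code):
 *
 *	int  rle;			always inflated first
 *	char page[PAGE_SIZE];		present only when rle == 0
 *
 * rle > 0 means "the next rle pages were free at suspend time": no page
 * data follows and the caller just advances the output address by
 * rle * PAGE_SIZE, as hibernate_inflate_region() does below.
 */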
735 
736 /*
737  * Inflate size bytes from src into dest, skipping any pages in
738  * [src..dest] that are special (see hibernate_inflate_skip)
739  *
740  * This function executes while using the resume-time stack
741  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
742  * will likely hang or reset the machine since the console output buffer
743  * will be unmapped.
744  */
745 void
746 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
747     paddr_t src, size_t size)
748 {
749 	int end_stream = 0, rle;
750 	struct hibernate_zlib_state *hibernate_state;
751 
752 	hibernate_state =
753 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
754 
755 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
756 	hibernate_state->hib_stream.avail_in = size;
757 
758 	do {
759 		/*
760 		 * Is this a special page? If yes, redirect the
761 		 * inflate output to a scratch page (eg, discard it)
762 		 */
763 		if (hibernate_inflate_skip(hib, dest)) {
764 			hibernate_enter_resume_mapping(
765 			    HIBERNATE_INFLATE_PAGE,
766 			    HIBERNATE_INFLATE_PAGE, 0);
767 		} else {
768 			hibernate_enter_resume_mapping(
769 			    HIBERNATE_INFLATE_PAGE, dest, 0);
770 		}
771 
772 		hibernate_flush();
773 		end_stream = hibernate_inflate_page(&rle);
774 
775 		if (rle == 0)
776 			dest += PAGE_SIZE;
777 		else
778 			dest += (rle * PAGE_SIZE);
779 	} while (!end_stream);
780 }
781 
782 /*
783  * Deflate from src into the I/O page, up to 'remaining' bytes.
784  *
785  * Returns the number of input bytes consumed, and may reset
786  * the 'remaining' parameter if not all the output space was consumed
787  * (this information is needed to know how much to write to disk).
788  */
789 size_t
790 hibernate_deflate(union hibernate_info *hib, paddr_t src,
791     size_t *remaining)
792 {
793 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
794 	struct hibernate_zlib_state *hibernate_state;
795 
796 	hibernate_state =
797 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
798 
799 	/* Set up the stream for deflate */
800 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
801 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
802 	hibernate_state->hib_stream.next_out =
803 		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
804 	hibernate_state->hib_stream.avail_out = *remaining;
805 
806 	/* Process next block of data */
807 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
808 		panic("hibernate zlib deflate error");
809 
810 	/* Update pointers and return number of bytes consumed */
811 	*remaining = hibernate_state->hib_stream.avail_out;
812 	return (PAGE_SIZE - (src & PAGE_MASK)) -
813 	    hibernate_state->hib_stream.avail_in;
814 }
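
/*
 * Worked example (numbers illustrative): with *remaining == 1024 bytes of
 * space left in the I/O page and a page-aligned src, zlib might turn the
 * 4096 input bytes into 700 output bytes; the call then returns 4096 and
 * leaves *remaining == 324. Once *remaining reaches 0, the caller flushes
 * the I/O page to disk and resets *remaining to PAGE_SIZE.
 */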
815 
816 /*
817  * Write the hibernation information specified in hib
818  * to the location in swap previously calculated (last block of
819  * swap), called the "signature block".
820  */
821 int
822 hibernate_write_signature(union hibernate_info *hib)
823 {
824 	/* Write hibernate info to disk */
825 	return (hib->io_func(hib->dev, hib->sig_offset,
826 	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
827 	    hib->io_page));
828 }
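
/*
 * The signature occupies the last block(s) of the swap partition. With the
 * usual 512-byte DEV_BSIZE and a one-block hibernate_info,
 * get_hibernate_info() above computes
 *
 *	sig_offset = DL_GETPSIZE(&dl.d_partitions[1])
 *	    - sizeof(union hibernate_info) / DEV_BSIZE;
 *
 * so, purely as an illustration, a 2097152-block swap partition puts the
 * signature at block 2097151.
 */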
829 
830 /*
831  * Write the memory chunk table to the area in swap immediately
832  * preceding the signature block. The chunk table is stored
833  * in the piglet when this function is called.  Returns errno.
834  */
835 int
836 hibernate_write_chunktable(union hibernate_info *hib)
837 {
838 	vaddr_t hibernate_chunk_table_start;
839 	size_t hibernate_chunk_table_size;
840 	int i, err;
841 
842 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
843 
844 	hibernate_chunk_table_start = hib->piglet_va +
845 	    HIBERNATE_CHUNK_SIZE;
846 
847 	/* Write chunk table */
848 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
849 		if ((err = hib->io_func(hib->dev,
850 		    hib->chunktable_offset + (i/DEV_BSIZE),
851 		    (vaddr_t)(hibernate_chunk_table_start + i),
852 		    MAXPHYS, HIB_W, hib->io_page))) {
853 			DPRINTF("chunktable write error: %d\n", err);
854 			return (err);
855 		}
856 	}
857 
858 	return (0);
859 }
860 
861 /*
862  * Write an empty hibernate_info to the swap signature block; an all-zero
863  * block is guaranteed not to match any valid hib.
864  */
865 int
866 hibernate_clear_signature(void)
867 {
868 	union hibernate_info blank_hiber_info;
869 	union hibernate_info hib;
870 
871 	/* Zero out a blank hiber_info */
872 	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));
873 
874 	/* Get the signature block location */
875 	if (get_hibernate_info(&hib, 0))
876 		return (1);
877 
878 	/* Write (zeroed) hibernate info to disk */
879 	DPRINTF("clearing hibernate signature block location: %lld\n",
880 		hib.sig_offset);
881 	if (hibernate_block_io(&hib,
882 	    hib.sig_offset,
883 	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
884 		printf("Warning: could not clear hibernate signature\n");
885 
886 	return (0);
887 }
888 
889 /*
890  * Compare two hibernate_infos to determine if they are the same (e.g.,
891  * whether we should be performing a hibernate resume on this machine).
892  * Not all fields are checked - just enough to verify that the machine
893  * has the same memory configuration and kernel as the one that
894  * wrote the signature previously.
895  */
896 int
897 hibernate_compare_signature(union hibernate_info *mine,
898     union hibernate_info *disk)
899 {
900 	u_int i;
901 
902 	if (mine->nranges != disk->nranges) {
903 		DPRINTF("hibernate memory range count mismatch\n");
904 		return (1);
905 	}
906 
907 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0) {
908 		DPRINTF("hibernate kernel version mismatch\n");
909 		return (1);
910 	}
911 
912 	for (i = 0; i < mine->nranges; i++) {
913 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
914 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
915 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
916 				i,
917 				(void *)mine->ranges[i].base,
918 				(void *)mine->ranges[i].end,
919 				(void *)disk->ranges[i].base,
920 				(void *)disk->ranges[i].end);
921 			return (1);
922 		}
923 	}
924 
925 	return (0);
926 }
927 
928 /*
929  * Transfers xfer_size bytes between the hibernate device specified in
930  * hib at offset blkctr and the vaddr specified at dest.
931  *
932  * Separate offsets and pages are used to handle misaligned reads (reads
933  * that span a page boundary).
934  *
935  * blkctr specifies a relative offset (relative to the start of swap),
936  * not an absolute disk offset.
937  *
938  */
939 int
940 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
941     size_t xfer_size, vaddr_t dest, int iswrite)
942 {
943 	struct buf *bp;
944 	struct bdevsw *bdsw;
945 	int error;
946 
947 	bp = geteblk(xfer_size);
948 	bdsw = &bdevsw[major(hib->dev)];
949 
950 	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
951 	if (error) {
952 		printf("hibernate_block_io open failed\n");
953 		return (1);
954 	}
955 
956 	if (iswrite)
957 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
958 
959 	bp->b_bcount = xfer_size;
960 	bp->b_blkno = blkctr;
961 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
962 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
963 	bp->b_dev = hib->dev;
964 	(*bdsw->d_strategy)(bp);
965 
966 	error = biowait(bp);
967 	if (error) {
968 		printf("hib block_io biowait error %d blk %lld size %zu\n",
969 			error, (long long)blkctr, xfer_size);
970 		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
971 		    curproc);
972 		if (error)
973 			printf("hibernate_block_io error close failed\n");
974 		return (1);
975 	}
976 
977 	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
978 	if (error) {
979 		printf("hibernate_block_io close failed\n");
980 		return (1);
981 	}
982 
983 	if (!iswrite)
984 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
985 
986 	bp->b_flags |= B_INVAL;
987 	brelse(bp);
988 
989 	return (0);
990 }
991 
992 /*
993  * Reads the signature block from swap and checks it against the current
994  * machine's information. If the information matches, performs a resume by
995  * reading the saved image into the pig area and unpacking it.
996  */
997 void
998 hibernate_resume(void)
999 {
1000 	union hibernate_info hib;
1001 	int s;
1002 
1003 	/* Get current running machine's hibernate info */
1004 	memset(&hib, 0, sizeof(hib));
1005 	if (get_hibernate_info(&hib, 0)) {
1006 		DPRINTF("couldn't retrieve machine's hibernate info\n");
1007 		return;
1008 	}
1009 
1010 	/* Read hibernate info from disk */
1011 	s = splbio();
1012 
1013 	DPRINTF("reading hibernate signature block location: %lld\n",
1014 		hib.sig_offset);
1015 
1016 	if (hibernate_block_io(&hib,
1017 	    hib.sig_offset,
1018 	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
1019 		DPRINTF("error in hibernate read\n");
1020 		splx(s);
1021 		return;
1022 	}
1023 
1024 	/* Check magic number */
1025 	if (disk_hib.magic != HIBERNATE_MAGIC) {
1026 		DPRINTF("wrong magic number in hibernate signature: %x\n",
1027 			disk_hib.magic);
1028 		splx(s);
1029 		return;
1030 	}
1031 
1032 	/*
1033 	 * We (possibly) found a hibernate signature. Clear signature first,
1034 	 * to prevent accidental resume or endless resume cycles later.
1035 	 */
1036 	if (hibernate_clear_signature()) {
1037 		DPRINTF("error clearing hibernate signature block\n");
1038 		splx(s);
1039 		return;
1040 	}
1041 
1042 	/*
1043 	 * If on-disk and in-memory hibernate signatures match,
1044 	 * this means we should do a resume from hibernate.
1045 	 */
1046 	if (hibernate_compare_signature(&hib, &disk_hib)) {
1047 		DPRINTF("mismatched hibernate signature block\n");
1048 		splx(s);
1049 		return;
1050 	}
1051 
1052 #ifdef MULTIPROCESSOR
1053 	/* XXX - if we fail later, we may need to rehatch APs on some archs */
1054 	DPRINTF("hibernate: quiescing APs\n");
1055 	hibernate_quiesce_cpus();
1056 #endif /* MULTIPROCESSOR */
1057 
1058 	/* Read the image from disk into the image (pig) area */
1059 	if (hibernate_read_image(&disk_hib))
1060 		goto fail;
1061 
1062 	DPRINTF("hibernate: quiescing devices\n");
1063 	if (config_suspend_all(DVACT_QUIESCE) != 0)
1064 		goto fail;
1065 
1066 	(void) splhigh();
1067 	hibernate_disable_intr_machdep();
1068 	cold = 1;
1069 
1070 	DPRINTF("hibernate: suspending devices\n");
1071 	if (config_suspend_all(DVACT_SUSPEND) != 0) {
1072 		cold = 0;
1073 		hibernate_enable_intr_machdep();
1074 		goto fail;
1075 	}
1076 
1077 	printf("Unpacking image...\n");
1078 
1079 	/* Switch stacks */
1080 	DPRINTF("hibernate: switching stacks\n");
1081 	hibernate_switch_stack_machdep();
1082 
1083 #ifndef NO_PROPOLICE
1084 	/* Start using suspended kernel's propolice guard */
1085 	__guard_local = disk_hib.guard;
1086 #endif /* ! NO_PROPOLICE */
1087 
1088 	/* Unpack and resume */
1089 	hibernate_unpack_image(&disk_hib);
1090 
1091 fail:
1092 	splx(s);
1093 	printf("\nUnable to resume hibernated image\n");
1094 }
1095 
1096 /*
1097  * Unpack image from pig area to original location by looping through the
1098  * list of output chunks in the order they should be restored (fchunks).
1099  *
1100  * Note that due to the stack smash protector and the fact that we have
1101  * switched stacks, it is not permitted to return from this function.
1102  */
1103 void
1104 hibernate_unpack_image(union hibernate_info *hib)
1105 {
1106 	struct hibernate_disk_chunk *chunks;
1107 	union hibernate_info local_hib;
1108 	paddr_t image_cur = global_pig_start;
1109 	short i, *fchunks;
1110 	char *pva;
1111 
1112 	/* Piglet will be identity mapped (VA == PA) */
1113 	pva = (char *)hib->piglet_pa;
1114 
1115 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1116 
1117 	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);
1118 
1119 	/* Can't use hiber_info that's passed in after this point */
1120 	bcopy(hib, &local_hib, sizeof(union hibernate_info));
1121 
1122 	/* VA == PA */
1123 	local_hib.piglet_va = local_hib.piglet_pa;
1124 
1125 	/*
1126 	 * Point of no return. Once we pass this point, only kernel code can
1127 	 * be accessed. No global variables or other kernel data structures
1128 	 * are guaranteed to be coherent after unpack starts.
1129 	 *
1130 	 * The image is now in high memory (pig area), we unpack from the pig
1131 	 * to the correct location in memory. We'll eventually end up copying
1132 	 * on top of ourself, but we are assured the kernel code here is the
1133 	 * same between the hibernated and resuming kernel, and we are running
1134 	 * on our own stack, so the overwrite is ok.
1135 	 */
1136 	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
1137 	hibernate_activate_resume_pt_machdep();
1138 
1139 	for (i = 0; i < local_hib.chunk_ctr; i++) {
1140 		/* Reset zlib for inflate */
1141 		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
1142 			panic("hibernate failed to reset zlib for inflate");
1143 
1144 		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
1145 		    image_cur);
1146 
1147 		image_cur += chunks[fchunks[i]].compressed_size;
1148 
1149 	}
1150 
1151 	/*
1152 	 * Resume the loaded kernel by jumping to the MD resume vector.
1153 	 * We won't be returning from this call.
1154 	 */
1155 	hibernate_resume_machdep();
1156 }
1157 
1158 /*
1159  * Bounce a compressed image chunk to the piglet, entering mappings for the
1160  * copied pages as needed
1161  */
1162 void
1163 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1164 {
1165 	size_t ct, ofs;
1166 	paddr_t src = img_cur;
1167 	vaddr_t dest = piglet;
1168 
1169 	/* Copy first partial page */
1170 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1171 	ofs = (src & PAGE_MASK);
1172 
1173 	if (ct < PAGE_SIZE) {
1174 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1175 			(src - ofs), 0);
1176 		hibernate_flush();
1177 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1178 		src += ct;
1179 		dest += ct;
1180 	}
1181 
1182 	/* Copy remaining pages */
1183 	while (src < size + img_cur) {
1184 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1185 		hibernate_flush();
1186 		ct = PAGE_SIZE;
1187 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1188 		hibernate_flush();
1189 		src += ct;
1190 		dest += ct;
1191 	}
1192 }
1193 
1194 /*
1195  * Process a chunk by bouncing it to the piglet, followed by unpacking
1196  */
1197 void
1198 hibernate_process_chunk(union hibernate_info *hib,
1199     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1200 {
1201 	char *pva = (char *)hib->piglet_va;
1202 
1203 	hibernate_copy_chunk_to_piglet(img_cur,
1204 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1205 	hibernate_inflate_region(hib, chunk->base,
1206 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1207 	    chunk->compressed_size);
1208 }
1209 
1210 /*
1211  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1212  * inaddr and range_end.
1213  */
1214 int
1215 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1216 {
1217 	int rle;
1218 
1219 	rle = uvm_page_rle(inaddr);
1220 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1221 
1222 	/* Clamp RLE to range end */
1223 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1224 		rle = (range_end - inaddr) / PAGE_SIZE;
1225 
1226 	return (rle);
1227 }
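
/*
 * Worked example (illustrative): if 10 pages starting at inaddr are free
 * but range_end is only 8 pages away, the clamp above yields
 *
 *	rle = (range_end - inaddr) / PAGE_SIZE = 8
 *
 * so an RLE run never extends past the chunk currently being written.
 */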
1228 
1229 /*
1230  * Write the RLE count for the page at 'inaddr' to the output stream.
1231  * Returns the number of pages to be skipped at 'inaddr'.
1232  */
1233 int
1234 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1235 	paddr_t range_end, daddr_t *blkctr,
1236 	size_t *out_remaining)
1237 {
1238 	int rle, err, *rleloc;
1239 	struct hibernate_zlib_state *hibernate_state;
1240 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1241 
1242 	hibernate_state =
1243 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1244 
1245 	rle = hibernate_calc_rle(inaddr, range_end);
1246 
1247 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1248 	*rleloc = rle;
1249 
1250 	/* Deflate the RLE count into the stream */
1251 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1252 
1253 	/* Did we fill the output page? If so, flush to disk */
1254 	if (*out_remaining == 0) {
1255 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1256 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1257 			hib->io_page))) {
1258 				DPRINTF("hib write error %d\n", err);
1259 				return (err);
1260 		}
1261 
1262 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1263 		*out_remaining = PAGE_SIZE;
1264 
1265 		/* If we didn't deflate the entire RLE count, finish it now */
1266 		if (hibernate_state->hib_stream.avail_in != 0)
1267 			hibernate_deflate(hib,
1268 				(vaddr_t)hibernate_state->hib_stream.next_in,
1269 				out_remaining);
1270 	}
1271 
1272 	return (rle);
1273 }
1274 
1275 /*
1276  * Write a compressed version of this machine's memory to disk, at the
1277  * precalculated swap offset:
1278  *
1279  * end of swap - signature block size - chunk table size - memory size
1280  *
1281  * The function begins by looping through each phys mem range, cutting each
1282  * one into MD sized chunks. These chunks are then compressed individually
1283  * and written out to disk, in phys mem order. Some chunks might compress
1284  * more than others, and for this reason, each chunk's size is recorded
1285  * in the chunk table, which is written to disk after the image has
1286  * properly been compressed and written (in hibernate_write_chunktable).
1287  *
1288  * When this function is called, the machine is nearly suspended - most
1289  * devices are quiesced/suspended, interrupts are off, and cold has
1290  * been set. This means that there can be no side effects once the
1291  * write has started, and the write function itself can also have no
1292  * side effects. This also means no printfs are permitted (since printf
1293  * has side effects.)
1294  *
1295  * Return values :
1296  *
1297  * 0      - success
1298  * EIO    - I/O error occurred writing the chunks
1299  * EINVAL - Failed to write a complete range
1300  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1301  */
1302 int
1303 hibernate_write_chunks(union hibernate_info *hib)
1304 {
1305 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1306 	size_t nblocks, out_remaining, used;
1307 	struct hibernate_disk_chunk *chunks;
1308 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1309 	daddr_t blkctr = 0;
1310 	int i, rle, err;
1311 	struct hibernate_zlib_state *hibernate_state;
1312 
1313 	hibernate_state =
1314 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1315 
1316 	hib->chunk_ctr = 0;
1317 
1318 	/*
1319 	 * Map the utility VAs to the piglet. See the piglet map at the
1320 	 * top of this file for piglet layout information.
1321 	 */
1322 	hibernate_copy_page = hib->piglet_va + 3 * PAGE_SIZE;
1323 	hibernate_rle_page = hib->piglet_va + 28 * PAGE_SIZE;
1324 
1325 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1326 	    HIBERNATE_CHUNK_SIZE);
1327 
1328 	/* Calculate the chunk regions */
1329 	for (i = 0; i < hib->nranges; i++) {
1330 		range_base = hib->ranges[i].base;
1331 		range_end = hib->ranges[i].end;
1332 
1333 		inaddr = range_base;
1334 
1335 		while (inaddr < range_end) {
1336 			chunks[hib->chunk_ctr].base = inaddr;
1337 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1338 				chunks[hib->chunk_ctr].end = inaddr +
1339 				    HIBERNATE_CHUNK_SIZE;
1340 			else
1341 				chunks[hib->chunk_ctr].end = range_end;
1342 
1343 			inaddr += HIBERNATE_CHUNK_SIZE;
1344 			hib->chunk_ctr ++;
1345 		}
1346 	}
1347 
1348 	uvm_pmr_dirty_everything();
1349 	uvm_pmr_zero_everything();
1350 
1351 	/* Compress and write the chunks in the chunktable */
1352 	for (i = 0; i < hib->chunk_ctr; i++) {
1353 		range_base = chunks[i].base;
1354 		range_end = chunks[i].end;
1355 
1356 		chunks[i].offset = blkctr + hib->image_offset;
1357 
1358 		/* Reset zlib for deflate */
1359 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1360 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1361 			return (ENOMEM);
1362 		}
1363 
1364 		inaddr = range_base;
1365 
1366 		/*
1367 		 * For each range, loop through its phys mem region
1368 		 * and write out the chunks (the last chunk might be
1369 		 * smaller than the chunk size).
1370 		 */
1371 		while (inaddr < range_end) {
1372 			out_remaining = PAGE_SIZE;
1373 			while (out_remaining > 0 && inaddr < range_end) {
1374 				/*
1375 				 * Adjust for regions that are not evenly
1376 				 * divisible by PAGE_SIZE or overflowed
1377 				 * pages from the previous iteration.
1378 				 */
1379 				temp_inaddr = (inaddr & PAGE_MASK) +
1380 				    hibernate_copy_page;
1381 
1382 				/* Deflate from temp_inaddr to IO page */
1383 				if (inaddr != range_end) {
1384 					if (inaddr % PAGE_SIZE == 0) {
1385 						rle = hibernate_write_rle(hib,
1386 							inaddr,
1387 							range_end,
1388 							&blkctr,
1389 							&out_remaining);
1390 					}
1391 
1392 					if (rle == 0) {
1393 						pmap_kenter_pa(hibernate_temp_page,
1394 							inaddr & PMAP_PA_MASK,
1395 							PROT_READ);
1396 
1397 						pmap_activate(curproc);
1398 
1399 						bcopy((caddr_t)hibernate_temp_page,
1400 							(caddr_t)hibernate_copy_page,
1401 							PAGE_SIZE);
1402 						inaddr += hibernate_deflate(hib,
1403 							temp_inaddr,
1404 							&out_remaining);
1405 					} else {
1406 						inaddr += rle * PAGE_SIZE;
1407 						if (inaddr > range_end)
1408 							inaddr = range_end;
1409 					}
1410 
1411 				}
1412 
1413 				if (out_remaining == 0) {
1414 					/* Filled up the page */
1415 					nblocks = PAGE_SIZE / DEV_BSIZE;
1416 
1417 					if ((err = hib->io_func(hib->dev,
1418 					    blkctr + hib->image_offset,
1419 					    (vaddr_t)hibernate_io_page,
1420 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1421 						DPRINTF("hib write error %d\n",
1422 						    err);
1423 						return (err);
1424 					}
1425 
1426 					blkctr += nblocks;
1427 				}
1428 			}
1429 		}
1430 
1431 		if (inaddr != range_end) {
1432 			DPRINTF("deflate range ended prematurely\n");
1433 			return (EINVAL);
1434 		}
1435 
1436 		/*
1437 		 * End of range. Round up to the next secsize boundary
1438 		 * after finishing the compression.
1439 		 */
1440 		if (out_remaining == 0)
1441 			out_remaining = PAGE_SIZE;
1442 
1443 		/* Finish compress */
1444 		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
1445 		hibernate_state->hib_stream.avail_in = 0;
1446 		hibernate_state->hib_stream.next_out =
1447 		    (unsigned char *)hibernate_io_page +
1448 			(PAGE_SIZE - out_remaining);
1449 
1450 		/* We have an extra output page available for finalize */
1451 		hibernate_state->hib_stream.avail_out =
1452 			out_remaining + PAGE_SIZE;
1453 
1454 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1455 		    Z_STREAM_END) {
1456 			DPRINTF("deflate error in output stream: %d\n", err);
1457 			return (err);
1458 		}
1459 
1460 		out_remaining = hibernate_state->hib_stream.avail_out;
1461 
1462 		used = 2 * PAGE_SIZE - out_remaining;
1463 		nblocks = used / DEV_BSIZE;
1464 
1465 		/* Round up to next block if needed */
1466 		if (used % DEV_BSIZE != 0)
1467 			nblocks ++;
1468 
1469 		/* Write final block(s) for this chunk */
1470 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1471 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1472 		    HIB_W, hib->io_page))) {
1473 			DPRINTF("hib final write error %d\n", err);
1474 			return (err);
1475 		}
1476 
1477 		blkctr += nblocks;
1478 
1479 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1480 		    chunks[i].offset) * DEV_BSIZE;
1481 	}
1482 
1483 	hib->chunktable_offset = hib->image_offset + blkctr;
1484 	return (0);
1485 }
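
/*
 * Resulting layout in swap, relative to the start of the partition (a
 * sketch; block numbers are illustrative):
 *
 *	image_offset					compressed chunks (blkctr blocks)
 *	chunktable_offset = image_offset + blkctr	chunk table
 *	sig_offset					signature block (last block of swap)
 *
 * hibernate_write_chunktable() and hibernate_write_signature() above write
 * the last two pieces.
 */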
1486 
1487 /*
1488  * Reset the zlib stream state and allocate a new hiballoc area for either
1489  * inflate or deflate. This function is called once for each hibernate chunk.
1490  * Calling hiballoc_init multiple times is acceptable since the memory it is
1491  * provided is unmanaged memory (stolen). We use the memory provided to us
1492  * by the piglet allocated via the supplied hib.
1493  */
1494 int
1495 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1496 {
1497 	vaddr_t hibernate_zlib_start;
1498 	size_t hibernate_zlib_size;
1499 	char *pva = (char *)hib->piglet_va;
1500 	struct hibernate_zlib_state *hibernate_state;
1501 
1502 	hibernate_state =
1503 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1504 
1505 	if (!deflate)
1506 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1507 
1508 	/*
1509 	 * See piglet layout information at the start of this file for
1510 	 * information on the zlib page assignments.
1511 	 */
1512 	hibernate_zlib_start = (vaddr_t)(pva + (29 * PAGE_SIZE));
1513 	hibernate_zlib_size = 80 * PAGE_SIZE;
1514 
1515 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1516 	memset(hibernate_state, 0, PAGE_SIZE);
1517 
1518 	/* Set up stream structure */
1519 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1520 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1521 
1522 	/* Initialize the hiballoc arena for zlib allocs/frees */
1523 	hiballoc_init(&hibernate_state->hiballoc_arena,
1524 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1525 
1526 	if (deflate) {
1527 		return deflateInit(&hibernate_state->hib_stream,
1528 		    Z_BEST_SPEED);
1529 	} else
1530 		return inflateInit(&hibernate_state->hib_stream);
1531 }
1532 
1533 /*
1534  * Reads the hibernated memory image from disk, whose location and
1535  * size are recorded in hib. Begin by reading the persisted
1536  * chunk table, which records the original chunk placement location
1537  * and compressed size for each. Next, allocate a pig region of
1538  * sufficient size to hold the compressed image. Next, read the
1539  * chunks into the pig area (calling hibernate_read_chunks to do this),
1540  * and finally, if all of the above succeeds, clear the hibernate signature.
1541  * The function will then return to hibernate_resume, which will proceed
1542  * to unpack the pig image to the correct place in memory.
1543  */
1544 int
1545 hibernate_read_image(union hibernate_info *hib)
1546 {
1547 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1548 	paddr_t image_start, image_end, pig_start, pig_end;
1549 	struct hibernate_disk_chunk *chunks;
1550 	daddr_t blkctr;
1551 	vaddr_t chunktable = (vaddr_t)NULL;
1552 	paddr_t piglet_chunktable = hib->piglet_pa +
1553 	    HIBERNATE_CHUNK_SIZE;
1554 	int i, status;
1555 
1556 	status = 0;
1557 	pmap_activate(curproc);
1558 
1559 	/* Calculate total chunk table size in disk blocks */
1560 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1561 
1562 	blkctr = hib->chunktable_offset;
1563 
1564 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1565 	    &kp_none, &kd_nowait);
1566 
1567 	if (!chunktable)
1568 		return (1);
1569 
1570 	/* Map chunktable pages */
1571 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1572 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1573 		    PROT_READ | PROT_WRITE);
1574 	pmap_update(pmap_kernel());
1575 
1576 	/* Read the chunktable from disk into the piglet chunktable */
1577 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1578 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1579 		hibernate_block_io(hib, blkctr, MAXPHYS,
1580 		    chunktable + i, 0);
1581 
1582 	blkctr = hib->image_offset;
1583 	compressed_size = 0;
1584 
1585 	chunks = (struct hibernate_disk_chunk *)chunktable;
1586 
1587 	for (i = 0; i < hib->chunk_ctr; i++)
1588 		compressed_size += chunks[i].compressed_size;
1589 
1590 	disk_size = compressed_size;
1591 
1592 	printf("unhibernating @ block %lld length %lu bytes\n",
1593 	    hib->sig_offset - chunktable_size,
1594 	    compressed_size);
1595 
1596 	/* Allocate the pig area */
1597 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1598 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1599 		status = 1;
1600 		goto unmap;
1601 	}
1602 
1603 	pig_end = pig_start + pig_sz;
1604 
1605 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1606 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1607 	image_start = image_end - disk_size;
1608 
1609 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1610 	    chunks);
1611 
1612 	/* Prepare the resume time pmap/page table */
1613 	hibernate_populate_resume_pt(hib, image_start, image_end);
1614 
1615 unmap:
1616 	/* Unmap chunktable pages */
1617 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1618 	pmap_update(pmap_kernel());
1619 
1620 	return (status);
1621 }
1622 
1623 /*
1624  * Read the hibernated memory chunks from disk (chunk information at this
1625  * point is stored in the piglet) into the pig area specified by
1626  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1627  * only chunk with overlap possibilities.
1628  */
1629 int
1630 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1631     paddr_t pig_end, size_t image_compr_size,
1632     struct hibernate_disk_chunk *chunks)
1633 {
1634 	paddr_t img_cur, piglet_base;
1635 	daddr_t blkctr;
1636 	size_t processed, compressed_size, read_size;
1637 	int nchunks, nfchunks, num_io_pages;
1638 	vaddr_t tempva, hibernate_fchunk_area;
1639 	short *fchunks, i, j;
1640 
1641 	tempva = (vaddr_t)NULL;
1642 	hibernate_fchunk_area = (vaddr_t)NULL;
1643 	nfchunks = 0;
1644 	piglet_base = hib->piglet_pa;
1645 	global_pig_start = pig_start;
1646 
1647 	/*
1648 	 * These mappings go into the resuming kernel's page table, and are
1649 	 * used only during image read. They dissappear from existence
1650 	 * used only during image read. They disappear from existence
1651 	 */
1652 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1653 		&kd_nowait);
1654 	if (!tempva)
1655 		return (1);
1656 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1657 	    &kp_none, &kd_nowait);
1658 	if (!hibernate_fchunk_area)
1659 		return (1);
1660 
1661 	/* Final output chunk ordering VA */
1662 	fchunks = (short *)hibernate_fchunk_area;
1663 
1664 	/* Map the chunk ordering region */
1665 	for(i = 0; i < 24 ; i++)
1666 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1667 			piglet_base + ((4 + i) * PAGE_SIZE),
1668 			PROT_READ | PROT_WRITE);
1669 	pmap_update(pmap_kernel());
1670 
1671 	nchunks = hib->chunk_ctr;
1672 
1673 	/* Initially start all chunks as unplaced */
1674 	for (i = 0; i < nchunks; i++)
1675 		chunks[i].flags = 0;
1676 
1677 	/*
1678 	 * Search the list for chunks that are outside the pig area. These
1679 	 * can be placed first in the final output list.
1680 	 */
1681 	for (i = 0; i < nchunks; i++) {
1682 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1683 			fchunks[nfchunks] = i;
1684 			nfchunks++;
1685 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1686 		}
1687 	}
1688 
1689 	/*
1690 	 * Walk the ordering, place the chunks in ascending memory order.
1691 	 */
1692 	for (i = 0; i < nchunks; i++) {
1693 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1694 			fchunks[nfchunks] = i;
1695 			nfchunks++;
1696 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1697 		}
1698 	}
1699 
1700 	img_cur = pig_start;
1701 
1702 	for (i = 0; i < nfchunks; i++) {
1703 		blkctr = chunks[fchunks[i]].offset;
1704 		processed = 0;
1705 		compressed_size = chunks[fchunks[i]].compressed_size;
1706 
1707 		while (processed < compressed_size) {
1708 			if (compressed_size - processed >= MAXPHYS)
1709 				read_size = MAXPHYS;
1710 			else
1711 				read_size = compressed_size - processed;
1712 
1713 			/*
1714 			 * We're reading read_size bytes, offset from the
1715 			 * start of a page by img_cur % PAGE_SIZE, so the
1716 			 * end will be read_size + (img_cur % PAGE_SIZE)
1717 			 * from the start of the first page.  Round that
1718 			 * up to the next page size.
1719 			 */
1720 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1721 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1722 
1723 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1724 
1725 			/* Map pages for this read */
1726 			for (j = 0; j < num_io_pages; j ++)
1727 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1728 				    img_cur + j * PAGE_SIZE,
1729 				    PROT_READ | PROT_WRITE);
1730 
1731 			pmap_update(pmap_kernel());
1732 
1733 			hibernate_block_io(hib, blkctr, read_size,
1734 			    tempva + (img_cur & PAGE_MASK), 0);
1735 
1736 			blkctr += (read_size / DEV_BSIZE);
1737 
1738 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1739 			pmap_update(pmap_kernel());
1740 
1741 			processed += read_size;
1742 			img_cur += read_size;
1743 		}
1744 	}
1745 
1746 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1747 	pmap_update(pmap_kernel());
1748 
1749 	return (0);
1750 }
1751 
1752 /*
1753  * Hibernating a machine comprises the following operations:
1754  *  1. Calculating this machine's hibernate_info information
1755  *  2. Allocating a piglet and saving the piglet's physaddr
1756  *  3. Calculating the memory chunks
1757  *  4. Writing the compressed chunks to disk
1758  *  5. Writing the chunk table
1759  *  6. Writing the signature block (hibernate_info)
1760  *
1761  * On most architectures, the function calling hibernate_suspend would
1762  * then power off the machine using some MD-specific implementation.
1763  */
1764 int
1765 hibernate_suspend(void)
1766 {
1767 	union hibernate_info hib;
1768 	u_long start, end;
1769 
1770 	/*
1771 	 * Calculate memory ranges, swap offsets, etc.
1772 	 * This also copies the piglet addresses (allocated earlier by
1773 	 * hibernate_alloc()) into hib.piglet_pa and hib.piglet_va.
1774 	 */
1775 	if (get_hibernate_info(&hib, 1)) {
1776 		DPRINTF("failed to obtain hibernate info\n");
1777 		return (1);
1778 	}
1779 
1780 	/* Find a page-addressed region in swap [start,end] */
1781 	if (uvm_hibswap(hib.dev, &start, &end)) {
1782 		printf("hibernate: cannot find any swap\n");
1783 		return (1);
1784 	}
1785 
1786 	if (end - start < 1000) {
1787 		printf("hibernate: insufficient swap (%lu is too small)\n",
1788 			end - start);
1789 		return (1);
1790 	}
1791 
1792 	/* Calculate block offsets in swap */
1793 	hib.image_offset = ctod(start);
1794 
1795 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
1796 	    hib.image_offset, ctod(end) - ctod(start));
1797 
1798 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1799 		PROT_READ | PROT_WRITE);
1800 	pmap_activate(curproc);
1801 
1802 	DPRINTF("hibernate: writing chunks\n");
1803 	if (hibernate_write_chunks(&hib)) {
1804 		DPRINTF("hibernate_write_chunks failed\n");
1805 		goto fail;
1806 	}
1807 
1808 	DPRINTF("hibernate: writing chunktable\n");
1809 	if (hibernate_write_chunktable(&hib)) {
1810 		DPRINTF("hibernate_write_chunktable failed\n");
1811 		goto fail;
1812 	}
1813 
1814 	DPRINTF("hibernate: writing signature\n");
1815 	if (hibernate_write_signature(&hib)) {
1816 		DPRINTF("hibernate_write_signature failed\n");
1817 		goto fail;
1818 	}
1819 
1820 	/* Allow the disk to settle */
1821 	delay(500000);
1822 
1823 	/*
1824 	 * Give the device-specific I/O function a notification that we're
1825 	 * done, and that it can clean up or shutdown as needed.
1826 	 */
1827 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
1828 
1829 	return (0);
1830 fail:
1831 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
1832 	pmap_update(pmap_kernel());
1833 	return (1);
1834 }
1835 
1836 int
1837 hibernate_alloc(void)
1838 {
1839 	KASSERT(global_piglet_va == 0);
1840 	KASSERT(hibernate_temp_page == 0);
1841 
1842 	/* Allocate a piglet, store its addresses in the supplied globals */
1843 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
1844 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
1845 		return (ENOMEM);
1846 
1847 	/*
1848 	 * Allocate VA for the temp page.
1849 	 *
1850 	 * This will become part of the suspended kernel and will
1851 	 * be freed in hibernate_free, upon resume (or hibernate
1852 	 * failure)
1853 	 */
1854 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1855 	    &kp_none, &kd_nowait);
1856 	if (!hibernate_temp_page) {
1857 		DPRINTF("out of memory allocating hibernate_temp_page\n");
1858 		return (ENOMEM);
1859 	}
1860 
1861 	return (0);
1862 }
1863 
1864 /*
1865  * Free items allocated by hibernate_alloc()
1866  */
1867 void
1868 hibernate_free(void)
1869 {
1870 	if (global_piglet_va)
1871 		uvm_pmr_free_piglet(global_piglet_va,
1872 		    4 * HIBERNATE_CHUNK_SIZE);
1873 
1874 	if (hibernate_temp_page) {
1875 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1876 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1877 		    &kv_any, &kp_none);
1878 	}
1879 
1880 	global_piglet_va = 0;
1881 	hibernate_temp_page = 0;
1882 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
1883 	pmap_update(pmap_kernel());
1884 }
1885