xref: /openbsd-src/sys/kern/subr_hibernate.c (revision 21a11680b417182fc15040eddb56a35e90b63915)
1 /*	$OpenBSD: subr_hibernate.c,v 1.130 2022/01/04 18:13:31 guenther Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <sys/atomic.h>
32 
33 #include <uvm/uvm.h>
34 #include <uvm/uvm_swap.h>
35 
36 #include <machine/hibernate.h>
37 
38 /*
39  * Hibernate piglet layout information
40  *
41  * The piglet is a scratch area of memory allocated by the suspending kernel.
42  * Its phys and virt addrs are recorded in the signature block. The piglet is
43  * used to guarantee an unused area of memory that can be used by the resuming
44  * kernel for various things. The piglet is excluded during unpack operations.
45  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
46  *
47  * Offset from piglet_base	Purpose
48  * ----------------------------------------------------------------------------
49  * 0				Private page for suspend I/O write functions
50  * 1*PAGE_SIZE			I/O page used during hibernate suspend
51  * 2*PAGE_SIZE			I/O page used during hibernate suspend
52  * 3*PAGE_SIZE			copy page used during hibernate suspend
53  * 4*PAGE_SIZE			final chunk ordering list (24 pages)
54  * 28*PAGE_SIZE			RLE utility page
55  * 29*PAGE_SIZE			preserved entropy
56  * 30*PAGE_SIZE			start of hiballoc area
57  * 110*PAGE_SIZE		end of hiballoc area (80 pages)
58  * 366*PAGE_SIZE		end of retguard preservation region (256 pages)
59  * ...				unused
60  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
61  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
62  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
63  */
64 
65 /* Temporary vaddr ranges used during hibernate */
66 vaddr_t hibernate_temp_page;
67 vaddr_t hibernate_copy_page;
68 vaddr_t hibernate_rle_page;
69 
70 /* Hibernate info as read from disk during resume */
71 union hibernate_info disk_hib;
72 
73 /*
74  * Global copy of the pig start address. This needs to be a global as we
75  * switch stacks after computing it - it can't be stored on the stack.
76  */
77 paddr_t global_pig_start;
78 
79 /*
80  * Global copies of the piglet start addresses (PA/VA). We store these
81  * as globals to avoid having to carry them around as parameters, as the
82  * piglet is allocated early and freed late - its lifecycle extends beyond
83  * that of the hibernate info union which is calculated on suspend/resume.
84  */
85 vaddr_t global_piglet_va;
86 paddr_t global_piglet_pa;
87 
/*
 * Debug output helpers. Define HIB_DEBUG to enable them; without it both
 * macros expand to nothing.
 *
 * DPRINTF prints whenever hib_debug is non-zero, DNPRINTF only when
 * hib_debug is greater than the level given as first argument.
 */
/* #define HIB_DEBUG */
#ifndef HIB_DEBUG
#define DPRINTF(x...)
#define DNPRINTF(n,x...)
#else
int	hib_debug = 99;
#define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
#define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
#endif
97 
98 #ifndef NO_PROPOLICE
99 extern long __guard_local;
100 #endif /* ! NO_PROPOLICE */
101 
102 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
103 int hibernate_calc_rle(paddr_t, paddr_t);
104 int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
105 	size_t *);
106 
107 #define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)
108 
/* Every hiballoc allocation is aligned to this many bytes. */
#define HIB_ALIGN		8

/* sizeof(_type), rounded up to the next multiple of HIB_ALIGN. */
#define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
118 
119 struct hiballoc_entry {
120 	size_t			hibe_use;
121 	size_t			hibe_space;
122 	RBT_ENTRY(hiballoc_entry) hibe_entry;
123 };
124 
125 /*
126  * Sort hibernate memory ranges by ascending PA
127  */
128 void
129 hibernate_sort_ranges(union hibernate_info *hib_info)
130 {
131 	int i, j;
132 	struct hibernate_memory_range *ranges;
133 	paddr_t base, end;
134 
135 	ranges = hib_info->ranges;
136 
137 	for (i = 1; i < hib_info->nranges; i++) {
138 		j = i;
139 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
140 			base = ranges[j].base;
141 			end = ranges[j].end;
142 			ranges[j].base = ranges[j - 1].base;
143 			ranges[j].end = ranges[j - 1].end;
144 			ranges[j - 1].base = base;
145 			ranges[j - 1].end = end;
146 			j--;
147 		}
148 	}
149 }
150 
151 /*
152  * Compare hiballoc entries based on the address they manage.
153  *
154  * Since the address is fixed, relative to struct hiballoc_entry,
155  * we just compare the hiballoc_entry pointers.
156  */
157 static __inline int
158 hibe_cmp(const struct hiballoc_entry *l, const struct hiballoc_entry *r)
159 {
160 	vaddr_t vl = (vaddr_t)l;
161 	vaddr_t vr = (vaddr_t)r;
162 
163 	return vl < vr ? -1 : (vl > vr);
164 }
165 
166 RBT_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
167 
168 /*
169  * Given a hiballoc entry, return the address it manages.
170  */
171 static __inline void *
172 hib_entry_to_addr(struct hiballoc_entry *entry)
173 {
174 	caddr_t addr;
175 
176 	addr = (caddr_t)entry;
177 	addr += HIB_SIZEOF(struct hiballoc_entry);
178 	return addr;
179 }
180 
181 /*
182  * Given an address, find the hiballoc that corresponds.
183  */
184 static __inline struct hiballoc_entry*
185 hib_addr_to_entry(void *addr_param)
186 {
187 	caddr_t addr;
188 
189 	addr = (caddr_t)addr_param;
190 	addr -= HIB_SIZEOF(struct hiballoc_entry);
191 	return (struct hiballoc_entry*)addr;
192 }
193 
194 RBT_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp);
195 
196 /*
197  * Allocate memory from the arena.
198  *
199  * Returns NULL if no memory is available.
200  */
201 void *
202 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
203 {
204 	struct hiballoc_entry *entry, *new_entry;
205 	size_t find_sz;
206 
207 	/*
208 	 * Enforce alignment of HIB_ALIGN bytes.
209 	 *
210 	 * Note that, because the entry is put in front of the allocation,
211 	 * 0-byte allocations are guaranteed a unique address.
212 	 */
213 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
214 
215 	/*
216 	 * Find an entry with hibe_space >= find_sz.
217 	 *
218 	 * If the root node is not large enough, we switch to tree traversal.
219 	 * Because all entries are made at the bottom of the free space,
220 	 * traversal from the end has a slightly better chance of yielding
221 	 * a sufficiently large space.
222 	 */
223 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
224 	entry = RBT_ROOT(hiballoc_addr, &arena->hib_addrs);
225 	if (entry != NULL && entry->hibe_space < find_sz) {
226 		RBT_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
227 			if (entry->hibe_space >= find_sz)
228 				break;
229 		}
230 	}
231 
232 	/*
233 	 * Insufficient or too fragmented memory.
234 	 */
235 	if (entry == NULL)
236 		return NULL;
237 
238 	/*
239 	 * Create new entry in allocated space.
240 	 */
241 	new_entry = (struct hiballoc_entry*)(
242 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
243 	new_entry->hibe_space = entry->hibe_space - find_sz;
244 	new_entry->hibe_use = alloc_sz;
245 
246 	/*
247 	 * Insert entry.
248 	 */
249 	if (RBT_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
250 		panic("hib_alloc: insert failure");
251 	entry->hibe_space = 0;
252 
253 	/* Return address managed by entry. */
254 	return hib_entry_to_addr(new_entry);
255 }
256 
257 void
258 hib_getentropy(char **bufp, size_t *bufplen)
259 {
260 	if (!bufp || !bufplen)
261 		return;
262 
263 	*bufp = (char *)(global_piglet_va + (29 * PAGE_SIZE));
264 	*bufplen = PAGE_SIZE;
265 }
266 
267 /*
268  * Free a pointer previously allocated from this arena.
269  *
270  * If addr is NULL, this will be silently accepted.
271  */
272 void
273 hib_free(struct hiballoc_arena *arena, void *addr)
274 {
275 	struct hiballoc_entry *entry, *prev;
276 
277 	if (addr == NULL)
278 		return;
279 
280 	/*
281 	 * Derive entry from addr and check it is really in this arena.
282 	 */
283 	entry = hib_addr_to_entry(addr);
284 	if (RBT_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
285 		panic("hib_free: freed item %p not in hib arena", addr);
286 
287 	/*
288 	 * Give the space in entry to its predecessor.
289 	 *
290 	 * If entry has no predecessor, change its used space into free space
291 	 * instead.
292 	 */
293 	prev = RBT_PREV(hiballoc_addr, entry);
294 	if (prev != NULL &&
295 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
296 	    prev->hibe_use + prev->hibe_space) == entry) {
297 		/* Merge entry. */
298 		RBT_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
299 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
300 		    entry->hibe_use + entry->hibe_space;
301 	} else {
302 		/* Flip used memory to free space. */
303 		entry->hibe_space += entry->hibe_use;
304 		entry->hibe_use = 0;
305 	}
306 }
307 
308 /*
309  * Initialize hiballoc.
310  *
311  * The allocator will manage memory at ptr, which is len bytes.
312  */
313 int
314 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
315 {
316 	struct hiballoc_entry *entry;
317 	caddr_t ptr;
318 	size_t len;
319 
320 	RBT_INIT(hiballoc_addr, &arena->hib_addrs);
321 
322 	/*
323 	 * Hib allocator enforces HIB_ALIGN alignment.
324 	 * Fixup ptr and len.
325 	 */
326 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
327 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
328 	len &= ~((size_t)HIB_ALIGN - 1);
329 
330 	/*
331 	 * Insufficient memory to be able to allocate and also do bookkeeping.
332 	 */
333 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
334 		return ENOMEM;
335 
336 	/*
337 	 * Create entry describing space.
338 	 */
339 	entry = (struct hiballoc_entry*)ptr;
340 	entry->hibe_use = 0;
341 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
342 	RBT_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
343 
344 	return 0;
345 }
346 
347 /*
348  * Zero all free memory.
349  */
350 void
351 uvm_pmr_zero_everything(void)
352 {
353 	struct uvm_pmemrange	*pmr;
354 	struct vm_page		*pg;
355 	int			 i;
356 
357 	uvm_lock_fpageq();
358 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
359 		/* Zero single pages. */
360 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
361 		    != NULL) {
362 			uvm_pmr_remove(pmr, pg);
363 			uvm_pagezero(pg);
364 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
365 			uvmexp.zeropages++;
366 			uvm_pmr_insert(pmr, pg, 0);
367 		}
368 
369 		/* Zero multi page ranges. */
370 		while ((pg = RBT_ROOT(uvm_pmr_size,
371 		    &pmr->size[UVM_PMR_MEMTYPE_DIRTY])) != NULL) {
372 			pg--; /* Size tree always has second page. */
373 			uvm_pmr_remove(pmr, pg);
374 			for (i = 0; i < pg->fpgsz; i++) {
375 				uvm_pagezero(&pg[i]);
376 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
377 				uvmexp.zeropages++;
378 			}
379 			uvm_pmr_insert(pmr, pg, 0);
380 		}
381 	}
382 	uvm_unlock_fpageq();
383 }
384 
385 /*
386  * Mark all memory as dirty.
387  *
388  * Used to inform the system that the clean memory isn't clean for some
389  * reason, for example because we just came back from hibernate.
390  */
391 void
392 uvm_pmr_dirty_everything(void)
393 {
394 	struct uvm_pmemrange	*pmr;
395 	struct vm_page		*pg;
396 	int			 i;
397 
398 	uvm_lock_fpageq();
399 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
400 		/* Dirty single pages. */
401 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
402 		    != NULL) {
403 			uvm_pmr_remove(pmr, pg);
404 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
405 			uvm_pmr_insert(pmr, pg, 0);
406 		}
407 
408 		/* Dirty multi page ranges. */
409 		while ((pg = RBT_ROOT(uvm_pmr_size,
410 		    &pmr->size[UVM_PMR_MEMTYPE_ZERO])) != NULL) {
411 			pg--; /* Size tree always has second page. */
412 			uvm_pmr_remove(pmr, pg);
413 			for (i = 0; i < pg->fpgsz; i++)
414 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
415 			uvm_pmr_insert(pmr, pg, 0);
416 		}
417 	}
418 
419 	uvmexp.zeropages = 0;
420 	uvm_unlock_fpageq();
421 }
422 
423 /*
424  * Allocate an area that can hold sz bytes and doesn't overlap with
425  * the piglet at piglet_pa.
426  */
427 int
428 uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
429 {
430 	struct uvm_constraint_range pig_constraint;
431 	struct kmem_pa_mode kp_pig = {
432 		.kp_constraint = &pig_constraint,
433 		.kp_maxseg = 1
434 	};
435 	vaddr_t va;
436 
437 	sz = round_page(sz);
438 
439 	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
440 	pig_constraint.ucr_high = -1;
441 
442 	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
443 	if (va == 0) {
444 		pig_constraint.ucr_low = 0;
445 		pig_constraint.ucr_high = piglet_pa - 1;
446 
447 		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
448 		if (va == 0)
449 			return ENOMEM;
450 	}
451 
452 	pmap_extract(pmap_kernel(), va, pa);
453 	return 0;
454 }
455 
456 /*
457  * Allocate a piglet area.
458  *
459  * This needs to be in DMA-safe memory.
460  * Piglets are aligned.
461  *
462  * sz and align in bytes.
463  *
464  * The call will sleep for the pagedaemon to attempt to free memory.
465  * The pagedaemon may decide its not possible to free enough memory, causing
466  * the allocation to fail.
467  */
468 int
469 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
470 {
471 	struct kmem_pa_mode kp_piglet = {
472 		.kp_constraint = &dma_constraint,
473 		.kp_align = align,
474 		.kp_maxseg = 1
475 	};
476 
477 	/* Ensure align is a power of 2 */
478 	KASSERT((align & (align - 1)) == 0);
479 
480 	/*
481 	 * Fixup arguments: align must be at least PAGE_SIZE,
482 	 * sz will be converted to pagecount, since that is what
483 	 * pmemrange uses internally.
484 	 */
485 	if (align < PAGE_SIZE)
486 		kp_piglet.kp_align = PAGE_SIZE;
487 
488 	sz = round_page(sz);
489 
490 	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
491 	if (*va == 0)
492 		return ENOMEM;
493 
494 	pmap_extract(pmap_kernel(), *va, pa);
495 	return 0;
496 }
497 
498 /*
499  * Free a piglet area.
500  */
501 void
502 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
503 {
504 	/*
505 	 * Fix parameters.
506 	 */
507 	sz = round_page(sz);
508 
509 	/*
510 	 * Free the physical and virtual memory.
511 	 */
512 	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
513 }
514 
515 /*
516  * Physmem RLE compression support.
517  *
518  * Given a physical page address, return the number of pages starting at the
519  * address that are free.  Clamps to the number of pages in
520  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
521  */
522 int
523 uvm_page_rle(paddr_t addr)
524 {
525 	struct vm_page		*pg, *pg_end;
526 	struct vm_physseg	*vmp;
527 	int			 pseg_idx, off_idx;
528 
529 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
530 	if (pseg_idx == -1)
531 		return 0;
532 
533 	vmp = &vm_physmem[pseg_idx];
534 	pg = &vmp->pgs[off_idx];
535 	if (!(pg->pg_flags & PQ_FREE))
536 		return 0;
537 
538 	/*
539 	 * Search for the first non-free page after pg.
540 	 * Note that the page may not be the first page in a free pmemrange,
541 	 * therefore pg->fpgsz cannot be used.
542 	 */
543 	for (pg_end = pg; pg_end <= vmp->lastpg &&
544 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE &&
545 	    (pg_end - pg) < HIBERNATE_CHUNK_SIZE/PAGE_SIZE; pg_end++)
546 		;
547 	return pg_end - pg;
548 }
549 
550 /*
551  * Calculate a hopefully unique version # for this kernel, based upon
552  * how it was linked.
553  */
554 u_int32_t
555 hibsum(void)
556 {
557 	return ((long)malloc ^ (long)km_alloc ^ (long)printf ^ (long)strlen);
558 }
559 
560 
561 /*
562  * Fills out the hibernate_info union pointed to by hib
563  * with information about this machine (swap signature block
564  * offsets, number of memory ranges, kernel in use, etc)
565  */
566 int
567 get_hibernate_info(union hibernate_info *hib, int suspend)
568 {
569 	struct disklabel dl;
570 	char err_string[128], *dl_ret;
571 
572 #ifndef NO_PROPOLICE
573 	/* Save propolice guard */
574 	hib->guard = __guard_local;
575 #endif /* ! NO_PROPOLICE */
576 
577 	/* Determine I/O function to use */
578 	hib->io_func = get_hibernate_io_function(swdevt[0].sw_dev);
579 	if (hib->io_func == NULL)
580 		return (1);
581 
582 	/* Calculate hibernate device */
583 	hib->dev = swdevt[0].sw_dev;
584 
585 	/* Read disklabel (used to calculate signature and image offsets) */
586 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));
587 
588 	if (dl_ret) {
589 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
590 		return (1);
591 	}
592 
593 	/* Make sure we have a swap partition. */
594 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
595 	    DL_GETPSIZE(&dl.d_partitions[1]) == 0)
596 		return (1);
597 
598 	/* Make sure the signature can fit in one block */
599 	if (sizeof(union hibernate_info) > DEV_BSIZE)
600 		return (1);
601 
602 	/* Magic number */
603 	hib->magic = HIBERNATE_MAGIC;
604 
605 	/* Calculate signature block location */
606 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[1]) -
607 	    sizeof(union hibernate_info)/DEV_BSIZE;
608 
609 	/* Stash kernel version information */
610 	memset(&hib->kernel_version, 0, 128);
611 	bcopy(version, &hib->kernel_version,
612 	    min(strlen(version), sizeof(hib->kernel_version)-1));
613 	hib->kernel_sum = hibsum();
614 
615 	if (suspend) {
616 		/* Grab the previously-allocated piglet addresses */
617 		hib->piglet_va = global_piglet_va;
618 		hib->piglet_pa = global_piglet_pa;
619 		hib->io_page = (void *)hib->piglet_va;
620 
621 		/*
622 		 * Initialization of the hibernate IO function for drivers
623 		 * that need to do prep work (such as allocating memory or
624 		 * setting up data structures that cannot safely be done
625 		 * during suspend without causing side effects). There is
626 		 * a matching HIB_DONE call performed after the write is
627 		 * completed.
628 		 */
629 		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[1]),
630 		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[1]),
631 		    HIB_INIT, hib->io_page))
632 			goto fail;
633 
634 	} else {
635 		/*
636 		 * Resuming kernels use a regular private page for the driver
637 		 * No need to free this I/O page as it will vanish as part of
638 		 * the resume.
639 		 */
640 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
641 		if (!hib->io_page)
642 			goto fail;
643 	}
644 
645 	if (get_hibernate_info_md(hib))
646 		goto fail;
647 
648 	return (0);
649 
650 fail:
651 	return (1);
652 }
653 
654 /*
655  * Allocate nitems*size bytes from the hiballoc area presently in use
656  */
657 void *
658 hibernate_zlib_alloc(void *unused, int nitems, int size)
659 {
660 	struct hibernate_zlib_state *hibernate_state;
661 
662 	hibernate_state =
663 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
664 
665 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
666 }
667 
668 /*
669  * Free the memory pointed to by addr in the hiballoc area presently in
670  * use
671  */
672 void
673 hibernate_zlib_free(void *unused, void *addr)
674 {
675 	struct hibernate_zlib_state *hibernate_state;
676 
677 	hibernate_state =
678 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
679 
680 	hib_free(&hibernate_state->hiballoc_arena, addr);
681 }
682 
683 /*
684  * Inflate next page of data from the image stream.
685  * The rle parameter is modified on exit to contain the number of pages to
686  * skip in the output stream (or 0 if this page was inflated into).
687  *
688  * Returns 0 if the stream contains additional data, or 1 if the stream is
689  * finished.
690  */
691 int
692 hibernate_inflate_page(int *rle)
693 {
694 	struct hibernate_zlib_state *hibernate_state;
695 	int i;
696 
697 	hibernate_state =
698 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
699 
700 	/* Set up the stream for RLE code inflate */
701 	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
702 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
703 
704 	/* Inflate RLE code */
705 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
706 	if (i != Z_OK && i != Z_STREAM_END) {
707 		/*
708 		 * XXX - this will likely reboot/hang most machines
709 		 *       since the console output buffer will be unmapped,
710 		 *       but there's not much else we can do here.
711 		 */
712 		panic("rle inflate stream error");
713 	}
714 
715 	if (hibernate_state->hib_stream.avail_out != 0) {
716 		/*
717 		 * XXX - this will likely reboot/hang most machines
718 		 *       since the console output buffer will be unmapped,
719 		 *       but there's not much else we can do here.
720 		 */
721 		panic("rle short inflate error");
722 	}
723 
724 	if (*rle < 0 || *rle > 1024) {
725 		/*
726 		 * XXX - this will likely reboot/hang most machines
727 		 *       since the console output buffer will be unmapped,
728 		 *       but there's not much else we can do here.
729 		 */
730 		panic("invalid rle count");
731 	}
732 
733 	if (i == Z_STREAM_END)
734 		return (1);
735 
736 	if (*rle != 0)
737 		return (0);
738 
739 	/* Set up the stream for page inflate */
740 	hibernate_state->hib_stream.next_out =
741 		(unsigned char *)HIBERNATE_INFLATE_PAGE;
742 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
743 
744 	/* Process next block of data */
745 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
746 	if (i != Z_OK && i != Z_STREAM_END) {
747 		/*
748 		 * XXX - this will likely reboot/hang most machines
749 		 *       since the console output buffer will be unmapped,
750 		 *       but there's not much else we can do here.
751 		 */
752 		panic("inflate error");
753 	}
754 
755 	/* We should always have extracted a full page ... */
756 	if (hibernate_state->hib_stream.avail_out != 0) {
757 		/*
758 		 * XXX - this will likely reboot/hang most machines
759 		 *       since the console output buffer will be unmapped,
760 		 *       but there's not much else we can do here.
761 		 */
762 		panic("incomplete page");
763 	}
764 
765 	return (i == Z_STREAM_END);
766 }
767 
768 /*
769  * Inflate size bytes from src into dest, skipping any pages in
770  * [src..dest] that are special (see hibernate_inflate_skip)
771  *
772  * This function executes while using the resume-time stack
773  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
774  * will likely hang or reset the machine since the console output buffer
775  * will be unmapped.
776  */
777 void
778 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
779     paddr_t src, size_t size)
780 {
781 	int end_stream = 0, rle, skip;
782 	struct hibernate_zlib_state *hibernate_state;
783 
784 	hibernate_state =
785 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
786 
787 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
788 	hibernate_state->hib_stream.avail_in = size;
789 
790 	do {
791 		/*
792 		 * Is this a special page? If yes, redirect the
793 		 * inflate output to a scratch page (eg, discard it)
794 		 */
795 		skip = hibernate_inflate_skip(hib, dest);
796 		if (skip == HIB_SKIP) {
797 			hibernate_enter_resume_mapping(
798 			    HIBERNATE_INFLATE_PAGE,
799 			    HIBERNATE_INFLATE_PAGE, 0);
800 		} else if (skip == HIB_MOVE) {
801 			/*
802 			 * Special case : retguard region. This gets moved
803 			 * temporarily into the piglet region and copied into
804 			 * place immediately before resume
805 			 */
806 			hibernate_enter_resume_mapping(
807 			    HIBERNATE_INFLATE_PAGE,
808 			    hib->piglet_pa + (110 * PAGE_SIZE) +
809 			    hib->retguard_ofs, 0);
810 			hib->retguard_ofs += PAGE_SIZE;
811 			if (hib->retguard_ofs > 255 * PAGE_SIZE) {
812 				/*
813 				 * XXX - this will likely reboot/hang most
814 				 *       machines since the console output
815 				 *       buffer will be unmapped, but there's
816 				 *       not much else we can do here.
817 				 */
818 				panic("retguard move error, out of space");
819 			}
820 		} else {
821 			hibernate_enter_resume_mapping(
822 			    HIBERNATE_INFLATE_PAGE, dest, 0);
823 		}
824 
825 		hibernate_flush();
826 		end_stream = hibernate_inflate_page(&rle);
827 
828 		if (rle == 0)
829 			dest += PAGE_SIZE;
830 		else
831 			dest += (rle * PAGE_SIZE);
832 	} while (!end_stream);
833 }
834 
835 /*
836  * deflate from src into the I/O page, up to 'remaining' bytes
837  *
838  * Returns number of input bytes consumed, and may reset
839  * the 'remaining' parameter if not all the output space was consumed
840  * (this information is needed to know how much to write to disk)
841  */
842 size_t
843 hibernate_deflate(union hibernate_info *hib, paddr_t src,
844     size_t *remaining)
845 {
846 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
847 	struct hibernate_zlib_state *hibernate_state;
848 
849 	hibernate_state =
850 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
851 
852 	/* Set up the stream for deflate */
853 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
854 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
855 	hibernate_state->hib_stream.next_out =
856 		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
857 	hibernate_state->hib_stream.avail_out = *remaining;
858 
859 	/* Process next block of data */
860 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
861 		panic("hibernate zlib deflate error");
862 
863 	/* Update pointers and return number of bytes consumed */
864 	*remaining = hibernate_state->hib_stream.avail_out;
865 	return (PAGE_SIZE - (src & PAGE_MASK)) -
866 	    hibernate_state->hib_stream.avail_in;
867 }
868 
869 /*
870  * Write the hibernation information specified in hiber_info
871  * to the location in swap previously calculated (last block of
872  * swap), called the "signature block".
873  */
874 int
875 hibernate_write_signature(union hibernate_info *hib)
876 {
877 	/* Write hibernate info to disk */
878 	return (hib->io_func(hib->dev, hib->sig_offset,
879 	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
880 	    hib->io_page));
881 }
882 
883 /*
884  * Write the memory chunk table to the area in swap immediately
885  * preceding the signature block. The chunk table is stored
886  * in the piglet when this function is called.  Returns errno.
887  */
888 int
889 hibernate_write_chunktable(union hibernate_info *hib)
890 {
891 	vaddr_t hibernate_chunk_table_start;
892 	size_t hibernate_chunk_table_size;
893 	int i, err;
894 
895 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
896 
897 	hibernate_chunk_table_start = hib->piglet_va +
898 	    HIBERNATE_CHUNK_SIZE;
899 
900 	/* Write chunk table */
901 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
902 		if ((err = hib->io_func(hib->dev,
903 		    hib->chunktable_offset + (i/DEV_BSIZE),
904 		    (vaddr_t)(hibernate_chunk_table_start + i),
905 		    MAXPHYS, HIB_W, hib->io_page))) {
906 			DPRINTF("chunktable write error: %d\n", err);
907 			return (err);
908 		}
909 	}
910 
911 	return (0);
912 }
913 
914 /*
915  * Write an empty hiber_info to the swap signature block, which is
916  * guaranteed to not match any valid hib.
917  */
918 int
919 hibernate_clear_signature(void)
920 {
921 	union hibernate_info blank_hiber_info;
922 	union hibernate_info hib;
923 
924 	/* Zero out a blank hiber_info */
925 	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));
926 
927 	/* Get the signature block location */
928 	if (get_hibernate_info(&hib, 0))
929 		return (1);
930 
931 	/* Write (zeroed) hibernate info to disk */
932 	DPRINTF("clearing hibernate signature block location: %lld\n",
933 		hib.sig_offset);
934 	if (hibernate_block_io(&hib,
935 	    hib.sig_offset,
936 	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
937 		printf("Warning: could not clear hibernate signature\n");
938 
939 	return (0);
940 }
941 
942 /*
943  * Compare two hibernate_infos to determine if they are the same (eg,
944  * we should be performing a hibernate resume on this machine.
945  * Not all fields are checked - just enough to verify that the machine
946  * has the same memory configuration and kernel as the one that
947  * wrote the signature previously.
948  */
949 int
950 hibernate_compare_signature(union hibernate_info *mine,
951     union hibernate_info *disk)
952 {
953 	u_int i;
954 
955 	if (mine->nranges != disk->nranges) {
956 		printf("unhibernate failed: memory layout changed\n");
957 		return (1);
958 	}
959 
960 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0) {
961 		printf("unhibernate failed: original kernel changed\n");
962 		return (1);
963 	}
964 
965 	if (hibsum() != disk->kernel_sum) {
966 		printf("unhibernate failed: original kernel changed\n");
967 		return (1);
968 	}
969 
970 	for (i = 0; i < mine->nranges; i++) {
971 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
972 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
973 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
974 				i,
975 				(void *)mine->ranges[i].base,
976 				(void *)mine->ranges[i].end,
977 				(void *)disk->ranges[i].base,
978 				(void *)disk->ranges[i].end);
979 			printf("unhibernate failed: memory size changed\n");
980 			return (1);
981 		}
982 	}
983 
984 	return (0);
985 }
986 
987 /*
988  * Transfers xfer_size bytes between the hibernate device specified in
989  * hib_info at offset blkctr and the vaddr specified at dest.
990  *
991  * Separate offsets and pages are used to handle misaligned reads (reads
992  * that span a page boundary).
993  *
994  * blkctr specifies a relative offset (relative to the start of swap),
995  * not an absolute disk offset
996  *
997  */
998 int
999 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
1000     size_t xfer_size, vaddr_t dest, int iswrite)
1001 {
1002 	struct buf *bp;
1003 	struct bdevsw *bdsw;
1004 	int error;
1005 
1006 	bp = geteblk(xfer_size);
1007 	bdsw = &bdevsw[major(hib->dev)];
1008 
1009 	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
1010 	if (error) {
1011 		printf("hibernate_block_io open failed\n");
1012 		return (1);
1013 	}
1014 
1015 	if (iswrite)
1016 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
1017 
1018 	bp->b_bcount = xfer_size;
1019 	bp->b_blkno = blkctr;
1020 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1021 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
1022 	bp->b_dev = hib->dev;
1023 	(*bdsw->d_strategy)(bp);
1024 
1025 	error = biowait(bp);
1026 	if (error) {
1027 		printf("hib block_io biowait error %d blk %lld size %zu\n",
1028 			error, (long long)blkctr, xfer_size);
1029 		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
1030 		    curproc);
1031 		if (error)
1032 			printf("hibernate_block_io error close failed\n");
1033 		return (1);
1034 	}
1035 
1036 	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
1037 	if (error) {
1038 		printf("hibernate_block_io close failed\n");
1039 		return (1);
1040 	}
1041 
1042 	if (!iswrite)
1043 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
1044 
1045 	bp->b_flags |= B_INVAL;
1046 	brelse(bp);
1047 
1048 	return (0);
1049 }
1050 
1051 /*
1052  * Preserve one page worth of random data, generated from the resuming
1053  * kernel's arc4random. After resume, this preserved entropy can be used
1054  * to further improve the un-hibernated machine's entropy pool. This
1055  * random data is stored in the piglet, which is preserved across the
1056  * unpack operation, and is restored later in the resume process (see
1057  * hib_getentropy)
1058  */
1059 void
1060 hibernate_preserve_entropy(union hibernate_info *hib)
1061 {
1062 	void *entropy;
1063 
1064 	entropy = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
1065 
1066 	if (!entropy)
1067 		return;
1068 
1069 	pmap_activate(curproc);
1070 	pmap_kenter_pa((vaddr_t)entropy,
1071 	    (paddr_t)(hib->piglet_pa + (29 * PAGE_SIZE)),
1072 	    PROT_READ | PROT_WRITE);
1073 
1074 	arc4random_buf((void *)entropy, PAGE_SIZE);
1075 	pmap_kremove((vaddr_t)entropy, PAGE_SIZE);
1076 	km_free(entropy, PAGE_SIZE, &kv_any, &kp_none);
1077 }
1078 
1079 #ifndef NO_PROPOLICE
1080 vaddr_t
1081 hibernate_unprotect_ssp(void)
1082 {
1083 	struct kmem_dyn_mode kd_avoidalias;
1084 	vaddr_t va = trunc_page((vaddr_t)&__guard_local);
1085 	paddr_t pa;
1086 
1087 	pmap_extract(pmap_kernel(), va, &pa);
1088 
1089 	memset(&kd_avoidalias, 0, sizeof kd_avoidalias);
1090 	kd_avoidalias.kd_prefer = pa;
1091 	kd_avoidalias.kd_waitok = 1;
1092 	va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_avoidalias);
1093 	if (!va)
1094 		panic("hibernate_unprotect_ssp");
1095 
1096 	pmap_kenter_pa(va, pa, PROT_READ | PROT_WRITE);
1097 	pmap_update(pmap_kernel());
1098 
1099 	return va;
1100 }
1101 
1102 void
1103 hibernate_reprotect_ssp(vaddr_t va)
1104 {
1105 	pmap_kremove(va, PAGE_SIZE);
1106 	km_free((void *)va, PAGE_SIZE, &kv_any, &kp_none);
1107 }
1108 #endif /* NO_PROPOLICE */
1109 
1110 /*
1111  * Reads the signature block from swap, checks against the current machine's
1112  * information. If the information matches, perform a resume by reading the
1113  * saved image into the pig area, and unpacking.
1114  *
1115  * Must be called with interrupts enabled.
1116  */
1117 void
1118 hibernate_resume(void)
1119 {
1120 	union hibernate_info hib;
1121 	int s;
1122 #ifndef NO_PROPOLICE
1123 	vsize_t off = (vaddr_t)&__guard_local -
1124 	    trunc_page((vaddr_t)&__guard_local);
1125 	vaddr_t guard_va;
1126 #endif
1127 
1128 	/* Get current running machine's hibernate info */
1129 	memset(&hib, 0, sizeof(hib));
1130 	if (get_hibernate_info(&hib, 0)) {
1131 		DPRINTF("couldn't retrieve machine's hibernate info\n");
1132 		return;
1133 	}
1134 
1135 	/* Read hibernate info from disk */
1136 	s = splbio();
1137 
1138 	DPRINTF("reading hibernate signature block location: %lld\n",
1139 		hib.sig_offset);
1140 
1141 	if (hibernate_block_io(&hib,
1142 	    hib.sig_offset,
1143 	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
1144 		DPRINTF("error in hibernate read");
1145 		splx(s);
1146 		return;
1147 	}
1148 
1149 	/* Check magic number */
1150 	if (disk_hib.magic != HIBERNATE_MAGIC) {
1151 		DPRINTF("wrong magic number in hibernate signature: %x\n",
1152 			disk_hib.magic);
1153 		splx(s);
1154 		return;
1155 	}
1156 
1157 	/*
1158 	 * We (possibly) found a hibernate signature. Clear signature first,
1159 	 * to prevent accidental resume or endless resume cycles later.
1160 	 */
1161 	if (hibernate_clear_signature()) {
1162 		DPRINTF("error clearing hibernate signature block\n");
1163 		splx(s);
1164 		return;
1165 	}
1166 
1167 	/*
1168 	 * If on-disk and in-memory hibernate signatures match,
1169 	 * this means we should do a resume from hibernate.
1170 	 */
1171 	if (hibernate_compare_signature(&hib, &disk_hib)) {
1172 		DPRINTF("mismatched hibernate signature block\n");
1173 		splx(s);
1174 		return;
1175 	}
1176 	disk_hib.dev = hib.dev;
1177 
1178 #ifdef MULTIPROCESSOR
1179 	/* XXX - if we fail later, we may need to rehatch APs on some archs */
1180 	DPRINTF("hibernate: quiescing APs\n");
1181 	hibernate_quiesce_cpus();
1182 #endif /* MULTIPROCESSOR */
1183 
1184 	/* Read the image from disk into the image (pig) area */
1185 	if (hibernate_read_image(&disk_hib))
1186 		goto fail;
1187 
1188 	DPRINTF("hibernate: quiescing devices\n");
1189 	if (config_suspend_all(DVACT_QUIESCE) != 0)
1190 		goto fail;
1191 
1192 #ifndef NO_PROPOLICE
1193 	guard_va = hibernate_unprotect_ssp();
1194 #endif /* NO_PROPOLICE */
1195 
1196 	(void) splhigh();
1197 	hibernate_disable_intr_machdep();
1198 	cold = 1;
1199 
1200 	DPRINTF("hibernate: suspending devices\n");
1201 	if (config_suspend_all(DVACT_SUSPEND) != 0) {
1202 		cold = 0;
1203 		hibernate_enable_intr_machdep();
1204 #ifndef NO_PROPOLICE
1205 		hibernate_reprotect_ssp(guard_va);
1206 #endif /* ! NO_PROPOLICE */
1207 		goto fail;
1208 	}
1209 
1210 	hibernate_preserve_entropy(&disk_hib);
1211 
1212 	printf("Unpacking image...\n");
1213 
1214 	/* Switch stacks */
1215 	DPRINTF("hibernate: switching stacks\n");
1216 	hibernate_switch_stack_machdep();
1217 
1218 #ifndef NO_PROPOLICE
1219 	/* Start using suspended kernel's propolice guard */
1220 	*(long *)(guard_va + off) = disk_hib.guard;
1221 	hibernate_reprotect_ssp(guard_va);
1222 #endif /* ! NO_PROPOLICE */
1223 
1224 	/* Unpack and resume */
1225 	hibernate_unpack_image(&disk_hib);
1226 
1227 fail:
1228 	splx(s);
1229 	printf("\nUnable to resume hibernated image\n");
1230 }
1231 
1232 /*
1233  * Unpack image from pig area to original location by looping through the
1234  * list of output chunks in the order they should be restored (fchunks).
1235  *
1236  * Note that due to the stack smash protector and the fact that we have
1237  * switched stacks, it is not permitted to return from this function.
1238  */
1239 void
1240 hibernate_unpack_image(union hibernate_info *hib)
1241 {
1242 	struct hibernate_disk_chunk *chunks;
1243 	union hibernate_info local_hib;
1244 	paddr_t image_cur = global_pig_start;
1245 	short i, *fchunks;
1246 	char *pva;
1247 
1248 	/* Piglet will be identity mapped (VA == PA) */
1249 	pva = (char *)hib->piglet_pa;
1250 
1251 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1252 
1253 	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);
1254 
1255 	/* Can't use hiber_info that's passed in after this point */
1256 	bcopy(hib, &local_hib, sizeof(union hibernate_info));
1257 	local_hib.retguard_ofs = 0;
1258 
1259 	/* VA == PA */
1260 	local_hib.piglet_va = local_hib.piglet_pa;
1261 
1262 	/*
1263 	 * Point of no return. Once we pass this point, only kernel code can
1264 	 * be accessed. No global variables or other kernel data structures
1265 	 * are guaranteed to be coherent after unpack starts.
1266 	 *
1267 	 * The image is now in high memory (pig area), we unpack from the pig
1268 	 * to the correct location in memory. We'll eventually end up copying
1269 	 * on top of ourself, but we are assured the kernel code here is the
1270 	 * same between the hibernated and resuming kernel, and we are running
1271 	 * on our own stack, so the overwrite is ok.
1272 	 */
1273 	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
1274 	hibernate_activate_resume_pt_machdep();
1275 
1276 	for (i = 0; i < local_hib.chunk_ctr; i++) {
1277 		/* Reset zlib for inflate */
1278 		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
1279 			panic("hibernate failed to reset zlib for inflate");
1280 
1281 		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
1282 		    image_cur);
1283 
1284 		image_cur += chunks[fchunks[i]].compressed_size;
1285 
1286 	}
1287 
1288 	/*
1289 	 * Resume the loaded kernel by jumping to the MD resume vector.
1290 	 * We won't be returning from this call. We pass the location of
1291 	 * the retguard save area so the MD code can replace it before
1292 	 * resuming. See the piglet layout at the top of this file for
1293 	 * more information on the layout of the piglet area.
1294 	 *
1295 	 * We use 'global_piglet_va' here since by the time we are at
1296 	 * this point, we have already unpacked the image, and we want
1297 	 * the suspended kernel's view of what the piglet was, before
1298 	 * suspend occurred (since we will need to use that in the retguard
1299 	 * copy code in hibernate_resume_machdep.)
1300 	 */
1301 	hibernate_resume_machdep(global_piglet_va + (110 * PAGE_SIZE));
1302 }
1303 
1304 /*
1305  * Bounce a compressed image chunk to the piglet, entering mappings for the
1306  * copied pages as needed
1307  */
1308 void
1309 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1310 {
1311 	size_t ct, ofs;
1312 	paddr_t src = img_cur;
1313 	vaddr_t dest = piglet;
1314 
1315 	/* Copy first partial page */
1316 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1317 	ofs = (src & PAGE_MASK);
1318 
1319 	if (ct < PAGE_SIZE) {
1320 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1321 			(src - ofs), 0);
1322 		hibernate_flush();
1323 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1324 		src += ct;
1325 		dest += ct;
1326 	}
1327 
1328 	/* Copy remaining pages */
1329 	while (src < size + img_cur) {
1330 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1331 		hibernate_flush();
1332 		ct = PAGE_SIZE;
1333 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1334 		hibernate_flush();
1335 		src += ct;
1336 		dest += ct;
1337 	}
1338 }
1339 
1340 /*
1341  * Process a chunk by bouncing it to the piglet, followed by unpacking
1342  */
1343 void
1344 hibernate_process_chunk(union hibernate_info *hib,
1345     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1346 {
1347 	char *pva = (char *)hib->piglet_va;
1348 
1349 	hibernate_copy_chunk_to_piglet(img_cur,
1350 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1351 	hibernate_inflate_region(hib, chunk->base,
1352 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1353 	    chunk->compressed_size);
1354 }
1355 
1356 /*
1357  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1358  * inaddr and range_end.
1359  */
1360 int
1361 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1362 {
1363 	int rle;
1364 
1365 	rle = uvm_page_rle(inaddr);
1366 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1367 
1368 	/* Clamp RLE to range end */
1369 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1370 		rle = (range_end - inaddr) / PAGE_SIZE;
1371 
1372 	return (rle);
1373 }
1374 
1375 /*
1376  * Write the RLE byte for page at 'inaddr' to the output stream.
1377  * Returns the number of pages to be skipped at 'inaddr'.
1378  */
1379 int
1380 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1381 	paddr_t range_end, daddr_t *blkctr,
1382 	size_t *out_remaining)
1383 {
1384 	int rle, err, *rleloc;
1385 	struct hibernate_zlib_state *hibernate_state;
1386 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1387 
1388 	hibernate_state =
1389 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1390 
1391 	rle = hibernate_calc_rle(inaddr, range_end);
1392 
1393 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1394 	*rleloc = rle;
1395 
1396 	/* Deflate the RLE byte into the stream */
1397 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1398 
1399 	/* Did we fill the output page? If so, flush to disk */
1400 	if (*out_remaining == 0) {
1401 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1402 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1403 			hib->io_page))) {
1404 				DPRINTF("hib write error %d\n", err);
1405 				return (err);
1406 		}
1407 
1408 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1409 		*out_remaining = PAGE_SIZE;
1410 
1411 		/* If we didn't deflate the entire RLE byte, finish it now */
1412 		if (hibernate_state->hib_stream.avail_in != 0)
1413 			hibernate_deflate(hib,
1414 				(vaddr_t)hibernate_state->hib_stream.next_in,
1415 				out_remaining);
1416 	}
1417 
1418 	return (rle);
1419 }
1420 
1421 /*
1422  * Write a compressed version of this machine's memory to disk, at the
1423  * precalculated swap offset:
1424  *
1425  * end of swap - signature block size - chunk table size - memory size
1426  *
1427  * The function begins by looping through each phys mem range, cutting each
1428  * one into MD sized chunks. These chunks are then compressed individually
1429  * and written out to disk, in phys mem order. Some chunks might compress
1430  * more than others, and for this reason, each chunk's size is recorded
1431  * in the chunk table, which is written to disk after the image has
1432  * properly been compressed and written (in hibernate_write_chunktable).
1433  *
1434  * When this function is called, the machine is nearly suspended - most
1435  * devices are quiesced/suspended, interrupts are off, and cold has
1436  * been set. This means that there can be no side effects once the
1437  * write has started, and the write function itself can also have no
1438  * side effects. This also means no printfs are permitted (since printf
1439  * has side effects.)
1440  *
1441  * Return values :
1442  *
1443  * 0      - success
1444  * EIO    - I/O error occurred writing the chunks
1445  * EINVAL - Failed to write a complete range
1446  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1447  */
1448 int
1449 hibernate_write_chunks(union hibernate_info *hib)
1450 {
1451 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1452 	size_t nblocks, out_remaining, used;
1453 	struct hibernate_disk_chunk *chunks;
1454 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1455 	daddr_t blkctr = 0;
1456 	int i, rle, err;
1457 	struct hibernate_zlib_state *hibernate_state;
1458 
1459 	hibernate_state =
1460 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1461 
1462 	hib->chunk_ctr = 0;
1463 
1464 	/*
1465 	 * Map the utility VAs to the piglet. See the piglet map at the
1466 	 * top of this file for piglet layout information.
1467 	 */
1468 	hibernate_copy_page = hib->piglet_va + 3 * PAGE_SIZE;
1469 	hibernate_rle_page = hib->piglet_va + 28 * PAGE_SIZE;
1470 
1471 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1472 	    HIBERNATE_CHUNK_SIZE);
1473 
1474 	/* Calculate the chunk regions */
1475 	for (i = 0; i < hib->nranges; i++) {
1476 		range_base = hib->ranges[i].base;
1477 		range_end = hib->ranges[i].end;
1478 
1479 		inaddr = range_base;
1480 
1481 		while (inaddr < range_end) {
1482 			chunks[hib->chunk_ctr].base = inaddr;
1483 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1484 				chunks[hib->chunk_ctr].end = inaddr +
1485 				    HIBERNATE_CHUNK_SIZE;
1486 			else
1487 				chunks[hib->chunk_ctr].end = range_end;
1488 
1489 			inaddr += HIBERNATE_CHUNK_SIZE;
1490 			hib->chunk_ctr ++;
1491 		}
1492 	}
1493 
1494 	uvm_pmr_dirty_everything();
1495 	uvm_pmr_zero_everything();
1496 
1497 	/* Compress and write the chunks in the chunktable */
1498 	for (i = 0; i < hib->chunk_ctr; i++) {
1499 		range_base = chunks[i].base;
1500 		range_end = chunks[i].end;
1501 
1502 		chunks[i].offset = blkctr + hib->image_offset;
1503 
1504 		/* Reset zlib for deflate */
1505 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1506 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1507 			return (ENOMEM);
1508 		}
1509 
1510 		inaddr = range_base;
1511 
1512 		/*
1513 		 * For each range, loop through its phys mem region
1514 		 * and write out the chunks (the last chunk might be
1515 		 * smaller than the chunk size).
1516 		 */
1517 		while (inaddr < range_end) {
1518 			out_remaining = PAGE_SIZE;
1519 			while (out_remaining > 0 && inaddr < range_end) {
1520 				/*
1521 				 * Adjust for regions that are not evenly
1522 				 * divisible by PAGE_SIZE or overflowed
1523 				 * pages from the previous iteration.
1524 				 */
1525 				temp_inaddr = (inaddr & PAGE_MASK) +
1526 				    hibernate_copy_page;
1527 
1528 				/* Deflate from temp_inaddr to IO page */
1529 				if (inaddr != range_end) {
1530 					if (inaddr % PAGE_SIZE == 0) {
1531 						rle = hibernate_write_rle(hib,
1532 							inaddr,
1533 							range_end,
1534 							&blkctr,
1535 							&out_remaining);
1536 					}
1537 
1538 					if (rle == 0) {
1539 						pmap_kenter_pa(hibernate_temp_page,
1540 							inaddr & PMAP_PA_MASK,
1541 							PROT_READ);
1542 
1543 						bcopy((caddr_t)hibernate_temp_page,
1544 							(caddr_t)hibernate_copy_page,
1545 							PAGE_SIZE);
1546 						inaddr += hibernate_deflate(hib,
1547 							temp_inaddr,
1548 							&out_remaining);
1549 					} else {
1550 						inaddr += rle * PAGE_SIZE;
1551 						if (inaddr > range_end)
1552 							inaddr = range_end;
1553 					}
1554 
1555 				}
1556 
1557 				if (out_remaining == 0) {
1558 					/* Filled up the page */
1559 					nblocks = PAGE_SIZE / DEV_BSIZE;
1560 
1561 					if ((err = hib->io_func(hib->dev,
1562 					    blkctr + hib->image_offset,
1563 					    (vaddr_t)hibernate_io_page,
1564 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1565 						DPRINTF("hib write error %d\n",
1566 						    err);
1567 						return (err);
1568 					}
1569 
1570 					blkctr += nblocks;
1571 				}
1572 			}
1573 		}
1574 
1575 		if (inaddr != range_end) {
1576 			DPRINTF("deflate range ended prematurely\n");
1577 			return (EINVAL);
1578 		}
1579 
1580 		/*
1581 		 * End of range. Round up to next secsize bytes
1582 		 * after finishing compress
1583 		 */
1584 		if (out_remaining == 0)
1585 			out_remaining = PAGE_SIZE;
1586 
1587 		/* Finish compress */
1588 		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
1589 		hibernate_state->hib_stream.avail_in = 0;
1590 		hibernate_state->hib_stream.next_out =
1591 		    (unsigned char *)hibernate_io_page +
1592 			(PAGE_SIZE - out_remaining);
1593 
1594 		/* We have an extra output page available for finalize */
1595 		hibernate_state->hib_stream.avail_out =
1596 			out_remaining + PAGE_SIZE;
1597 
1598 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1599 		    Z_STREAM_END) {
1600 			DPRINTF("deflate error in output stream: %d\n", err);
1601 			return (err);
1602 		}
1603 
1604 		out_remaining = hibernate_state->hib_stream.avail_out;
1605 
1606 		used = 2 * PAGE_SIZE - out_remaining;
1607 		nblocks = used / DEV_BSIZE;
1608 
1609 		/* Round up to next block if needed */
1610 		if (used % DEV_BSIZE != 0)
1611 			nblocks ++;
1612 
1613 		/* Write final block(s) for this chunk */
1614 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1615 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1616 		    HIB_W, hib->io_page))) {
1617 			DPRINTF("hib final write error %d\n", err);
1618 			return (err);
1619 		}
1620 
1621 		blkctr += nblocks;
1622 
1623 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1624 		    chunks[i].offset) * DEV_BSIZE;
1625 	}
1626 
1627 	hib->chunktable_offset = hib->image_offset + blkctr;
1628 	return (0);
1629 }
1630 
1631 /*
1632  * Reset the zlib stream state and allocate a new hiballoc area for either
1633  * inflate or deflate. This function is called once for each hibernate chunk.
1634  * Calling hiballoc_init multiple times is acceptable since the memory it is
1635  * provided is unmanaged memory (stolen). We use the memory provided to us
1636  * by the piglet allocated via the supplied hib.
1637  */
1638 int
1639 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1640 {
1641 	vaddr_t hibernate_zlib_start;
1642 	size_t hibernate_zlib_size;
1643 	char *pva = (char *)hib->piglet_va;
1644 	struct hibernate_zlib_state *hibernate_state;
1645 
1646 	hibernate_state =
1647 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1648 
1649 	if (!deflate)
1650 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1651 
1652 	/*
1653 	 * See piglet layout information at the start of this file for
1654 	 * information on the zlib page assignments.
1655 	 */
1656 	hibernate_zlib_start = (vaddr_t)(pva + (30 * PAGE_SIZE));
1657 	hibernate_zlib_size = 80 * PAGE_SIZE;
1658 
1659 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1660 	memset(hibernate_state, 0, PAGE_SIZE);
1661 
1662 	/* Set up stream structure */
1663 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1664 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1665 
1666 	/* Initialize the hiballoc arena for zlib allocs/frees */
1667 	hiballoc_init(&hibernate_state->hiballoc_arena,
1668 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1669 
1670 	if (deflate) {
1671 		return deflateInit(&hibernate_state->hib_stream,
1672 		    Z_BEST_SPEED);
1673 	} else
1674 		return inflateInit(&hibernate_state->hib_stream);
1675 }
1676 
1677 /*
1678  * Reads the hibernated memory image from disk, whose location and
1679  * size are recorded in hib. Begin by reading the persisted
1680  * chunk table, which records the original chunk placement location
1681  * and compressed size for each. Next, allocate a pig region of
1682  * sufficient size to hold the compressed image. Next, read the
1683  * chunks into the pig area (calling hibernate_read_chunks to do this),
1684  * and finally, if all of the above succeeds, clear the hibernate signature.
1685  * The function will then return to hibernate_resume, which will proceed
1686  * to unpack the pig image to the correct place in memory.
1687  */
1688 int
1689 hibernate_read_image(union hibernate_info *hib)
1690 {
1691 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1692 	paddr_t image_start, image_end, pig_start, pig_end;
1693 	struct hibernate_disk_chunk *chunks;
1694 	daddr_t blkctr;
1695 	vaddr_t chunktable = (vaddr_t)NULL;
1696 	paddr_t piglet_chunktable = hib->piglet_pa +
1697 	    HIBERNATE_CHUNK_SIZE;
1698 	int i, status;
1699 
1700 	status = 0;
1701 	pmap_activate(curproc);
1702 
1703 	/* Calculate total chunk table size in disk blocks */
1704 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1705 
1706 	blkctr = hib->chunktable_offset;
1707 
1708 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1709 	    &kp_none, &kd_nowait);
1710 
1711 	if (!chunktable)
1712 		return (1);
1713 
1714 	/* Map chunktable pages */
1715 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1716 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1717 		    PROT_READ | PROT_WRITE);
1718 	pmap_update(pmap_kernel());
1719 
1720 	/* Read the chunktable from disk into the piglet chunktable */
1721 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1722 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1723 		hibernate_block_io(hib, blkctr, MAXPHYS,
1724 		    chunktable + i, 0);
1725 
1726 	blkctr = hib->image_offset;
1727 	compressed_size = 0;
1728 
1729 	chunks = (struct hibernate_disk_chunk *)chunktable;
1730 
1731 	for (i = 0; i < hib->chunk_ctr; i++)
1732 		compressed_size += chunks[i].compressed_size;
1733 
1734 	disk_size = compressed_size;
1735 
1736 	printf("unhibernating @ block %lld length %luMB\n",
1737 	    hib->sig_offset - chunktable_size,
1738 	    compressed_size / (1024 * 1024));
1739 
1740 	/* Allocate the pig area */
1741 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1742 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1743 		status = 1;
1744 		goto unmap;
1745 	}
1746 
1747 	pig_end = pig_start + pig_sz;
1748 
1749 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1750 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1751 	image_start = image_end - disk_size;
1752 
1753 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1754 	    chunks);
1755 
1756 	/* Prepare the resume time pmap/page table */
1757 	hibernate_populate_resume_pt(hib, image_start, image_end);
1758 
1759 unmap:
1760 	/* Unmap chunktable pages */
1761 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1762 	pmap_update(pmap_kernel());
1763 
1764 	return (status);
1765 }
1766 
1767 /*
1768  * Read the hibernated memory chunks from disk (chunk information at this
1769  * point is stored in the piglet) into the pig area specified by
1770  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1771  * only chunk with overlap possibilities.
1772  */
1773 int
1774 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1775     paddr_t pig_end, size_t image_compr_size,
1776     struct hibernate_disk_chunk *chunks)
1777 {
1778 	paddr_t img_cur, piglet_base;
1779 	daddr_t blkctr;
1780 	size_t processed, compressed_size, read_size;
1781 	int nchunks, nfchunks, num_io_pages;
1782 	vaddr_t tempva, hibernate_fchunk_area;
1783 	short *fchunks, i, j;
1784 
1785 	tempva = (vaddr_t)NULL;
1786 	hibernate_fchunk_area = (vaddr_t)NULL;
1787 	nfchunks = 0;
1788 	piglet_base = hib->piglet_pa;
1789 	global_pig_start = pig_start;
1790 
1791 	/*
1792 	 * These mappings go into the resuming kernel's page table, and are
1793 	 * used only during image read. They disappear from existence
1794 	 * when the suspended kernel is unpacked on top of us.
1795 	 */
1796 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1797 		&kd_nowait);
1798 	if (!tempva)
1799 		return (1);
1800 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1801 	    &kp_none, &kd_nowait);
1802 	if (!hibernate_fchunk_area)
1803 		return (1);
1804 
1805 	/* Final output chunk ordering VA */
1806 	fchunks = (short *)hibernate_fchunk_area;
1807 
1808 	/* Map the chunk ordering region */
1809 	for(i = 0; i < 24 ; i++)
1810 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1811 			piglet_base + ((4 + i) * PAGE_SIZE),
1812 			PROT_READ | PROT_WRITE);
1813 	pmap_update(pmap_kernel());
1814 
1815 	nchunks = hib->chunk_ctr;
1816 
1817 	/* Initially start all chunks as unplaced */
1818 	for (i = 0; i < nchunks; i++)
1819 		chunks[i].flags = 0;
1820 
1821 	/*
1822 	 * Search the list for chunks that are outside the pig area. These
1823 	 * can be placed first in the final output list.
1824 	 */
1825 	for (i = 0; i < nchunks; i++) {
1826 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1827 			fchunks[nfchunks] = i;
1828 			nfchunks++;
1829 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1830 		}
1831 	}
1832 
1833 	/*
1834 	 * Walk the ordering, place the chunks in ascending memory order.
1835 	 */
1836 	for (i = 0; i < nchunks; i++) {
1837 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1838 			fchunks[nfchunks] = i;
1839 			nfchunks++;
1840 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1841 		}
1842 	}
1843 
1844 	img_cur = pig_start;
1845 
1846 	for (i = 0; i < nfchunks; i++) {
1847 		blkctr = chunks[fchunks[i]].offset;
1848 		processed = 0;
1849 		compressed_size = chunks[fchunks[i]].compressed_size;
1850 
1851 		while (processed < compressed_size) {
1852 			if (compressed_size - processed >= MAXPHYS)
1853 				read_size = MAXPHYS;
1854 			else
1855 				read_size = compressed_size - processed;
1856 
1857 			/*
1858 			 * We're reading read_size bytes, offset from the
1859 			 * start of a page by img_cur % PAGE_SIZE, so the
1860 			 * end will be read_size + (img_cur % PAGE_SIZE)
1861 			 * from the start of the first page.  Round that
1862 			 * up to the next page size.
1863 			 */
1864 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1865 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1866 
1867 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1868 
1869 			/* Map pages for this read */
1870 			for (j = 0; j < num_io_pages; j ++)
1871 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1872 				    img_cur + j * PAGE_SIZE,
1873 				    PROT_READ | PROT_WRITE);
1874 
1875 			pmap_update(pmap_kernel());
1876 
1877 			hibernate_block_io(hib, blkctr, read_size,
1878 			    tempva + (img_cur & PAGE_MASK), 0);
1879 
1880 			blkctr += (read_size / DEV_BSIZE);
1881 
1882 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1883 			pmap_update(pmap_kernel());
1884 
1885 			processed += read_size;
1886 			img_cur += read_size;
1887 		}
1888 	}
1889 
1890 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1891 	pmap_update(pmap_kernel());
1892 
1893 	return (0);
1894 }
1895 
1896 /*
1897  * Hibernating a machine comprises the following operations:
1898  *  1. Calculating this machine's hibernate_info information
1899  *  2. Allocating a piglet and saving the piglet's physaddr
1900  *  3. Calculating the memory chunks
1901  *  4. Writing the compressed chunks to disk
1902  *  5. Writing the chunk table
1903  *  6. Writing the signature block (hibernate_info)
1904  *
1905  * On most architectures, the function calling hibernate_suspend would
1906  * then power off the machine using some MD-specific implementation.
1907  */
1908 int
1909 hibernate_suspend(void)
1910 {
1911 	union hibernate_info hib;
1912 	u_long start, end;
1913 
1914 	/*
1915 	 * Calculate memory ranges, swap offsets, etc.
1916 	 * This also allocates a piglet whose physaddr is stored in
1917 	 * hib->piglet_pa and vaddr stored in hib->piglet_va
1918 	 */
1919 	if (get_hibernate_info(&hib, 1)) {
1920 		DPRINTF("failed to obtain hibernate info\n");
1921 		return (1);
1922 	}
1923 
1924 	/* Find a page-addressed region in swap [start,end] */
1925 	if (uvm_hibswap(hib.dev, &start, &end)) {
1926 		printf("hibernate: cannot find any swap\n");
1927 		return (1);
1928 	}
1929 
1930 	if (end - start < 1000) {
1931 		printf("hibernate: insufficient swap (%lu is too small)\n",
1932 			end - start);
1933 		return (1);
1934 	}
1935 
1936 	/* Calculate block offsets in swap */
1937 	hib.image_offset = ctod(start);
1938 
1939 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
1940 	    hib.image_offset, ctod(end) - ctod(start));
1941 
1942 	pmap_activate(curproc);
1943 	DPRINTF("hibernate: writing chunks\n");
1944 	if (hibernate_write_chunks(&hib)) {
1945 		DPRINTF("hibernate_write_chunks failed\n");
1946 		return (1);
1947 	}
1948 
1949 	DPRINTF("hibernate: writing chunktable\n");
1950 	if (hibernate_write_chunktable(&hib)) {
1951 		DPRINTF("hibernate_write_chunktable failed\n");
1952 		return (1);
1953 	}
1954 
1955 	DPRINTF("hibernate: writing signature\n");
1956 	if (hibernate_write_signature(&hib)) {
1957 		DPRINTF("hibernate_write_signature failed\n");
1958 		return (1);
1959 	}
1960 
1961 	/* Allow the disk to settle */
1962 	delay(500000);
1963 
1964 	/*
1965 	 * Give the device-specific I/O function a notification that we're
1966 	 * done, and that it can clean up or shutdown as needed.
1967 	 */
1968 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
1969 	return (0);
1970 }
1971 
1972 int
1973 hibernate_alloc(void)
1974 {
1975 	KASSERT(global_piglet_va == 0);
1976 	KASSERT(hibernate_temp_page == 0);
1977 
1978 	pmap_activate(curproc);
1979 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1980 	    PROT_READ | PROT_WRITE);
1981 
1982 	/* Allocate a piglet, store its addresses in the supplied globals */
1983 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
1984 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
1985 		goto unmap;
1986 
1987 	/*
1988 	 * Allocate VA for the temp page.
1989 	 *
1990 	 * This will become part of the suspended kernel and will
1991 	 * be freed in hibernate_free, upon resume (or hibernate
1992 	 * failure)
1993 	 */
1994 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1995 	    &kp_none, &kd_nowait);
1996 	if (!hibernate_temp_page) {
1997 		uvm_pmr_free_piglet(global_piglet_va,
1998 		    4 * HIBERNATE_CHUNK_SIZE);
1999 		global_piglet_va = 0;
2000 		goto unmap;
2001 	}
2002 	return (0);
2003 unmap:
2004 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
2005 	pmap_update(pmap_kernel());
2006 	return (ENOMEM);
2007 }
2008 
2009 /*
2010  * Free items allocated by hibernate_alloc()
2011  */
2012 void
2013 hibernate_free(void)
2014 {
2015 	pmap_activate(curproc);
2016 
2017 	if (global_piglet_va)
2018 		uvm_pmr_free_piglet(global_piglet_va,
2019 		    4 * HIBERNATE_CHUNK_SIZE);
2020 
2021 	if (hibernate_temp_page) {
2022 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
2023 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
2024 		    &kv_any, &kp_none);
2025 	}
2026 
2027 	global_piglet_va = 0;
2028 	hibernate_temp_page = 0;
2029 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
2030 	pmap_update(pmap_kernel());
2031 }
2032