xref: /openbsd-src/sys/kern/subr_hibernate.c (revision 96b2f3361dba59e0d09309e91d63a4762a032f71)
1 /*	$OpenBSD: subr_hibernate.c,v 1.147 2024/12/31 17:16:05 krw Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <sys/atomic.h>
32 
33 #include <uvm/uvm.h>
34 #include <uvm/uvm_swap.h>
35 
36 #include <machine/hibernate.h>
37 
38 /* Make sure the signature can fit in one block */
39 CTASSERT((offsetof(union hibernate_info, sec_size) + sizeof(u_int32_t)) <= DEV_BSIZE);
40 
41 /*
42  * Hibernate piglet layout information
43  *
44  * The piglet is a scratch area of memory allocated by the suspending kernel.
45  * Its phys and virt addrs are recorded in the signature block. The piglet is
46  * used to guarantee an unused area of memory that can be used by the resuming
47  * kernel for various things. The piglet is excluded during unpack operations.
48  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
49  *
50  * Offset from piglet_base	Purpose
51  * ----------------------------------------------------------------------------
52  * 0				Private page for suspend I/O write functions
53  * 1*PAGE_SIZE			I/O page used during hibernate suspend
54  * 2*PAGE_SIZE			I/O page used during hibernate suspend
55  * 3*PAGE_SIZE			copy page used during hibernate suspend
56  * 4*PAGE_SIZE			final chunk ordering list (24 pages)
57  * 28*PAGE_SIZE			RLE utility page
58  * 29*PAGE_SIZE			preserved entropy
59  * 30*PAGE_SIZE			start of hiballoc area
60  * 110*PAGE_SIZE		end of hiballoc area (80 pages)
61  * 366*PAGE_SIZE		end of retguard preservation region (256 pages)
62  * ...				unused
63  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
64  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
65  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
66  */
67 
68 /* Temporary vaddr ranges used during hibernate */
69 vaddr_t hibernate_temp_page;
70 vaddr_t hibernate_copy_page;
71 vaddr_t hibernate_rle_page;
72 
73 /* Hibernate info as read from disk during resume */
74 union hibernate_info disk_hib;
75 struct bdevsw *bdsw;
76 
77 /*
78  * Global copy of the pig start address. This needs to be a global as we
79  * switch stacks after computing it - it can't be stored on the stack.
80  */
81 paddr_t global_pig_start;
82 
83 /*
84  * Global copies of the piglet start addresses (PA/VA). We store these
85  * as globals to avoid having to carry them around as parameters, as the
86  * piglet is allocated early and freed late - its lifecycle extends beyond
87  * that of the hibernate info union which is calculated on suspend/resume.
88  */
89 vaddr_t global_piglet_va;
90 paddr_t global_piglet_pa;
91 
92 /* #define HIB_DEBUG */
93 #ifdef HIB_DEBUG
94 int	hib_debug = 99;
95 #define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
96 #define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
97 #else
98 #define DPRINTF(x...)
99 #define DNPRINTF(n,x...)
100 #endif
101 
102 #define	ROUNDUP(_x, _y)	((((_x)+(_y)-1)/(_y))*(_y))
103 
104 #ifndef NO_PROPOLICE
105 extern long __guard_local;
106 #endif /* ! NO_PROPOLICE */
107 
108 /* Retguard phys address (need to skip this region during unpack) */
109 paddr_t retguard_start_phys, retguard_end_phys;
110 extern char __retguard_start, __retguard_end;
111 
112 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
113 int hibernate_calc_rle(paddr_t, paddr_t);
114 int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
115 	size_t *);
116 
117 #define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)
118 
119 /*
120  * Hib alloc enforced alignment.
121  */
122 #define HIB_ALIGN		8 /* bytes alignment */
123 
124 /*
125  * sizeof builtin operation, but with alignment constraint.
126  */
127 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
128 
/*
 * Bookkeeping header placed directly in front of every hiballoc
 * allocation. The managed memory starts HIB_SIZEOF(struct hiballoc_entry)
 * bytes after the header: first hibe_use bytes in use, then hibe_space
 * bytes of free space.
 */
struct hiballoc_entry {
	size_t			hibe_use;	/* bytes handed out */
	size_t			hibe_space;	/* free bytes after them */
	RBT_ENTRY(hiballoc_entry) hibe_entry;	/* address tree linkage */
};
134 
135 #define IO_TYPE_IMG 1
136 #define IO_TYPE_CHK 2
137 #define IO_TYPE_SIG 3
138 
139 int
140 hibernate_write(union hibernate_info *hib, daddr_t offset, vaddr_t addr,
141     size_t size, int io_type)
142 {
143 	const uint64_t blks = btodb(size);
144 
145 	if (hib == NULL || offset < 0 || blks == 0) {
146 		printf("%s: hib is NULL, offset < 0 or blks == 0\n", __func__);
147 		return (EINVAL);
148 	}
149 
150 	switch (io_type) {
151 	case IO_TYPE_IMG:
152 		if (offset + blks > hib->image_size) {
153 			printf("%s: image write is out of bounds: "
154 			    "offset-image=%lld, offset-write=%lld, blks=%llu\n",
155 			    __func__, hib->image_offset, offset, blks);
156 			return (EIO);
157 		}
158 		offset += hib->image_offset;
159 		break;
160 	case IO_TYPE_CHK:
161 		if (offset + blks > btodb(HIBERNATE_CHUNK_TABLE_SIZE)) {
162 			printf("%s: chunktable write is out of bounds: "
163 			    "offset-chunk=%lld, offset-write=%lld, blks=%llu\n",
164 			    __func__, hib->chunktable_offset, offset, blks);
165 			return (EIO);
166 		}
167 		offset += hib->chunktable_offset;
168 		break;
169 	case IO_TYPE_SIG:
170 		if (offset != hib->sig_offset || size != hib->sec_size) {
171 			printf("%s: signature write is out of bounds: "
172 			    "offset-sig=%lld, offset-write=%lld, blks=%llu\n",
173 			    __func__, hib->sig_offset, offset, blks);
174 			return (EIO);
175 		}
176 		break;
177 	default:
178 		printf("%s: unsupported io type %d\n", __func__, io_type);
179 		return (EINVAL);
180 	}
181 
182 	return (hib->io_func(hib->dev, offset, addr, size, HIB_W,
183 	    hib->io_page));
184 }
185 
186 /*
187  * Sort hibernate memory ranges by ascending PA
188  */
189 void
190 hibernate_sort_ranges(union hibernate_info *hib_info)
191 {
192 	int i, j;
193 	struct hibernate_memory_range *ranges;
194 	paddr_t base, end;
195 
196 	ranges = hib_info->ranges;
197 
198 	for (i = 1; i < hib_info->nranges; i++) {
199 		j = i;
200 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
201 			base = ranges[j].base;
202 			end = ranges[j].end;
203 			ranges[j].base = ranges[j - 1].base;
204 			ranges[j].end = ranges[j - 1].end;
205 			ranges[j - 1].base = base;
206 			ranges[j - 1].end = end;
207 			j--;
208 		}
209 	}
210 }
211 
212 /*
213  * Compare hiballoc entries based on the address they manage.
214  *
215  * Since the address is fixed, relative to struct hiballoc_entry,
216  * we just compare the hiballoc_entry pointers.
217  */
218 static __inline int
219 hibe_cmp(const struct hiballoc_entry *l, const struct hiballoc_entry *r)
220 {
221 	vaddr_t vl = (vaddr_t)l;
222 	vaddr_t vr = (vaddr_t)r;
223 
224 	return vl < vr ? -1 : (vl > vr);
225 }
226 
227 RBT_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
228 
229 /*
230  * Given a hiballoc entry, return the address it manages.
231  */
232 static __inline void *
233 hib_entry_to_addr(struct hiballoc_entry *entry)
234 {
235 	caddr_t addr;
236 
237 	addr = (caddr_t)entry;
238 	addr += HIB_SIZEOF(struct hiballoc_entry);
239 	return addr;
240 }
241 
242 /*
243  * Given an address, find the hiballoc that corresponds.
244  */
245 static __inline struct hiballoc_entry*
246 hib_addr_to_entry(void *addr_param)
247 {
248 	caddr_t addr;
249 
250 	addr = (caddr_t)addr_param;
251 	addr -= HIB_SIZEOF(struct hiballoc_entry);
252 	return (struct hiballoc_entry*)addr;
253 }
254 
255 RBT_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp);
256 
257 /*
258  * Allocate memory from the arena.
259  *
260  * Returns NULL if no memory is available.
261  */
262 void *
263 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
264 {
265 	struct hiballoc_entry *entry, *new_entry;
266 	size_t find_sz;
267 
268 	/*
269 	 * Enforce alignment of HIB_ALIGN bytes.
270 	 *
271 	 * Note that, because the entry is put in front of the allocation,
272 	 * 0-byte allocations are guaranteed a unique address.
273 	 */
274 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
275 
276 	/*
277 	 * Find an entry with hibe_space >= find_sz.
278 	 *
279 	 * If the root node is not large enough, we switch to tree traversal.
280 	 * Because all entries are made at the bottom of the free space,
281 	 * traversal from the end has a slightly better chance of yielding
282 	 * a sufficiently large space.
283 	 */
284 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
285 	entry = RBT_ROOT(hiballoc_addr, &arena->hib_addrs);
286 	if (entry != NULL && entry->hibe_space < find_sz) {
287 		RBT_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
288 			if (entry->hibe_space >= find_sz)
289 				break;
290 		}
291 	}
292 
293 	/*
294 	 * Insufficient or too fragmented memory.
295 	 */
296 	if (entry == NULL)
297 		return NULL;
298 
299 	/*
300 	 * Create new entry in allocated space.
301 	 */
302 	new_entry = (struct hiballoc_entry*)(
303 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
304 	new_entry->hibe_space = entry->hibe_space - find_sz;
305 	new_entry->hibe_use = alloc_sz;
306 
307 	/*
308 	 * Insert entry.
309 	 */
310 	if (RBT_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
311 		panic("hib_alloc: insert failure");
312 	entry->hibe_space = 0;
313 
314 	/* Return address managed by entry. */
315 	return hib_entry_to_addr(new_entry);
316 }
317 
318 void
319 hib_getentropy(char **bufp, size_t *bufplen)
320 {
321 	if (!bufp || !bufplen)
322 		return;
323 
324 	*bufp = (char *)(global_piglet_va + (29 * PAGE_SIZE));
325 	*bufplen = PAGE_SIZE;
326 }
327 
328 /*
329  * Free a pointer previously allocated from this arena.
330  *
331  * If addr is NULL, this will be silently accepted.
332  */
333 void
334 hib_free(struct hiballoc_arena *arena, void *addr)
335 {
336 	struct hiballoc_entry *entry, *prev;
337 
338 	if (addr == NULL)
339 		return;
340 
341 	/*
342 	 * Derive entry from addr and check it is really in this arena.
343 	 */
344 	entry = hib_addr_to_entry(addr);
345 	if (RBT_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
346 		panic("hib_free: freed item %p not in hib arena", addr);
347 
348 	/*
349 	 * Give the space in entry to its predecessor.
350 	 *
351 	 * If entry has no predecessor, change its used space into free space
352 	 * instead.
353 	 */
354 	prev = RBT_PREV(hiballoc_addr, entry);
355 	if (prev != NULL &&
356 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
357 	    prev->hibe_use + prev->hibe_space) == entry) {
358 		/* Merge entry. */
359 		RBT_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
360 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
361 		    entry->hibe_use + entry->hibe_space;
362 	} else {
363 		/* Flip used memory to free space. */
364 		entry->hibe_space += entry->hibe_use;
365 		entry->hibe_use = 0;
366 	}
367 }
368 
369 /*
370  * Initialize hiballoc.
371  *
372  * The allocator will manage memory at ptr, which is len bytes.
373  */
374 int
375 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
376 {
377 	struct hiballoc_entry *entry;
378 	caddr_t ptr;
379 	size_t len;
380 
381 	RBT_INIT(hiballoc_addr, &arena->hib_addrs);
382 
383 	/*
384 	 * Hib allocator enforces HIB_ALIGN alignment.
385 	 * Fixup ptr and len.
386 	 */
387 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
388 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
389 	len &= ~((size_t)HIB_ALIGN - 1);
390 
391 	/*
392 	 * Insufficient memory to be able to allocate and also do bookkeeping.
393 	 */
394 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
395 		return ENOMEM;
396 
397 	/*
398 	 * Create entry describing space.
399 	 */
400 	entry = (struct hiballoc_entry*)ptr;
401 	entry->hibe_use = 0;
402 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
403 	RBT_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
404 
405 	return 0;
406 }
407 
408 /*
409  * Zero all free memory.
410  */
411 void
412 uvm_pmr_zero_everything(void)
413 {
414 	struct uvm_pmemrange	*pmr;
415 	struct vm_page		*pg;
416 	int			 i;
417 
418 	uvm_lock_fpageq();
419 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
420 		/* Zero single pages. */
421 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
422 		    != NULL) {
423 			uvm_pmr_remove(pmr, pg);
424 			uvm_pagezero(pg);
425 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
426 			uvmexp.zeropages++;
427 			uvm_pmr_insert(pmr, pg, 0);
428 		}
429 
430 		/* Zero multi page ranges. */
431 		while ((pg = RBT_ROOT(uvm_pmr_size,
432 		    &pmr->size[UVM_PMR_MEMTYPE_DIRTY])) != NULL) {
433 			pg--; /* Size tree always has second page. */
434 			uvm_pmr_remove(pmr, pg);
435 			for (i = 0; i < pg->fpgsz; i++) {
436 				uvm_pagezero(&pg[i]);
437 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
438 				uvmexp.zeropages++;
439 			}
440 			uvm_pmr_insert(pmr, pg, 0);
441 		}
442 	}
443 	uvm_unlock_fpageq();
444 }
445 
446 /*
447  * Mark all memory as dirty.
448  *
449  * Used to inform the system that the clean memory isn't clean for some
450  * reason, for example because we just came back from hibernate.
451  */
452 void
453 uvm_pmr_dirty_everything(void)
454 {
455 	struct uvm_pmemrange	*pmr;
456 	struct vm_page		*pg;
457 	int			 i;
458 
459 	uvm_lock_fpageq();
460 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
461 		/* Dirty single pages. */
462 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
463 		    != NULL) {
464 			uvm_pmr_remove(pmr, pg);
465 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
466 			uvm_pmr_insert(pmr, pg, 0);
467 		}
468 
469 		/* Dirty multi page ranges. */
470 		while ((pg = RBT_ROOT(uvm_pmr_size,
471 		    &pmr->size[UVM_PMR_MEMTYPE_ZERO])) != NULL) {
472 			pg--; /* Size tree always has second page. */
473 			uvm_pmr_remove(pmr, pg);
474 			for (i = 0; i < pg->fpgsz; i++)
475 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
476 			uvm_pmr_insert(pmr, pg, 0);
477 		}
478 	}
479 
480 	uvmexp.zeropages = 0;
481 	uvm_unlock_fpageq();
482 }
483 
484 /*
485  * Allocate an area that can hold sz bytes and doesn't overlap with
486  * the piglet at piglet_pa.
487  */
488 int
489 uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
490 {
491 	struct uvm_constraint_range pig_constraint;
492 	struct kmem_pa_mode kp_pig = {
493 		.kp_constraint = &pig_constraint,
494 		.kp_maxseg = 1
495 	};
496 	vaddr_t va;
497 
498 	sz = round_page(sz);
499 
500 	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
501 	pig_constraint.ucr_high = -1;
502 
503 	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
504 	if (va == 0) {
505 		pig_constraint.ucr_low = 0;
506 		pig_constraint.ucr_high = piglet_pa - 1;
507 
508 		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
509 		if (va == 0)
510 			return ENOMEM;
511 	}
512 
513 	pmap_extract(pmap_kernel(), va, pa);
514 	return 0;
515 }
516 
517 /*
518  * Allocate a piglet area.
519  *
520  * This needs to be in DMA-safe memory.
521  * Piglets are aligned.
522  *
523  * sz and align in bytes.
524  */
525 int
526 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
527 {
528 	struct kmem_pa_mode kp_piglet = {
529 		.kp_constraint = &dma_constraint,
530 		.kp_align = align,
531 		.kp_maxseg = 1
532 	};
533 
534 	/* Ensure align is a power of 2 */
535 	KASSERT((align & (align - 1)) == 0);
536 
537 	/*
538 	 * Fixup arguments: align must be at least PAGE_SIZE,
539 	 * sz will be converted to pagecount, since that is what
540 	 * pmemrange uses internally.
541 	 */
542 	if (align < PAGE_SIZE)
543 		kp_piglet.kp_align = PAGE_SIZE;
544 
545 	sz = round_page(sz);
546 
547 	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
548 	if (*va == 0)
549 		return ENOMEM;
550 
551 	pmap_extract(pmap_kernel(), *va, pa);
552 	return 0;
553 }
554 
555 /*
556  * Free a piglet area.
557  */
558 void
559 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
560 {
561 	/*
562 	 * Fix parameters.
563 	 */
564 	sz = round_page(sz);
565 
566 	/*
567 	 * Free the physical and virtual memory.
568 	 */
569 	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
570 }
571 
572 /*
573  * Physmem RLE compression support.
574  *
575  * Given a physical page address, return the number of pages starting at the
576  * address that are free.  Clamps to the number of pages in
577  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
578  */
579 int
580 uvm_page_rle(paddr_t addr)
581 {
582 	struct vm_page		*pg, *pg_end;
583 	struct vm_physseg	*vmp;
584 	int			 pseg_idx, off_idx;
585 
586 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
587 	if (pseg_idx == -1)
588 		return 0;
589 
590 	vmp = &vm_physmem[pseg_idx];
591 	pg = &vmp->pgs[off_idx];
592 	if (!(pg->pg_flags & PQ_FREE))
593 		return 0;
594 
595 	/*
596 	 * Search for the first non-free page after pg.
597 	 * Note that the page may not be the first page in a free pmemrange,
598 	 * therefore pg->fpgsz cannot be used.
599 	 */
600 	for (pg_end = pg; pg_end <= vmp->lastpg &&
601 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE &&
602 	    (pg_end - pg) < HIBERNATE_CHUNK_SIZE/PAGE_SIZE; pg_end++)
603 		;
604 	return pg_end - pg;
605 }
606 
607 /*
608  * Fills out the hibernate_info union pointed to by hib
609  * with information about this machine (swap signature block
610  * offsets, number of memory ranges, kernel in use, etc)
611  */
612 int
613 get_hibernate_info(union hibernate_info *hib, int suspend)
614 {
615 	struct disklabel dl;
616 	char err_string[128], *dl_ret;
617 	int part;
618 	SHA2_CTX ctx;
619 	void *fn;
620 
621 #ifndef NO_PROPOLICE
622 	/* Save propolice guard */
623 	hib->guard = __guard_local;
624 #endif /* ! NO_PROPOLICE */
625 
626 	/* Determine I/O function to use */
627 	hib->io_func = get_hibernate_io_function(swdevt[0]);
628 	if (hib->io_func == NULL)
629 		return (1);
630 
631 	/* Calculate hibernate device */
632 	hib->dev = swdevt[0];
633 
634 	/* Read disklabel (used to calculate signature and image offsets) */
635 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));
636 
637 	if (dl_ret) {
638 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
639 		return (1);
640 	}
641 
642 	/* Make sure we have a swap partition. */
643 	part = DISKPART(hib->dev);
644 	if (dl.d_npartitions <= part ||
645 	    dl.d_secsize > sizeof(union hibernate_info) ||
646 	    dl.d_partitions[part].p_fstype != FS_SWAP ||
647 	    DL_GETPSIZE(&dl.d_partitions[part]) == 0)
648 		return (1);
649 
650 	/* Magic number */
651 	hib->magic = HIBERNATE_MAGIC;
652 
653 	/* Calculate signature block location */
654 	hib->sec_size = dl.d_secsize;
655 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[part]) - 1;
656 	hib->sig_offset = DL_SECTOBLK(&dl, hib->sig_offset);
657 
658 	SHA256Init(&ctx);
659 	SHA256Update(&ctx, version, strlen(version));
660 	fn = printf;
661 	SHA256Update(&ctx, &fn, sizeof(fn));
662 	fn = malloc;
663 	SHA256Update(&ctx, &fn, sizeof(fn));
664 	fn = km_alloc;
665 	SHA256Update(&ctx, &fn, sizeof(fn));
666 	fn = strlen;
667 	SHA256Update(&ctx, &fn, sizeof(fn));
668 	SHA256Final((u_int8_t *)&hib->kern_hash, &ctx);
669 
670 	if (suspend) {
671 		/* Grab the previously-allocated piglet addresses */
672 		hib->piglet_va = global_piglet_va;
673 		hib->piglet_pa = global_piglet_pa;
674 		hib->io_page = (void *)hib->piglet_va;
675 
676 		/*
677 		 * Initialization of the hibernate IO function for drivers
678 		 * that need to do prep work (such as allocating memory or
679 		 * setting up data structures that cannot safely be done
680 		 * during suspend without causing side effects). There is
681 		 * a matching HIB_DONE call performed after the write is
682 		 * completed.
683 		 */
684 		if (hib->io_func(hib->dev,
685 		    DL_SECTOBLK(&dl, DL_GETPOFFSET(&dl.d_partitions[part])),
686 		    (vaddr_t)NULL,
687 		    DL_SECTOBLK(&dl, DL_GETPSIZE(&dl.d_partitions[part])),
688 		    HIB_INIT, hib->io_page))
689 			goto fail;
690 
691 	} else {
692 		/*
693 		 * Resuming kernels use a regular private page for the driver
694 		 * No need to free this I/O page as it will vanish as part of
695 		 * the resume.
696 		 */
697 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
698 		if (!hib->io_page)
699 			goto fail;
700 	}
701 
702 	if (get_hibernate_info_md(hib))
703 		goto fail;
704 
705 	return (0);
706 
707 fail:
708 	return (1);
709 }
710 
711 /*
712  * Allocate nitems*size bytes from the hiballoc area presently in use
713  */
714 void *
715 hibernate_zlib_alloc(void *unused, int nitems, int size)
716 {
717 	struct hibernate_zlib_state *hibernate_state;
718 
719 	hibernate_state =
720 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
721 
722 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
723 }
724 
725 /*
726  * Free the memory pointed to by addr in the hiballoc area presently in
727  * use
728  */
729 void
730 hibernate_zlib_free(void *unused, void *addr)
731 {
732 	struct hibernate_zlib_state *hibernate_state;
733 
734 	hibernate_state =
735 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
736 
737 	hib_free(&hibernate_state->hiballoc_arena, addr);
738 }
739 
740 /*
741  * Inflate next page of data from the image stream.
742  * The rle parameter is modified on exit to contain the number of pages to
743  * skip in the output stream (or 0 if this page was inflated into).
744  *
745  * Returns 0 if the stream contains additional data, or 1 if the stream is
746  * finished.
747  */
748 int
749 hibernate_inflate_page(int *rle)
750 {
751 	struct hibernate_zlib_state *hibernate_state;
752 	int i;
753 
754 	hibernate_state =
755 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
756 
757 	/* Set up the stream for RLE code inflate */
758 	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
759 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
760 
761 	/* Inflate RLE code */
762 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
763 	if (i != Z_OK && i != Z_STREAM_END) {
764 		/*
765 		 * XXX - this will likely reboot/hang most machines
766 		 *       since the console output buffer will be unmapped,
767 		 *       but there's not much else we can do here.
768 		 */
769 		panic("rle inflate stream error");
770 	}
771 
772 	if (hibernate_state->hib_stream.avail_out != 0) {
773 		/*
774 		 * XXX - this will likely reboot/hang most machines
775 		 *       since the console output buffer will be unmapped,
776 		 *       but there's not much else we can do here.
777 		 */
778 		panic("rle short inflate error");
779 	}
780 
781 	if (*rle < 0 || *rle > 1024) {
782 		/*
783 		 * XXX - this will likely reboot/hang most machines
784 		 *       since the console output buffer will be unmapped,
785 		 *       but there's not much else we can do here.
786 		 */
787 		panic("invalid rle count");
788 	}
789 
790 	if (i == Z_STREAM_END)
791 		return (1);
792 
793 	if (*rle != 0)
794 		return (0);
795 
796 	/* Set up the stream for page inflate */
797 	hibernate_state->hib_stream.next_out =
798 		(unsigned char *)HIBERNATE_INFLATE_PAGE;
799 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
800 
801 	/* Process next block of data */
802 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
803 	if (i != Z_OK && i != Z_STREAM_END) {
804 		/*
805 		 * XXX - this will likely reboot/hang most machines
806 		 *       since the console output buffer will be unmapped,
807 		 *       but there's not much else we can do here.
808 		 */
809 		panic("inflate error");
810 	}
811 
812 	/* We should always have extracted a full page ... */
813 	if (hibernate_state->hib_stream.avail_out != 0) {
814 		/*
815 		 * XXX - this will likely reboot/hang most machines
816 		 *       since the console output buffer will be unmapped,
817 		 *       but there's not much else we can do here.
818 		 */
819 		panic("incomplete page");
820 	}
821 
822 	return (i == Z_STREAM_END);
823 }
824 
825 /*
826  * Inflate size bytes from src into dest, skipping any pages in
827  * [src..dest] that are special (see hibernate_inflate_skip)
828  *
829  * This function executes while using the resume-time stack
830  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
831  * will likely hang or reset the machine since the console output buffer
832  * will be unmapped.
833  */
834 void
835 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
836     paddr_t src, size_t size)
837 {
838 	int end_stream = 0, rle, skip;
839 	struct hibernate_zlib_state *hibernate_state;
840 
841 	hibernate_state =
842 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
843 
844 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
845 	hibernate_state->hib_stream.avail_in = size;
846 
847 	do {
848 		/*
849 		 * Is this a special page? If yes, redirect the
850 		 * inflate output to a scratch page (eg, discard it)
851 		 */
852 		skip = hibernate_inflate_skip(hib, dest);
853 		if (skip == HIB_SKIP) {
854 			hibernate_enter_resume_mapping(
855 			    HIBERNATE_INFLATE_PAGE,
856 			    HIBERNATE_INFLATE_PAGE, 0);
857 		} else if (skip == HIB_MOVE) {
858 			/*
859 			 * Special case : retguard region. This gets moved
860 			 * temporarily into the piglet region and copied into
861 			 * place immediately before resume
862 			 */
863 			hibernate_enter_resume_mapping(
864 			    HIBERNATE_INFLATE_PAGE,
865 			    hib->piglet_pa + (110 * PAGE_SIZE) +
866 			    hib->retguard_ofs, 0);
867 			hib->retguard_ofs += PAGE_SIZE;
868 			if (hib->retguard_ofs > 255 * PAGE_SIZE) {
869 				/*
870 				 * XXX - this will likely reboot/hang most
871 				 *       machines since the console output
872 				 *       buffer will be unmapped, but there's
873 				 *       not much else we can do here.
874 				 */
875 				panic("retguard move error, out of space");
876 			}
877 		} else {
878 			hibernate_enter_resume_mapping(
879 			    HIBERNATE_INFLATE_PAGE, dest, 0);
880 		}
881 
882 		hibernate_flush();
883 		end_stream = hibernate_inflate_page(&rle);
884 
885 		if (rle == 0)
886 			dest += PAGE_SIZE;
887 		else
888 			dest += (rle * PAGE_SIZE);
889 	} while (!end_stream);
890 }
891 
892 /*
893  * deflate from src into the I/O page, up to 'remaining' bytes
894  *
895  * Returns number of input bytes consumed, and may reset
896  * the 'remaining' parameter if not all the output space was consumed
897  * (this information is needed to know how much to write to disk)
898  */
899 size_t
900 hibernate_deflate(union hibernate_info *hib, paddr_t src,
901     size_t *remaining)
902 {
903 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
904 	struct hibernate_zlib_state *hibernate_state;
905 
906 	hibernate_state =
907 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
908 
909 	/* Set up the stream for deflate */
910 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
911 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
912 	hibernate_state->hib_stream.next_out =
913 		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
914 	hibernate_state->hib_stream.avail_out = *remaining;
915 
916 	/* Process next block of data */
917 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
918 		panic("hibernate zlib deflate error");
919 
920 	/* Update pointers and return number of bytes consumed */
921 	*remaining = hibernate_state->hib_stream.avail_out;
922 	return (PAGE_SIZE - (src & PAGE_MASK)) -
923 	    hibernate_state->hib_stream.avail_in;
924 }
925 
926 /*
927  * Write the hibernation information specified in hiber_info
928  * to the location in swap previously calculated (last block of
929  * swap), called the "signature block".
930  */
931 int
932 hibernate_write_signature(union hibernate_info *hib)
933 {
934 	memset(&disk_hib, 0, hib->sec_size);
935 	memcpy(&disk_hib, hib, DEV_BSIZE);
936 
937 	/* Write hibernate info to disk */
938 	return (hibernate_write(hib, hib->sig_offset,
939 	    (vaddr_t)&disk_hib, hib->sec_size, IO_TYPE_SIG));
940 }
941 
942 /*
943  * Write the memory chunk table to the area in swap immediately
944  * preceding the signature block. The chunk table is stored
945  * in the piglet when this function is called.  Returns errno.
946  */
947 int
948 hibernate_write_chunktable(union hibernate_info *hib)
949 {
950 	vaddr_t hibernate_chunk_table_start;
951 	size_t hibernate_chunk_table_size;
952 	int i, err;
953 
954 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
955 
956 	hibernate_chunk_table_start = hib->piglet_va +
957 	    HIBERNATE_CHUNK_SIZE;
958 
959 	/* Write chunk table */
960 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
961 		if ((err = hibernate_write(hib, btodb(i),
962 		    (vaddr_t)(hibernate_chunk_table_start + i),
963 		    MAXPHYS, IO_TYPE_CHK))) {
964 			DPRINTF("chunktable write error: %d\n", err);
965 			return (err);
966 		}
967 	}
968 
969 	return (0);
970 }
971 
972 /*
973  * Write an empty hiber_info to the swap signature block, which is
974  * guaranteed to not match any valid hib.
975  */
976 int
977 hibernate_clear_signature(union hibernate_info *hib)
978 {
979 	uint8_t buf[DEV_BSIZE];
980 
981 	/* Zero out a blank hiber_info */
982 	memcpy(&buf, &disk_hib, sizeof(buf));
983 	memset(&disk_hib, 0, hib->sec_size);
984 
985 	/* Write (zeroed) hibernate info to disk */
986 	DPRINTF("clearing hibernate signature block location: %lld\n",
987 		hib->sig_offset);
988 	if (hibernate_block_io(hib,
989 	    hib->sig_offset,
990 	    hib->sec_size, (vaddr_t)&disk_hib, 1))
991 		printf("Warning: could not clear hibernate signature\n");
992 
993 	memcpy(&disk_hib, buf, sizeof(buf));
994 	return (0);
995 }
996 
997 /*
998  * Compare two hibernate_infos to determine if they are the same (eg,
999  * we should be performing a hibernate resume on this machine.
1000  * Not all fields are checked - just enough to verify that the machine
1001  * has the same memory configuration and kernel as the one that
1002  * wrote the signature previously.
1003  */
1004 int
1005 hibernate_compare_signature(union hibernate_info *mine,
1006     union hibernate_info *disk)
1007 {
1008 	u_int i;
1009 
1010 	if (mine->nranges != disk->nranges) {
1011 		printf("unhibernate failed: memory layout changed\n");
1012 		return (1);
1013 	}
1014 
1015 	if (bcmp(mine->kern_hash, disk->kern_hash, SHA256_DIGEST_LENGTH) != 0) {
1016 		printf("unhibernate failed: original kernel changed\n");
1017 		return (1);
1018 	}
1019 
1020 	for (i = 0; i < mine->nranges; i++) {
1021 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
1022 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
1023 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
1024 				i,
1025 				(void *)mine->ranges[i].base,
1026 				(void *)mine->ranges[i].end,
1027 				(void *)disk->ranges[i].base,
1028 				(void *)disk->ranges[i].end);
1029 			printf("unhibernate failed: memory size changed\n");
1030 			return (1);
1031 		}
1032 	}
1033 
1034 	return (0);
1035 }
1036 
1037 /*
1038  * Transfers xfer_size bytes between the hibernate device specified in
1039  * hib_info at offset blkctr and the vaddr specified at dest.
1040  *
1041  * Separate offsets and pages are used to handle misaligned reads (reads
1042  * that span a page boundary).
1043  *
1044  * blkctr specifies a relative offset (relative to the start of swap),
1045  * not an absolute disk offset
1046  *
1047  */
1048 int
1049 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
1050     size_t xfer_size, vaddr_t dest, int iswrite)
1051 {
1052 	struct buf *bp;
1053 	int error;
1054 
1055 	bp = geteblk(xfer_size);
1056 	if (iswrite)
1057 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
1058 
1059 	bp->b_bcount = xfer_size;
1060 	bp->b_blkno = blkctr;
1061 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1062 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
1063 	bp->b_dev = hib->dev;
1064 	(*bdsw->d_strategy)(bp);
1065 
1066 	error = biowait(bp);
1067 	if (error) {
1068 		printf("hib block_io biowait error %d blk %lld size %zu\n",
1069 			error, (long long)blkctr, xfer_size);
1070 	} else if (!iswrite)
1071 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
1072 
1073 	bp->b_flags |= B_INVAL;
1074 	brelse(bp);
1075 
1076 	return (error != 0);
1077 }
1078 
1079 /*
1080  * Preserve one page worth of random data, generated from the resuming
1081  * kernel's arc4random. After resume, this preserved entropy can be used
1082  * to further improve the un-hibernated machine's entropy pool. This
1083  * random data is stored in the piglet, which is preserved across the
1084  * unpack operation, and is restored later in the resume process (see
1085  * hib_getentropy)
1086  */
1087 void
1088 hibernate_preserve_entropy(union hibernate_info *hib)
1089 {
1090 	void *entropy;
1091 
1092 	entropy = km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
1093 
1094 	if (!entropy)
1095 		return;
1096 
1097 	pmap_activate(curproc);
1098 	pmap_kenter_pa((vaddr_t)entropy,
1099 	    (paddr_t)(hib->piglet_pa + (29 * PAGE_SIZE)),
1100 	    PROT_READ | PROT_WRITE);
1101 
1102 	arc4random_buf((void *)entropy, PAGE_SIZE);
1103 	pmap_kremove((vaddr_t)entropy, PAGE_SIZE);
1104 	km_free(entropy, PAGE_SIZE, &kv_any, &kp_none);
1105 }
1106 
1107 #ifndef NO_PROPOLICE
1108 vaddr_t
1109 hibernate_unprotect_ssp(void)
1110 {
1111 	struct kmem_dyn_mode kd_avoidalias;
1112 	vaddr_t va = trunc_page((vaddr_t)&__guard_local);
1113 	paddr_t pa;
1114 
1115 	pmap_extract(pmap_kernel(), va, &pa);
1116 
1117 	memset(&kd_avoidalias, 0, sizeof kd_avoidalias);
1118 	kd_avoidalias.kd_prefer = pa;
1119 	kd_avoidalias.kd_waitok = 1;
1120 	va = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any, &kp_none, &kd_avoidalias);
1121 	if (!va)
1122 		panic("hibernate_unprotect_ssp");
1123 
1124 	pmap_kenter_pa(va, pa, PROT_READ | PROT_WRITE);
1125 	pmap_update(pmap_kernel());
1126 
1127 	return va;
1128 }
1129 
1130 void
1131 hibernate_reprotect_ssp(vaddr_t va)
1132 {
1133 	pmap_kremove(va, PAGE_SIZE);
1134 	km_free((void *)va, PAGE_SIZE, &kv_any, &kp_none);
1135 }
1136 #endif /* NO_PROPOLICE */
1137 
/*
 * Reads the signature block from swap, checks against the current machine's
 * information. If the information matches, perform a resume by reading the
 * saved image into the pig area, and unpacking.
 *
 * On success this function does not return: control passes to
 * hibernate_unpack_image(), which jumps into the resumed kernel. It only
 * returns when no usable image was found or when one of the resume steps
 * failed; in that case the running kernel simply carries on.
 *
 * Must be called with interrupts enabled.
 */
void
hibernate_resume(void)
{
	uint8_t buf[DEV_BSIZE];
	union hibernate_info *hib = (union hibernate_info *)&buf;
	int s;
#ifndef NO_PROPOLICE
	/* Byte offset of __guard_local inside the page that contains it */
	vsize_t off = (vaddr_t)&__guard_local -
	    trunc_page((vaddr_t)&__guard_local);
	vaddr_t guard_va;
#endif

	/* Get current running machine's hibernate info */
	memset(buf, 0, sizeof(buf));
	if (get_hibernate_info(hib, 0)) {
		DPRINTF("couldn't retrieve machine's hibernate info\n");
		return;
	}

	/* Read hibernate info from disk */
	s = splbio();

	/*
	 * bdsw is shared with hibernate_block_io(), which issues its
	 * transfers through bdsw->d_strategy.
	 */
	bdsw = &bdevsw[major(hib->dev)];
	if ((*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc)) {
		printf("hibernate_resume device open failed\n");
		splx(s);
		return;
	}

	DPRINTF("reading hibernate signature block location: %lld\n",
		hib->sig_offset);

	if (hibernate_block_io(hib,
	    hib->sig_offset,
	    hib->sec_size, (vaddr_t)&disk_hib, 0)) {
		DPRINTF("error in hibernate read\n");
		goto fail;
	}

	/* Check magic number */
	if (disk_hib.magic != HIBERNATE_MAGIC) {
		DPRINTF("wrong magic number in hibernate signature: %x\n",
			disk_hib.magic);
		goto fail;
	}

	/*
	 * We (possibly) found a hibernate signature. Clear signature first,
	 * to prevent accidental resume or endless resume cycles later.
	 *
	 * NOTE(review): hibernate_clear_signature() as written in this file
	 * always returns 0 (a failed write only prints a warning), so the
	 * error branch below is currently never taken.
	 */
	if (hibernate_clear_signature(hib)) {
		DPRINTF("error clearing hibernate signature block\n");
		goto fail;
	}

	/*
	 * If on-disk and in-memory hibernate signatures match,
	 * this means we should do a resume from hibernate.
	 */
	if (hibernate_compare_signature(hib, &disk_hib)) {
		DPRINTF("mismatched hibernate signature block\n");
		goto fail;
	}
	/* Use the device as seen by this kernel, not the dev_t stored on disk */
	disk_hib.dev = hib->dev;

#ifdef MULTIPROCESSOR
	/* XXX - if we fail later, we may need to rehatch APs on some archs */
	DPRINTF("hibernate: quiescing APs\n");
	hibernate_quiesce_cpus();
#endif /* MULTIPROCESSOR */

	/* Read the image from disk into the image (pig) area */
	if (hibernate_read_image(&disk_hib))
		goto fail;
	if ((*bdsw->d_close)(hib->dev, 0, S_IFCHR, curproc))
		printf("hibernate_resume device close failed\n");
	/*
	 * The device is closed now. A NULL bdsw tells the 'fail' path below
	 * not to close it a second time.
	 */
	bdsw = NULL;

	DPRINTF("hibernate: quiescing devices\n");
	if (config_suspend_all(DVACT_QUIESCE) != 0)
		goto fail;

#ifndef NO_PROPOLICE
	/* Writable alias of the guard page; the guard is replaced below */
	guard_va = hibernate_unprotect_ssp();
#endif /* NO_PROPOLICE */

	(void) splhigh();
	hibernate_disable_intr_machdep();
	cold = 2;

	DPRINTF("hibernate: suspending devices\n");
	if (config_suspend_all(DVACT_SUSPEND) != 0) {
		/* Back out of the steps taken since the devices were quiesced */
		cold = 0;
		hibernate_enable_intr_machdep();
#ifndef NO_PROPOLICE
		hibernate_reprotect_ssp(guard_va);
#endif /* ! NO_PROPOLICE */
		goto fail;
	}

	/* Record the physical addresses of the retguard region boundaries */
	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_start,
	    &retguard_start_phys);
	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_end,
	    &retguard_end_phys);

	hibernate_preserve_entropy(&disk_hib);

	printf("Unpacking image...\n");

	/* Switch stacks */
	DPRINTF("hibernate: switching stacks\n");
	hibernate_switch_stack_machdep();

#ifndef NO_PROPOLICE
	/* Start using suspended kernel's propolice guard */
	*(long *)(guard_va + off) = disk_hib.guard;
	hibernate_reprotect_ssp(guard_va);
#endif /* ! NO_PROPOLICE */

	/*
	 * Unpack and resume. hibernate_unpack_image() documents that it must
	 * not return, so the code below is only reached through 'goto fail'.
	 */
	hibernate_unpack_image(&disk_hib);

fail:
	/*
	 * bdsw == NULL means the image had already been read and the device
	 * closed when a later step failed; otherwise the device is still
	 * open and is closed here.
	 */
	if (!bdsw)
		printf("\nUnable to resume hibernated image\n");
	else if ((*bdsw->d_close)(hib->dev, 0, S_IFCHR, curproc))
		printf("hibernate_resume device close failed\n");
	splx(s);
}
1274 
/*
 * Unpack image from pig area to original location by looping through the
 * list of output chunks in the order they should be restored (fchunks).
 *
 * hib - signature of the image being resumed; it is copied to a local
 *       buffer before the unpack starts and is not referenced afterwards.
 *
 * Note that due to the stack smash protector and the fact that we have
 * switched stacks, it is not permitted to return from this function.
 */
void
hibernate_unpack_image(union hibernate_info *hib)
{
	uint8_t buf[DEV_BSIZE];
	struct hibernate_disk_chunk *chunks;
	union hibernate_info *local_hib = (union hibernate_info *)&buf;
	/* Captured here because globals may not be read once unpack starts */
	paddr_t image_cur = global_pig_start;
	short i, *fchunks;
	char *pva;

	/* Piglet will be identity mapped (VA == PA) */
	pva = (char *)hib->piglet_pa;

	/*
	 * Chunk ordering list: hibernate_read_chunks() stores it in the
	 * piglet starting at page 4.
	 */
	fchunks = (short *)(pva + (4 * PAGE_SIZE));

	/*
	 * Chunk table: hibernate_read_image() reads it into the piglet at
	 * offset HIBERNATE_CHUNK_SIZE.
	 */
	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);

	/* Can't use hiber_info that's passed in after this point */
	memcpy(buf, hib, sizeof(buf));
	/* NOTE(review): why retguard_ofs is cleared is not evident from here */
	local_hib->retguard_ofs = 0;

	/* VA == PA */
	local_hib->piglet_va = local_hib->piglet_pa;

	/*
	 * Point of no return. Once we pass this point, only kernel code can
	 * be accessed. No global variables or other kernel data structures
	 * are guaranteed to be coherent after unpack starts.
	 *
	 * The image is now in high memory (pig area), we unpack from the pig
	 * to the correct location in memory. We'll eventually end up copying
	 * on top of ourself, but we are assured the kernel code here is the
	 * same between the hibernated and resuming kernel, and we are running
	 * on our own stack, so the overwrite is ok.
	 */
	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
	hibernate_activate_resume_pt_machdep();

	/*
	 * The compressed chunks sit back to back in the pig area in fchunks
	 * order (that is how hibernate_read_chunks() lays them out), so
	 * image_cur simply advances by each chunk's compressed size.
	 */
	for (i = 0; i < local_hib->chunk_ctr; i++) {
		/* Reset zlib for inflate */
		if (hibernate_zlib_reset(local_hib, 0) != Z_OK)
			panic("hibernate failed to reset zlib for inflate");

		hibernate_process_chunk(local_hib, &chunks[fchunks[i]],
		    image_cur);

		image_cur += chunks[fchunks[i]].compressed_size;
	}

	/*
	 * Resume the loaded kernel by jumping to the MD resume vector.
	 * We won't be returning from this call. We pass the location of
	 * the retguard save area so the MD code can replace it before
	 * resuming. See the piglet layout at the top of this file for
	 * more information on the layout of the piglet area.
	 *
	 * We use 'global_piglet_va' here since by the time we are at
	 * this point, we have already unpacked the image, and we want
	 * the suspended kernel's view of what the piglet was, before
	 * suspend occurred (since we will need to use that in the retguard
	 * copy code in hibernate_resume_machdep.)
	 */
	hibernate_resume_machdep(global_piglet_va + (110 * PAGE_SIZE));
}
1346 
1347 /*
1348  * Bounce a compressed image chunk to the piglet, entering mappings for the
1349  * copied pages as needed
1350  */
1351 void
1352 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1353 {
1354 	size_t ct, ofs;
1355 	paddr_t src = img_cur;
1356 	vaddr_t dest = piglet;
1357 
1358 	/* Copy first partial page */
1359 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1360 	ofs = (src & PAGE_MASK);
1361 
1362 	if (ct < PAGE_SIZE) {
1363 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1364 			(src - ofs), 0);
1365 		hibernate_flush();
1366 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1367 		src += ct;
1368 		dest += ct;
1369 	}
1370 
1371 	/* Copy remaining pages */
1372 	while (src < size + img_cur) {
1373 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1374 		hibernate_flush();
1375 		ct = PAGE_SIZE;
1376 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1377 		hibernate_flush();
1378 		src += ct;
1379 		dest += ct;
1380 	}
1381 }
1382 
1383 /*
1384  * Process a chunk by bouncing it to the piglet, followed by unpacking
1385  */
1386 void
1387 hibernate_process_chunk(union hibernate_info *hib,
1388     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1389 {
1390 	char *pva = (char *)hib->piglet_va;
1391 
1392 	hibernate_copy_chunk_to_piglet(img_cur,
1393 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1394 	hibernate_inflate_region(hib, chunk->base,
1395 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1396 	    chunk->compressed_size);
1397 }
1398 
1399 /*
1400  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1401  * inaddr and range_end.
1402  */
1403 int
1404 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1405 {
1406 	int rle;
1407 
1408 	rle = uvm_page_rle(inaddr);
1409 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1410 
1411 	/* Clamp RLE to range end */
1412 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1413 		rle = (range_end - inaddr) / PAGE_SIZE;
1414 
1415 	return (rle);
1416 }
1417 
1418 /*
1419  * Write the RLE byte for page at 'inaddr' to the output stream.
1420  * Returns the number of pages to be skipped at 'inaddr'.
1421  */
1422 int
1423 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1424 	paddr_t range_end, daddr_t *blkctr,
1425 	size_t *out_remaining)
1426 {
1427 	int rle, err, *rleloc;
1428 	struct hibernate_zlib_state *hibernate_state;
1429 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1430 
1431 	hibernate_state =
1432 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1433 
1434 	rle = hibernate_calc_rle(inaddr, range_end);
1435 
1436 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1437 	*rleloc = rle;
1438 
1439 	/* Deflate the RLE byte into the stream */
1440 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1441 
1442 	/* Did we fill the output page? If so, flush to disk */
1443 	if (*out_remaining == 0) {
1444 		if ((err = hibernate_write(hib, *blkctr,
1445 			(vaddr_t)hibernate_io_page, PAGE_SIZE, IO_TYPE_IMG))) {
1446 				DPRINTF("hib write error %d\n", err);
1447 				return -1;
1448 		}
1449 
1450 		*blkctr += btodb(PAGE_SIZE);
1451 		*out_remaining = PAGE_SIZE;
1452 
1453 		/* If we didn't deflate the entire RLE byte, finish it now */
1454 		if (hibernate_state->hib_stream.avail_in != 0)
1455 			hibernate_deflate(hib,
1456 				(vaddr_t)hibernate_state->hib_stream.next_in,
1457 				out_remaining);
1458 	}
1459 
1460 	return (rle);
1461 }
1462 
/*
 * Write a compressed version of this machine's memory to disk, at the
 * precalculated swap offset:
 *
 * end of swap - signature block size - chunk table size - memory size
 *
 * The function begins by looping through each phys mem range, cutting each
 * one into MD sized chunks. These chunks are then compressed individually
 * and written out to disk, in phys mem order. Some chunks might compress
 * more than others, and for this reason, each chunk's size is recorded
 * in the chunk table, which is written to disk after the image has
 * properly been compressed and written (in hibernate_write_chunktable).
 *
 * When this function is called, the machine is nearly suspended - most
 * devices are quiesced/suspended, interrupts are off, and cold has
 * been set. This means that there can be no side effects once the
 * write has started, and the write function itself can also have no
 * side effects. This also means no printfs are permitted (since printf
 * has side effects.)
 *
 * Side effects visible to the caller: hib->chunk_ctr is set to the number
 * of chunks, and the chunk table in the piglet (at piglet_va +
 * HIBERNATE_CHUNK_SIZE) receives base, end, offset and compressed_size
 * for every chunk.
 *
 * Return values :
 *
 * 0      - success
 * EIO    - I/O error occurred writing the chunks
 * EINVAL - Failed to write a complete range
 * ENOMEM - Memory allocation failure during preparation of the zlib arena
 *
 * NOTE(review): two paths return other values than the ones listed
 * above. A failing hibernate_write() in this function returns that
 * function's error value unchanged, and a failing final deflate() returns
 * the zlib status code. Callers in this file only test for non-zero.
 */
int
hibernate_write_chunks(union hibernate_info *hib)
{
	paddr_t range_base, range_end, inaddr, temp_inaddr;
	size_t out_remaining, used;
	struct hibernate_disk_chunk *chunks;
	/* Compressed output is collected in the piglet starting at page 1 */
	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
	/* Block offset of the next write; chunk offsets are derived from it */
	daddr_t blkctr = 0;
	int i, rle, err;
	struct hibernate_zlib_state *hibernate_state;

	hibernate_state =
	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;

	hib->chunk_ctr = 0;

	/*
	 * Map the utility VAs to the piglet. See the piglet map at the
	 * top of this file for piglet layout information.
	 */
	hibernate_copy_page = hib->piglet_va + 3 * PAGE_SIZE;
	hibernate_rle_page = hib->piglet_va + 28 * PAGE_SIZE;

	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
	    HIBERNATE_CHUNK_SIZE);

	/*
	 * Calculate the chunk regions: every memory range is cut into
	 * pieces of at most HIBERNATE_CHUNK_SIZE bytes.
	 *
	 * NOTE(review): chunk_ctr is not checked against the capacity of
	 * the chunk table here.
	 */
	for (i = 0; i < hib->nranges; i++) {
		range_base = hib->ranges[i].base;
		range_end = hib->ranges[i].end;

		inaddr = range_base;

		while (inaddr < range_end) {
			chunks[hib->chunk_ctr].base = inaddr;
			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
				chunks[hib->chunk_ctr].end = inaddr +
				    HIBERNATE_CHUNK_SIZE;
			else
				chunks[hib->chunk_ctr].end = range_end;

			inaddr += HIBERNATE_CHUNK_SIZE;
			hib->chunk_ctr ++;
		}
	}

	/*
	 * NOTE(review): presumably this leaves every free page zeroed so
	 * that uvm_page_rle() can report runs of them - confirm in uvm_pmr.
	 */
	uvm_pmr_dirty_everything();
	uvm_pmr_zero_everything();

	/* Compress and write the chunks in the chunktable */
	for (i = 0; i < hib->chunk_ctr; i++) {
		range_base = chunks[i].base;
		range_end = chunks[i].end;

		/* Offset of this chunk's data, counted from the first chunk */
		chunks[i].offset = blkctr;

		/* Reset zlib for deflate */
		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
			DPRINTF("hibernate_zlib_reset failed for deflate\n");
			return (ENOMEM);
		}

		inaddr = range_base;

		/*
		 * For each range, loop through its phys mem region
		 * and write out the chunks (the last chunk might be
		 * smaller than the chunk size).
		 */
		while (inaddr < range_end) {
			/* Each pass of this loop starts a fresh output page */
			out_remaining = PAGE_SIZE;
			while (out_remaining > 0 && inaddr < range_end) {
				/*
				 * Adjust for regions that are not evenly
				 * divisible by PAGE_SIZE or overflowed
				 * pages from the previous iteration.
				 */
				temp_inaddr = (inaddr & PAGE_MASK) +
				    hibernate_copy_page;

				/*
				 * Deflate from temp_inaddr to IO page.
				 * (The loop condition already guarantees
				 * inaddr < range_end, so this test always
				 * holds.)
				 */
				if (inaddr != range_end) {
					/*
					 * An RLE value is only emitted at
					 * the start of a page; when resuming
					 * in the middle of a page rle stays
					 * 0 and the rest is deflated.
					 */
					rle = 0;
					if (inaddr % PAGE_SIZE == 0) {
						rle = hibernate_write_rle(hib,
							inaddr,
							range_end,
							&blkctr,
							&out_remaining);
					}

					switch (rle) {
					case -1:
						/* Flush inside write_rle failed */
						return EIO;
					case 0:
						/*
						 * Ordinary page: map it, copy
						 * it to the copy page and
						 * deflate from there. inaddr
						 * advances by the value
						 * hibernate_deflate() returns.
						 */
						pmap_kenter_pa(hibernate_temp_page,
							inaddr & PMAP_PA_MASK,
							PROT_READ);

						bcopy((caddr_t)hibernate_temp_page,
							(caddr_t)hibernate_copy_page,
							PAGE_SIZE);
						inaddr += hibernate_deflate(hib,
							temp_inaddr,
							&out_remaining);
						break;
					default:
						/*
						 * Run of 'rle' pages that is
						 * represented by the RLE value
						 * alone: skip over them.
						 */
						inaddr += rle * PAGE_SIZE;
						if (inaddr > range_end)
							inaddr = range_end;
						break;
					}

				}

				if (out_remaining == 0) {
					/* Filled up the page */
					if ((err = hibernate_write(hib, blkctr,
					    (vaddr_t)hibernate_io_page,
					    PAGE_SIZE, IO_TYPE_IMG))) {
						DPRINTF("hib write error %d\n",
						    err);
						return (err);
					}
					blkctr += btodb(PAGE_SIZE);
				}
			}
		}

		if (inaddr != range_end) {
			DPRINTF("deflate range ended prematurely\n");
			return (EINVAL);
		}

		/*
		 * End of range. Round up to next secsize bytes
		 * after finishing compress
		 */
		if (out_remaining == 0)
			out_remaining = PAGE_SIZE;

		/* Finish compress */
		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
		hibernate_state->hib_stream.avail_in = 0;
		/* Continue right behind what is already in the output page */
		hibernate_state->hib_stream.next_out =
		    (unsigned char *)hibernate_io_page +
			(PAGE_SIZE - out_remaining);

		/* We have an extra output page available for finalize */
		hibernate_state->hib_stream.avail_out =
			out_remaining + PAGE_SIZE;

		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
		    Z_STREAM_END) {
			DPRINTF("deflate error in output stream: %d\n", err);
			return (err);
		}

		out_remaining = hibernate_state->hib_stream.avail_out;

		/*
		 * Round up to next sector if needed. The two pages that were
		 * available minus what is left over gives the bytes in use.
		 */
		used = ROUNDUP(2 * PAGE_SIZE - out_remaining, hib->sec_size);

		/* Write final block(s) for this chunk */
		if ((err = hibernate_write(hib, blkctr,
		    (vaddr_t)hibernate_io_page, used, IO_TYPE_IMG))) {
			DPRINTF("hib final write error %d\n", err);
			return (err);
		}

		blkctr += btodb(used);

		/* On-disk size of this chunk, in bytes */
		chunks[i].compressed_size = dbtob(blkctr - chunks[i].offset);
	}

	return (0);
}
1667 
1668 /*
1669  * Reset the zlib stream state and allocate a new hiballoc area for either
1670  * inflate or deflate. This function is called once for each hibernate chunk.
1671  * Calling hiballoc_init multiple times is acceptable since the memory it is
1672  * provided is unmanaged memory (stolen). We use the memory provided to us
1673  * by the piglet allocated via the supplied hib.
1674  */
1675 int
1676 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1677 {
1678 	vaddr_t hibernate_zlib_start;
1679 	size_t hibernate_zlib_size;
1680 	char *pva = (char *)hib->piglet_va;
1681 	struct hibernate_zlib_state *hibernate_state;
1682 
1683 	hibernate_state =
1684 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1685 
1686 	if (!deflate)
1687 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1688 
1689 	/*
1690 	 * See piglet layout information at the start of this file for
1691 	 * information on the zlib page assignments.
1692 	 */
1693 	hibernate_zlib_start = (vaddr_t)(pva + (30 * PAGE_SIZE));
1694 	hibernate_zlib_size = 80 * PAGE_SIZE;
1695 
1696 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1697 	memset(hibernate_state, 0, PAGE_SIZE);
1698 
1699 	/* Set up stream structure */
1700 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1701 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1702 
1703 	/* Initialize the hiballoc arena for zlib allocs/frees */
1704 	if (hiballoc_init(&hibernate_state->hiballoc_arena,
1705 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size))
1706 		return 1;
1707 
1708 	if (deflate) {
1709 		return deflateInit(&hibernate_state->hib_stream,
1710 		    Z_BEST_SPEED);
1711 	} else
1712 		return inflateInit(&hibernate_state->hib_stream);
1713 }
1714 
1715 /*
1716  * Reads the hibernated memory image from disk, whose location and
1717  * size are recorded in hib. Begin by reading the persisted
1718  * chunk table, which records the original chunk placement location
1719  * and compressed size for each. Next, allocate a pig region of
1720  * sufficient size to hold the compressed image. Next, read the
1721  * chunks into the pig area (calling hibernate_read_chunks to do this),
1722  * and finally, if all of the above succeeds, clear the hibernate signature.
1723  * The function will then return to hibernate_resume, which will proceed
1724  * to unpack the pig image to the correct place in memory.
1725  */
1726 int
1727 hibernate_read_image(union hibernate_info *hib)
1728 {
1729 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1730 	paddr_t image_start, image_end, pig_start, pig_end;
1731 	struct hibernate_disk_chunk *chunks;
1732 	daddr_t blkctr;
1733 	vaddr_t chunktable = (vaddr_t)NULL;
1734 	paddr_t piglet_chunktable = hib->piglet_pa +
1735 	    HIBERNATE_CHUNK_SIZE;
1736 	int i, status;
1737 
1738 	status = 0;
1739 	pmap_activate(curproc);
1740 
1741 	/* Calculate total chunk table size in disk blocks */
1742 	chunktable_size = btodb(HIBERNATE_CHUNK_TABLE_SIZE);
1743 
1744 	blkctr = hib->chunktable_offset;
1745 
1746 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1747 	    &kp_none, &kd_nowait);
1748 
1749 	if (!chunktable)
1750 		return (1);
1751 
1752 	/* Map chunktable pages */
1753 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1754 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1755 		    PROT_READ | PROT_WRITE);
1756 	pmap_update(pmap_kernel());
1757 
1758 	/* Read the chunktable from disk into the piglet chunktable */
1759 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1760 	    i += MAXPHYS, blkctr += btodb(MAXPHYS)) {
1761 		if (hibernate_block_io(hib, blkctr, MAXPHYS,
1762 		    chunktable + i, 0)) {
1763 			status = 1;
1764 			goto unmap;
1765 		}
1766 	}
1767 
1768 	blkctr = hib->image_offset;
1769 	compressed_size = 0;
1770 
1771 	chunks = (struct hibernate_disk_chunk *)chunktable;
1772 
1773 	for (i = 0; i < hib->chunk_ctr; i++)
1774 		compressed_size += chunks[i].compressed_size;
1775 
1776 	disk_size = compressed_size;
1777 
1778 	printf("unhibernating @ block %lld length %luMB\n",
1779 	    hib->image_offset, compressed_size / (1024 * 1024));
1780 
1781 	/* Allocate the pig area */
1782 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1783 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1784 		status = 1;
1785 		goto unmap;
1786 	}
1787 
1788 	pig_end = pig_start + pig_sz;
1789 
1790 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1791 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1792 	image_start = image_end - disk_size;
1793 
1794 	if (hibernate_read_chunks(hib, image_start, image_end, disk_size,
1795 	    chunks)) {
1796 		status = 1;
1797 		goto unmap;
1798 	}
1799 
1800 	/* Prepare the resume time pmap/page table */
1801 	hibernate_populate_resume_pt(hib, image_start, image_end);
1802 
1803 unmap:
1804 	/* Unmap chunktable pages */
1805 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1806 	pmap_update(pmap_kernel());
1807 
1808 	return (status);
1809 }
1810 
1811 /*
1812  * Read the hibernated memory chunks from disk (chunk information at this
1813  * point is stored in the piglet) into the pig area specified by
1814  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1815  * only chunk with overlap possibilities.
1816  */
1817 int
1818 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1819     paddr_t pig_end, size_t image_compr_size,
1820     struct hibernate_disk_chunk *chunks)
1821 {
1822 	paddr_t img_cur, piglet_base;
1823 	daddr_t blkctr;
1824 	size_t processed, compressed_size, read_size;
1825 	int err, nchunks, nfchunks, num_io_pages;
1826 	vaddr_t tempva, hibernate_fchunk_area;
1827 	short *fchunks, i, j;
1828 
1829 	tempva = (vaddr_t)NULL;
1830 	hibernate_fchunk_area = (vaddr_t)NULL;
1831 	nfchunks = 0;
1832 	piglet_base = hib->piglet_pa;
1833 	global_pig_start = pig_start;
1834 
1835 	/*
1836 	 * These mappings go into the resuming kernel's page table, and are
1837 	 * used only during image read. They disappear from existence
1838 	 * when the suspended kernel is unpacked on top of us.
1839 	 */
1840 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1841 		&kd_nowait);
1842 	if (!tempva)
1843 		return (1);
1844 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1845 	    &kp_none, &kd_nowait);
1846 	if (!hibernate_fchunk_area)
1847 		return (1);
1848 
1849 	/* Final output chunk ordering VA */
1850 	fchunks = (short *)hibernate_fchunk_area;
1851 
1852 	/* Map the chunk ordering region */
1853 	for(i = 0; i < 24 ; i++)
1854 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1855 			piglet_base + ((4 + i) * PAGE_SIZE),
1856 			PROT_READ | PROT_WRITE);
1857 	pmap_update(pmap_kernel());
1858 
1859 	nchunks = hib->chunk_ctr;
1860 
1861 	/* Initially start all chunks as unplaced */
1862 	for (i = 0; i < nchunks; i++)
1863 		chunks[i].flags = 0;
1864 
1865 	/*
1866 	 * Search the list for chunks that are outside the pig area. These
1867 	 * can be placed first in the final output list.
1868 	 */
1869 	for (i = 0; i < nchunks; i++) {
1870 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1871 			fchunks[nfchunks] = i;
1872 			nfchunks++;
1873 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1874 		}
1875 	}
1876 
1877 	/*
1878 	 * Walk the ordering, place the chunks in ascending memory order.
1879 	 */
1880 	for (i = 0; i < nchunks; i++) {
1881 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1882 			fchunks[nfchunks] = i;
1883 			nfchunks++;
1884 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1885 		}
1886 	}
1887 
1888 	img_cur = pig_start;
1889 
1890 	for (i = 0, err = 0; i < nfchunks && err == 0; i++) {
1891 		blkctr = chunks[fchunks[i]].offset + hib->image_offset;
1892 		processed = 0;
1893 		compressed_size = chunks[fchunks[i]].compressed_size;
1894 
1895 		while (processed < compressed_size && err == 0) {
1896 			if (compressed_size - processed >= MAXPHYS)
1897 				read_size = MAXPHYS;
1898 			else
1899 				read_size = compressed_size - processed;
1900 
1901 			/*
1902 			 * We're reading read_size bytes, offset from the
1903 			 * start of a page by img_cur % PAGE_SIZE, so the
1904 			 * end will be read_size + (img_cur % PAGE_SIZE)
1905 			 * from the start of the first page.  Round that
1906 			 * up to the next page size.
1907 			 */
1908 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1909 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1910 
1911 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1912 
1913 			/* Map pages for this read */
1914 			for (j = 0; j < num_io_pages; j ++)
1915 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1916 				    img_cur + j * PAGE_SIZE,
1917 				    PROT_READ | PROT_WRITE);
1918 
1919 			pmap_update(pmap_kernel());
1920 
1921 			err = hibernate_block_io(hib, blkctr, read_size,
1922 			    tempva + (img_cur & PAGE_MASK), 0);
1923 
1924 			blkctr += btodb(read_size);
1925 
1926 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1927 			pmap_update(pmap_kernel());
1928 
1929 			processed += read_size;
1930 			img_cur += read_size;
1931 		}
1932 	}
1933 
1934 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1935 	pmap_update(pmap_kernel());
1936 
1937 	return (i != nfchunks);
1938 }
1939 
1940 /*
1941  * Hibernating a machine comprises the following operations:
1942  *  1. Calculating this machine's hibernate_info information
1943  *  2. Allocating a piglet and saving the piglet's physaddr
1944  *  3. Calculating the memory chunks
1945  *  4. Writing the compressed chunks to disk
1946  *  5. Writing the chunk table
1947  *  6. Writing the signature block (hibernate_info)
1948  *
1949  * On most architectures, the function calling hibernate_suspend would
1950  * then power off the machine using some MD-specific implementation.
1951  */
1952 int
1953 hibernate_suspend(void)
1954 {
1955 	uint8_t buf[DEV_BSIZE];
1956 	union hibernate_info *hib = (union hibernate_info *)&buf;
1957 	u_long start, end;
1958 
1959 	/*
1960 	 * Calculate memory ranges, swap offsets, etc.
1961 	 * This also allocates a piglet whose physaddr is stored in
1962 	 * hib->piglet_pa and vaddr stored in hib->piglet_va
1963 	 */
1964 	if (get_hibernate_info(hib, 1)) {
1965 		DPRINTF("failed to obtain hibernate info\n");
1966 		return (1);
1967 	}
1968 
1969 	/* Find a page-addressed region in swap [start,end] */
1970 	if (uvm_hibswap(hib->dev, &start, &end)) {
1971 		printf("hibernate: cannot find any swap\n");
1972 		return (1);
1973 	}
1974 
1975 	if (end - start + 1 < 1000) {
1976 		printf("hibernate: insufficient swap (%lu is too small)\n",
1977 			end - start + 1);
1978 		return (1);
1979 	}
1980 
1981 	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_start,
1982 	    &retguard_start_phys);
1983 	pmap_extract(pmap_kernel(), (vaddr_t)&__retguard_end,
1984 	    &retguard_end_phys);
1985 
1986 	/* Calculate block offsets in swap */
1987 	hib->image_offset = ctod(start);
1988 	hib->image_size = ctod(end - start + 1) -
1989 	    btodb(HIBERNATE_CHUNK_TABLE_SIZE);
1990 	hib->chunktable_offset = hib->image_offset + hib->image_size;
1991 
1992 	DPRINTF("hibernate @ block %lld chunks-length %lu blocks, "
1993 	    "chunktable-length %d blocks\n", hib->image_offset, hib->image_size,
1994 	    btodb(HIBERNATE_CHUNK_TABLE_SIZE));
1995 
1996 	pmap_activate(curproc);
1997 	DPRINTF("hibernate: writing chunks\n");
1998 	if (hibernate_write_chunks(hib)) {
1999 		DPRINTF("hibernate_write_chunks failed\n");
2000 		return (1);
2001 	}
2002 
2003 	DPRINTF("hibernate: writing chunktable\n");
2004 	if (hibernate_write_chunktable(hib)) {
2005 		DPRINTF("hibernate_write_chunktable failed\n");
2006 		return (1);
2007 	}
2008 
2009 	DPRINTF("hibernate: writing signature\n");
2010 	if (hibernate_write_signature(hib)) {
2011 		DPRINTF("hibernate_write_signature failed\n");
2012 		return (1);
2013 	}
2014 
2015 	/* Allow the disk to settle */
2016 	delay(500000);
2017 
2018 	/*
2019 	 * Give the device-specific I/O function a notification that we're
2020 	 * done, and that it can clean up or shutdown as needed.
2021 	 */
2022 	if (hib->io_func(hib->dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib->io_page))
2023 		return (1);
2024 	else
2025 		return (0);
2026 }
2027 
2028 int
2029 hibernate_alloc(void)
2030 {
2031 	KASSERT(global_piglet_va == 0);
2032 	KASSERT(hibernate_temp_page == 0);
2033 
2034 	pmap_activate(curproc);
2035 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
2036 	    PROT_READ | PROT_WRITE);
2037 
2038 	/* Allocate a piglet, store its addresses in the supplied globals */
2039 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
2040 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
2041 		goto unmap;
2042 
2043 	/*
2044 	 * Allocate VA for the temp page.
2045 	 *
2046 	 * This will become part of the suspended kernel and will
2047 	 * be freed in hibernate_free, upon resume (or hibernate
2048 	 * failure)
2049 	 */
2050 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
2051 	    &kp_none, &kd_nowait);
2052 	if (!hibernate_temp_page) {
2053 		uvm_pmr_free_piglet(global_piglet_va, 4 * HIBERNATE_CHUNK_SIZE);
2054 		global_piglet_va = 0;
2055 		goto unmap;
2056 	}
2057 	return (0);
2058 unmap:
2059 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
2060 	pmap_update(pmap_kernel());
2061 	return (ENOMEM);
2062 }
2063 
2064 /*
2065  * Free items allocated by hibernate_alloc()
2066  */
2067 void
2068 hibernate_free(void)
2069 {
2070 	pmap_activate(curproc);
2071 
2072 	if (global_piglet_va)
2073 		uvm_pmr_free_piglet(global_piglet_va,
2074 		    4 * HIBERNATE_CHUNK_SIZE);
2075 
2076 	if (hibernate_temp_page) {
2077 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
2078 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
2079 		    &kv_any, &kp_none);
2080 	}
2081 
2082 	global_piglet_va = 0;
2083 	hibernate_temp_page = 0;
2084 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
2085 	pmap_update(pmap_kernel());
2086 }
2087