1 /*	$OpenBSD: subr_hibernate.c,v 1.97 2014/07/16 07:42:51 mlarkin Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <uvm/uvm.h>
32 #include <uvm/uvm_swap.h>
33 #include <machine/hibernate.h>
34 
35 /*
36  * Hibernate piglet layout information
37  *
38  * The piglet is a scratch area of memory allocated by the suspending kernel.
39  * Its phys and virt addrs are recorded in the signature block. The piglet is
40  * used to guarantee an unused area of memory that can be used by the resuming
41  * kernel for various things. The piglet is excluded during unpack operations.
42  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
43  *
44  * Offset from piglet_base	Purpose
45  * ----------------------------------------------------------------------------
46  * 0				I/O page used during resume
47  * 1*PAGE_SIZE		 	I/O page used during hibernate suspend
48  * 2*PAGE_SIZE		 	I/O page used during hibernate suspend
49  * 3*PAGE_SIZE			copy page used during hibernate suspend
50  * 4*PAGE_SIZE			final chunk ordering list (8 pages)
51  * 12*PAGE_SIZE			piglet chunk ordering list (8 pages)
52  * 20*PAGE_SIZE			temp chunk ordering list (8 pages)
53  * 28*PAGE_SIZE			RLE utility page
54  * 29*PAGE_SIZE			start of hiballoc area
55  * 109*PAGE_SIZE		end of hiballoc area (80 pages)
56  * ...				unused
57  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
58  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
59  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
60  */
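
/*
 * Illustrative sketch (not part of the original file): assuming a 4KB
 * PAGE_SIZE and the typical 4MB HIBERNATE_CHUNK_SIZE noted above, the
 * table translates to byte offsets of 16KB for the final chunk ordering
 * list, 116KB for the start of the hiballoc area, 4MB for the chunk
 * table and 8MB for the unpack bounce area. The disabled helper below
 * (a hypothetical name, not used elsewhere) only restates that arithmetic.
 */
#if 0
static vaddr_t
example_piglet_bounce_area(vaddr_t piglet_base)
{
	/* Where a compressed chunk would be bounced before unpacking. */
	return (piglet_base + 2 * HIBERNATE_CHUNK_SIZE);
}
#endif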
61 
62 /* Temporary vaddr ranges used during hibernate */
63 vaddr_t hibernate_temp_page;
64 vaddr_t hibernate_copy_page;
65 vaddr_t hibernate_rle_page;
66 
67 /* Hibernate info as read from disk during resume */
68 union hibernate_info disk_hib;
69 paddr_t global_pig_start;
70 vaddr_t global_piglet_va;
71 
72 /* #define HIB_DEBUG */
73 #ifdef HIB_DEBUG
74 int	hib_debug = 99;
75 #define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
76 #define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
77 #else
78 #define DPRINTF(x...)
79 #define DNPRINTF(n,x...)
80 #endif
81 
82 #ifndef NO_PROPOLICE
83 extern long __guard_local;
84 #endif /* ! NO_PROPOLICE */
85 
86 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
87 int hibernate_calc_rle(paddr_t, paddr_t);
88 int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
89 	size_t *);
90 
91 #define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)
92 
93 /*
94  * Hib alloc enforced alignment.
95  */
96 #define HIB_ALIGN		8 /* bytes alignment */
97 
98 /*
99  * sizeof builtin operation, but with alignment constraint.
100  */
101 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
102 
103 struct hiballoc_entry {
104 	size_t			hibe_use;
105 	size_t			hibe_space;
106 	RB_ENTRY(hiballoc_entry) hibe_entry;
107 };
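
/*
 * Layout note (added for clarity, not part of the original file): each
 * hiballoc_entry lives directly inside the managed region and is
 * immediately followed by the memory it hands out, i.e.
 *
 *   [ hiballoc_entry | hibe_use bytes in use | hibe_space bytes free ]
 *
 * which is why hib_entry_to_addr() and hib_addr_to_entry() below are a
 * fixed HIB_SIZEOF(struct hiballoc_entry) offset in either direction.
 */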
108 
109 /*
110  * Sort hibernate memory ranges by ascending PA
111  */
112 void
113 hibernate_sort_ranges(union hibernate_info *hib_info)
114 {
115 	int i, j;
116 	struct hibernate_memory_range *ranges;
117 	paddr_t base, end;
118 
119 	ranges = hib_info->ranges;
120 
121 	for (i = 1; i < hib_info->nranges; i++) {
122 		j = i;
123 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
124 			base = ranges[j].base;
125 			end = ranges[j].end;
126 			ranges[j].base = ranges[j - 1].base;
127 			ranges[j].end = ranges[j - 1].end;
128 			ranges[j - 1].base = base;
129 			ranges[j - 1].end = end;
130 			j--;
131 		}
132 	}
133 }
134 
135 /*
136  * Compare hiballoc entries based on the address they manage.
137  *
138  * Since the managed address sits at a fixed offset from its struct
139  * hiballoc_entry, we just compare the hiballoc_entry pointers.
140  */
141 static __inline int
142 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
143 {
144 	return l < r ? -1 : (l > r);
145 }
146 
147 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
148 
149 /*
150  * Given a hiballoc entry, return the address it manages.
151  */
152 static __inline void *
153 hib_entry_to_addr(struct hiballoc_entry *entry)
154 {
155 	caddr_t addr;
156 
157 	addr = (caddr_t)entry;
158 	addr += HIB_SIZEOF(struct hiballoc_entry);
159 	return addr;
160 }
161 
162 /*
163  * Given an address, find the hiballoc_entry that manages it.
164  */
165 static __inline struct hiballoc_entry*
166 hib_addr_to_entry(void *addr_param)
167 {
168 	caddr_t addr;
169 
170 	addr = (caddr_t)addr_param;
171 	addr -= HIB_SIZEOF(struct hiballoc_entry);
172 	return (struct hiballoc_entry*)addr;
173 }
174 
175 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
176 
177 /*
178  * Allocate memory from the arena.
179  *
180  * Returns NULL if no memory is available.
181  */
182 void *
183 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
184 {
185 	struct hiballoc_entry *entry, *new_entry;
186 	size_t find_sz;
187 
188 	/*
189 	 * Enforce alignment of HIB_ALIGN bytes.
190 	 *
191 	 * Note that, because the entry is put in front of the allocation,
192 	 * 0-byte allocations are guaranteed a unique address.
193 	 */
194 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
195 
196 	/*
197 	 * Find an entry with hibe_space >= find_sz.
198 	 *
199 	 * If the root node is not large enough, we switch to tree traversal.
200 	 * Because all entries are made at the bottom of the free space,
201 	 * traversal from the end has a slightly better chance of yielding
202 	 * a sufficiently large space.
203 	 */
204 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
205 	entry = RB_ROOT(&arena->hib_addrs);
206 	if (entry != NULL && entry->hibe_space < find_sz) {
207 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
208 			if (entry->hibe_space >= find_sz)
209 				break;
210 		}
211 	}
212 
213 	/*
214 	 * Insufficient or too fragmented memory.
215 	 */
216 	if (entry == NULL)
217 		return NULL;
218 
219 	/*
220 	 * Create new entry in allocated space.
221 	 */
222 	new_entry = (struct hiballoc_entry*)(
223 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
224 	new_entry->hibe_space = entry->hibe_space - find_sz;
225 	new_entry->hibe_use = alloc_sz;
226 
227 	/*
228 	 * Insert entry.
229 	 */
230 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
231 		panic("hib_alloc: insert failure");
232 	entry->hibe_space = 0;
233 
234 	/* Return address managed by entry. */
235 	return hib_entry_to_addr(new_entry);
236 }
237 
238 /*
239  * Free a pointer previously allocated from this arena.
240  *
241  * If addr is NULL, this will be silently accepted.
242  */
243 void
244 hib_free(struct hiballoc_arena *arena, void *addr)
245 {
246 	struct hiballoc_entry *entry, *prev;
247 
248 	if (addr == NULL)
249 		return;
250 
251 	/*
252 	 * Derive entry from addr and check it is really in this arena.
253 	 */
254 	entry = hib_addr_to_entry(addr);
255 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
256 		panic("hib_free: freed item %p not in hib arena", addr);
257 
258 	/*
259 	 * Give the space in entry to its predecessor.
260 	 *
261 	 * If entry has no predecessor, change its used space into free space
262 	 * instead.
263 	 */
264 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
265 	if (prev != NULL &&
266 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
267 	    prev->hibe_use + prev->hibe_space) == entry) {
268 		/* Merge entry. */
269 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
270 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
271 		    entry->hibe_use + entry->hibe_space;
272 	} else {
273 		/* Flip used memory to free space. */
274 		entry->hibe_space += entry->hibe_use;
275 		entry->hibe_use = 0;
276 	}
277 }
278 
279 /*
280  * Initialize hiballoc.
281  *
282  * The allocator will manage the memory at ptr, which is len bytes in size.
283  */
284 int
285 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
286 {
287 	struct hiballoc_entry *entry;
288 	caddr_t ptr;
289 	size_t len;
290 
291 	RB_INIT(&arena->hib_addrs);
292 
293 	/*
294 	 * Hib allocator enforces HIB_ALIGN alignment.
295 	 * Fixup ptr and len.
296 	 */
297 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
298 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
299 	len &= ~((size_t)HIB_ALIGN - 1);
300 
301 	/*
302 	 * Insufficient memory to be able to allocate and also do bookkeeping.
303 	 */
304 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
305 		return ENOMEM;
306 
307 	/*
308 	 * Create entry describing space.
309 	 */
310 	entry = (struct hiballoc_entry*)ptr;
311 	entry->hibe_use = 0;
312 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
313 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
314 
315 	return 0;
316 }
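
/*
 * Illustrative usage sketch (not part of the original file): the arena is
 * handed a raw, unmanaged region and then serves hib_alloc()/hib_free()
 * requests from it, much as hibernate_zlib_reset() later does with the
 * piglet's zlib pages. The buffer and its size here are hypothetical.
 */
#if 0
static int
example_hiballoc_use(void)
{
	static char scratch[16 * 1024];		/* hypothetical region */
	struct hiballoc_arena arena;
	void *p;

	if (hiballoc_init(&arena, scratch, sizeof(scratch)) != 0)
		return (ENOMEM);

	p = hib_alloc(&arena, 128);		/* rounded up to HIB_ALIGN */
	if (p == NULL)
		return (ENOMEM);

	hib_free(&arena, p);
	return (0);
}
#endif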
317 
318 /*
319  * Zero all free memory.
320  */
321 void
322 uvm_pmr_zero_everything(void)
323 {
324 	struct uvm_pmemrange	*pmr;
325 	struct vm_page		*pg;
326 	int			 i;
327 
328 	uvm_lock_fpageq();
329 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
330 		/* Zero single pages. */
331 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
332 		    != NULL) {
333 			uvm_pmr_remove(pmr, pg);
334 			uvm_pagezero(pg);
335 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
336 			uvmexp.zeropages++;
337 			uvm_pmr_insert(pmr, pg, 0);
338 		}
339 
340 		/* Zero multi page ranges. */
341 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
342 		    != NULL) {
343 			pg--; /* Size tree always has second page. */
344 			uvm_pmr_remove(pmr, pg);
345 			for (i = 0; i < pg->fpgsz; i++) {
346 				uvm_pagezero(&pg[i]);
347 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
348 				uvmexp.zeropages++;
349 			}
350 			uvm_pmr_insert(pmr, pg, 0);
351 		}
352 	}
353 	uvm_unlock_fpageq();
354 }
355 
356 /*
357  * Mark all memory as dirty.
358  *
359  * Used to inform the system that zeroed pages can no longer be trusted to
360  * be zeroed, for example because we just came back from hibernate.
361  */
362 void
363 uvm_pmr_dirty_everything(void)
364 {
365 	struct uvm_pmemrange	*pmr;
366 	struct vm_page		*pg;
367 	int			 i;
368 
369 	uvm_lock_fpageq();
370 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
371 		/* Dirty single pages. */
372 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
373 		    != NULL) {
374 			uvm_pmr_remove(pmr, pg);
375 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
376 			uvm_pmr_insert(pmr, pg, 0);
377 		}
378 
379 		/* Dirty multi page ranges. */
380 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
381 		    != NULL) {
382 			pg--; /* Size tree always has second page. */
383 			uvm_pmr_remove(pmr, pg);
384 			for (i = 0; i < pg->fpgsz; i++)
385 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
386 			uvm_pmr_insert(pmr, pg, 0);
387 		}
388 	}
389 
390 	uvmexp.zeropages = 0;
391 	uvm_unlock_fpageq();
392 }
393 
394 /*
395  * Allocate the highest address that can hold sz.
396  *
397  * sz in bytes.
398  */
399 int
400 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
401 {
402 	struct uvm_pmemrange	*pmr;
403 	struct vm_page		*pig_pg, *pg;
404 
405 	/*
406 	 * Convert sz to pages, since that is what pmemrange uses internally.
407 	 */
408 	sz = atop(round_page(sz));
409 
410 	uvm_lock_fpageq();
411 
412 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
413 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
414 			if (pig_pg->fpgsz >= sz) {
415 				goto found;
416 			}
417 		}
418 	}
419 
420 	/*
421 	 * Allocation failure.
422 	 */
423 	uvm_unlock_fpageq();
424 	return ENOMEM;
425 
426 found:
427 	/* Remove page from freelist. */
428 	uvm_pmr_remove_size(pmr, pig_pg);
429 	pig_pg->fpgsz -= sz;
430 	pg = pig_pg + pig_pg->fpgsz;
431 	if (pig_pg->fpgsz == 0)
432 		uvm_pmr_remove_addr(pmr, pig_pg);
433 	else
434 		uvm_pmr_insert_size(pmr, pig_pg);
435 
436 	uvmexp.free -= sz;
437 	*addr = VM_PAGE_TO_PHYS(pg);
438 
439 	/*
440 	 * Update pg flags.
441 	 *
442 	 * Note that we trash the sz argument now.
443 	 */
444 	while (sz > 0) {
445 		KASSERT(pg->pg_flags & PQ_FREE);
446 
447 		atomic_clearbits_int(&pg->pg_flags, PG_PMAPMASK);
448 
449 		if (pg->pg_flags & PG_ZERO)
450 			uvmexp.zeropages -= sz;
451 		atomic_clearbits_int(&pg->pg_flags,
452 		    PG_ZERO|PQ_FREE);
453 
454 		pg->uobject = NULL;
455 		pg->uanon = NULL;
456 		pg->pg_version++;
457 
458 		/*
459 		 * Next.
460 		 */
461 		pg++;
462 		sz--;
463 	}
464 
465 	/* Return. */
466 	uvm_unlock_fpageq();
467 	return 0;
468 }
469 
470 /*
471  * Allocate a piglet area.
472  *
473  * The piglet is allocated as low in physical memory as possible.
474  * Piglets are aligned to the requested 'align' boundary.
475  *
476  * sz and align in bytes.
477  *
478  * The call will sleep for the pagedaemon to attempt to free memory.
479  * The pagedaemon may decide it's not possible to free enough memory, causing
480  * the allocation to fail.
481  */
482 int
483 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
484 {
485 	paddr_t			 pg_addr, piglet_addr;
486 	struct uvm_pmemrange	*pmr;
487 	struct vm_page		*pig_pg, *pg;
488 	struct pglist		 pageq;
489 	int			 pdaemon_woken;
490 	vaddr_t			 piglet_va;
491 
492 	/* Ensure align is a power of 2 */
493 	KASSERT((align & (align - 1)) == 0);
494 
495 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
496 
497 	/*
498 	 * Fixup arguments: align must be at least PAGE_SIZE,
499 	 * sz will be converted to pagecount, since that is what
500 	 * pmemrange uses internally.
501 	 */
502 	if (align < PAGE_SIZE)
503 		align = PAGE_SIZE;
504 	sz = round_page(sz);
505 
506 	uvm_lock_fpageq();
507 
508 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
509 	    pmr_use) {
510 retry:
511 		/*
512 		 * Search for a range with enough space.
513 		 * Use the address tree, to ensure the range is as low as
514 		 * possible.
515 		 */
516 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
517 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
518 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
519 
520 			if (atop(pg_addr) + pig_pg->fpgsz >=
521 			    atop(piglet_addr) + atop(sz))
522 				goto found;
523 		}
524 	}
525 
526 	/*
527 	 * Try to coerce the pagedaemon into freeing memory
528 	 * for the piglet.
529 	 *
530 	 * pdaemon_woken is set to prevent the code from
531 	 * falling into an endless loop.
532 	 */
533 	if (!pdaemon_woken) {
534 		pdaemon_woken = 1;
535 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
536 		    sz, UVM_PLA_FAILOK) == 0)
537 			goto retry;
538 	}
539 
540 	/* Return failure. */
541 	uvm_unlock_fpageq();
542 	return ENOMEM;
543 
544 found:
545 	/*
546 	 * Extract piglet from pigpen.
547 	 */
548 	TAILQ_INIT(&pageq);
549 	uvm_pmr_extract_range(pmr, pig_pg,
550 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
551 
552 	*pa = piglet_addr;
553 	uvmexp.free -= atop(sz);
554 
555 	/*
556 	 * Update pg flags.
557 	 *
558 	 * Note that we trash the sz argument now.
559 	 */
560 	TAILQ_FOREACH(pg, &pageq, pageq) {
561 		KASSERT(pg->pg_flags & PQ_FREE);
562 
563 		atomic_clearbits_int(&pg->pg_flags, PG_PMAPMASK);
564 
565 		if (pg->pg_flags & PG_ZERO)
566 			uvmexp.zeropages--;
567 		atomic_clearbits_int(&pg->pg_flags,
568 		    PG_ZERO|PQ_FREE);
569 
570 		pg->uobject = NULL;
571 		pg->uanon = NULL;
572 		pg->pg_version++;
573 	}
574 
575 	uvm_unlock_fpageq();
576 
577 	/*
578 	 * Now allocate a va.
579 	 * Use direct mappings for the pages.
580 	 */
581 
582 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
583 	if (!piglet_va) {
584 		uvm_pglistfree(&pageq);
585 		return ENOMEM;
586 	}
587 
588 	/*
589 	 * Map piglet to va.
590 	 */
591 	TAILQ_FOREACH(pg, &pageq, pageq) {
592 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
593 		piglet_va += PAGE_SIZE;
594 	}
595 	pmap_update(pmap_kernel());
596 
597 	return 0;
598 }
599 
600 /*
601  * Free a piglet area.
602  */
603 void
604 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
605 {
606 	paddr_t			 pa;
607 	struct vm_page		*pg;
608 
609 	/*
610 	 * Fix parameters.
611 	 */
612 	sz = round_page(sz);
613 
614 	/*
615 	 * Find the first page in piglet.
616 	 * Since piglets are contiguous, the first pg is all we need.
617 	 */
618 	if (!pmap_extract(pmap_kernel(), va, &pa))
619 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
620 	pg = PHYS_TO_VM_PAGE(pa);
621 	if (pg == NULL)
622 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
623 
624 	/*
625 	 * Unmap.
626 	 */
627 	pmap_kremove(va, sz);
628 	pmap_update(pmap_kernel());
629 
630 	/*
631 	 * Free the physical and virtual memory.
632 	 */
633 	uvm_pmr_freepages(pg, atop(sz));
634 	km_free((void *)va, sz, &kv_any, &kp_none);
635 }
636 
637 /*
638  * Physmem RLE compression support.
639  *
640  * Given a physical page address, return the number of pages starting at the
641  * address that are free.  Clamps to the number of pages in
642  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
643  */
644 int
645 uvm_page_rle(paddr_t addr)
646 {
647 	struct vm_page		*pg, *pg_end;
648 	struct vm_physseg	*vmp;
649 	int			 pseg_idx, off_idx;
650 
651 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
652 	if (pseg_idx == -1)
653 		return 0;
654 
655 	vmp = &vm_physmem[pseg_idx];
656 	pg = &vmp->pgs[off_idx];
657 	if (!(pg->pg_flags & PQ_FREE))
658 		return 0;
659 
660 	/*
661 	 * Search for the first non-free page after pg.
662 	 * Note that the page may not be the first page in a free pmemrange,
663 	 * therefore pg->fpgsz cannot be used.
664 	 */
665 	for (pg_end = pg; pg_end <= vmp->lastpg &&
666 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
667 		;
668 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
669 }
670 
671 /*
672  * Fills out the hibernate_info union pointed to by hib
673  * with information about this machine (swap signature block
674  * offsets, number of memory ranges, kernel in use, etc)
675  */
676 int
677 get_hibernate_info(union hibernate_info *hib, int suspend)
678 {
679 	int chunktable_size;
680 	struct disklabel dl;
681 	char err_string[128], *dl_ret;
682 
683 #ifndef NO_PROPOLICE
684 	/* Save propolice guard */
685 	hib->guard = __guard_local;
686 #endif /* ! NO_PROPOLICE */
687 
688 	/* Determine I/O function to use */
689 	hib->io_func = get_hibernate_io_function();
690 	if (hib->io_func == NULL)
691 		return (1);
692 
693 	/* Calculate hibernate device */
694 	hib->dev = swdevt[0].sw_dev;
695 
696 	/* Read disklabel (used to calculate signature and image offsets) */
697 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, 128);
698 
699 	if (dl_ret) {
700 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
701 		return (1);
702 	}
703 
704 	/* Make sure we have a swap partition. */
705 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
706 	    DL_GETPSIZE(&dl.d_partitions[1]) == 0)
707 		return (1);
708 
709 	/* Make sure the signature can fit in one block */
710 	if (sizeof(union hibernate_info) > DEV_BSIZE)
711 		return (1);
712 
713 	/* Magic number */
714 	hib->magic = HIBERNATE_MAGIC;
715 
716 	/* Calculate signature block location */
717 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[1]) -
718 	    sizeof(union hibernate_info)/DEV_BSIZE;
719 
720 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
721 
722 	/* Stash kernel version information */
723 	memset(&hib->kernel_version, 0, 128);
724 	bcopy(version, &hib->kernel_version,
725 	    min(strlen(version), sizeof(hib->kernel_version)-1));
726 
727 	if (suspend) {
728 		/* Allocate piglet region */
729 		if (uvm_pmr_alloc_piglet(&hib->piglet_va,
730 		    &hib->piglet_pa, HIBERNATE_CHUNK_SIZE * 4,
731 		    HIBERNATE_CHUNK_SIZE)) {
732 			printf("Hibernate failed to allocate the piglet\n");
733 			return (1);
734 		}
735 		hib->io_page = (void *)hib->piglet_va;
736 
737 		/*
738 		 * Initialization of the hibernate IO function for drivers
739 		 * that need to do prep work (such as allocating memory or
740 		 * setting up data structures that cannot safely be done
741 		 * during suspend without causing side effects). There is
742 		 * a matching HIB_DONE call performed after the write is
743 		 * completed.
744 		 */
745 		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[1]),
746 		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[1]),
747 		    HIB_INIT, hib->io_page))
748 			goto fail;
749 
750 	} else {
751 		/*
752 		 * Resuming kernels use a regular I/O page since we won't
753 		 * have access to the suspended kernel's piglet VA at this
754 		 * point. No need to free this I/O page as it will vanish
755 		 * as part of the resume.
756 		 */
757 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
758 		if (!hib->io_page)
759 			return (1);
760 	}
761 
762 
763 	if (get_hibernate_info_md(hib))
764 		goto fail;
765 
766 
767 	return (0);
768 fail:
769 	if (suspend)
770 		uvm_pmr_free_piglet(hib->piglet_va,
771 		    HIBERNATE_CHUNK_SIZE * 4);
772 
773 	return (1);
774 }
775 
776 /*
777  * Allocate nitems*size bytes from the hiballoc area presently in use
778  */
779 void *
780 hibernate_zlib_alloc(void *unused, int nitems, int size)
781 {
782 	struct hibernate_zlib_state *hibernate_state;
783 
784 	hibernate_state =
785 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
786 
787 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
788 }
789 
790 /*
791  * Free the memory pointed to by addr in the hiballoc area presently in
792  * use
793  */
794 void
795 hibernate_zlib_free(void *unused, void *addr)
796 {
797 	struct hibernate_zlib_state *hibernate_state;
798 
799 	hibernate_state =
800 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
801 
802 	hib_free(&hibernate_state->hiballoc_arena, addr);
803 }
804 
805 /*
806  * Inflate next page of data from the image stream.
807  * The rle parameter is modified on exit to contain the number of pages to
808  * skip in the output stream (or 0 if a data page was inflated instead).
809  *
810  * Returns 0 if the stream contains additional data, or 1 if the stream is
811  * finished.
812  */
813 int
814 hibernate_inflate_page(int *rle)
815 {
816 	struct hibernate_zlib_state *hibernate_state;
817 	int i;
818 
819 	hibernate_state =
820 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
821 
822 	/* Set up the stream for RLE code inflate */
823 	hibernate_state->hib_stream.next_out = (char *)rle;
824 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
825 
826 	/* Inflate RLE code */
827 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
828 	if (i != Z_OK && i != Z_STREAM_END) {
829 		/*
830 		 * XXX - this will likely reboot/hang most machines
831 		 *       since the console output buffer will be unmapped,
832 		 *       but there's not much else we can do here.
833 		 */
834 		panic("rle inflate stream error");
835 	}
836 
837 	if (hibernate_state->hib_stream.avail_out != 0) {
838 		/*
839 		 * XXX - this will likely reboot/hang most machines
840 		 *       since the console output buffer will be unmapped,
841 		 *       but there's not much else we can do here.
842 		 */
843 		panic("rle short inflate error");
844 	}
845 
846 	if (*rle < 0 || *rle > 1024) {
847 		/*
848 		 * XXX - this will likely reboot/hang most machines
849 		 *       since the console output buffer will be unmapped,
850 		 *       but there's not much else we can do here.
851 		 */
852 		panic("invalid rle count");
853 	}
854 
855 	if (i == Z_STREAM_END)
856 		return (1);
857 
858 	if (*rle != 0)
859 		return (0);
860 
861 	/* Set up the stream for page inflate */
862 	hibernate_state->hib_stream.next_out = (char *)HIBERNATE_INFLATE_PAGE;
863 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
864 
865 	/* Process next block of data */
866 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
867 	if (i != Z_OK && i != Z_STREAM_END) {
868 		/*
869 		 * XXX - this will likely reboot/hang most machines
870 		 *       since the console output buffer will be unmapped,
871 		 *       but there's not much else we can do here.
872 		 */
873 		panic("inflate error");
874 	}
875 
876 	/* We should always have extracted a full page ... */
877 	if (hibernate_state->hib_stream.avail_out != 0) {
878 		/*
879 		 * XXX - this will likely reboot/hang most machines
880 		 *       since the console output buffer will be unmapped,
881 		 *       but there's not much else we can do here.
882 		 */
883 		panic("incomplete page");
884 	}
885 
886 	return (i == Z_STREAM_END);
887 }
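
/*
 * Stream layout note (added for clarity, not part of the original file):
 * within one compressed chunk, the image alternates RLE records and page
 * data. Each record is a single int; a value > 0 means that many pages
 * were free at suspend time and carry no data, while a value of 0 means
 * one compressed PAGE_SIZE page follows. hibernate_inflate_region()
 * below simply consumes these records until Z_STREAM_END.
 */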
888 
889 /*
890  * Inflate size bytes from src into dest, skipping any pages in
891  * [src..dest] that are special (see hibernate_inflate_skip)
892  *
893  * This function executes while using the resume-time stack
894  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
895  * will likely hang or reset the machine since the console output buffer
896  * will be unmapped.
897  */
898 void
899 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
900     paddr_t src, size_t size)
901 {
902 	int end_stream = 0, rle;
903 	struct hibernate_zlib_state *hibernate_state;
904 
905 	hibernate_state =
906 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
907 
908 	hibernate_state->hib_stream.next_in = (char *)src;
909 	hibernate_state->hib_stream.avail_in = size;
910 
911 	do {
912 		/*
913 		 * Is this a special page? If yes, redirect the
914 		 * inflate output to a scratch page (eg, discard it)
915 		 */
916 		if (hibernate_inflate_skip(hib, dest)) {
917 			hibernate_enter_resume_mapping(
918 			    HIBERNATE_INFLATE_PAGE,
919 			    HIBERNATE_INFLATE_PAGE, 0);
920 		} else {
921 			hibernate_enter_resume_mapping(
922 			    HIBERNATE_INFLATE_PAGE, dest, 0);
923 		}
924 
925 		hibernate_flush();
926 		end_stream = hibernate_inflate_page(&rle);
927 
928 		if (rle == 0)
929 			dest += PAGE_SIZE;
930 		else
931 			dest += (rle * PAGE_SIZE);
932 	} while (!end_stream);
933 }
934 
935 /*
936  * deflate from src into the I/O page, up to 'remaining' bytes
937  *
938  * Returns the number of input bytes consumed, and may reset
939  * the 'remaining' parameter if not all the output space was consumed
940  * (this information is needed to know how much to write to disk).
941  */
942 size_t
943 hibernate_deflate(union hibernate_info *hib, paddr_t src,
944     size_t *remaining)
945 {
946 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
947 	struct hibernate_zlib_state *hibernate_state;
948 
949 	hibernate_state =
950 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
951 
952 	/* Set up the stream for deflate */
953 	hibernate_state->hib_stream.next_in = (caddr_t)src;
954 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
955 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
956 	    (PAGE_SIZE - *remaining);
957 	hibernate_state->hib_stream.avail_out = *remaining;
958 
959 	/* Process next block of data */
960 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
961 		panic("hibernate zlib deflate error");
962 
963 	/* Update pointers and return number of bytes consumed */
964 	*remaining = hibernate_state->hib_stream.avail_out;
965 	return (PAGE_SIZE - (src & PAGE_MASK)) -
966 	    hibernate_state->hib_stream.avail_in;
967 }
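
/*
 * Worked example (added for clarity, not part of the original file):
 * with a page-aligned src and *remaining == PAGE_SIZE, deflate is offered
 * one full input page and one full output page. If it consumes all of
 * the input while emitting, say, 1000 bytes of output, the function
 * returns PAGE_SIZE (input consumed) and *remaining becomes
 * PAGE_SIZE - 1000 (output space still free), which the callers use to
 * decide when the I/O page is full and must be flushed to disk.
 */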
968 
969 /*
970  * Write the hibernation information specified in hib
971  * to the location in swap previously calculated (last block of
972  * swap), called the "signature block".
973  */
974 int
975 hibernate_write_signature(union hibernate_info *hib)
976 {
977 	/* Write hibernate info to disk */
978 	return (hib->io_func(hib->dev, hib->sig_offset,
979 	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
980 	    hib->io_page));
981 }
982 
983 /*
984  * Write the memory chunk table to the area in swap immediately
985  * preceding the signature block. The chunk table is stored
986  * in the piglet when this function is called.  Returns errno.
987  */
988 int
989 hibernate_write_chunktable(union hibernate_info *hib)
990 {
991 	struct hibernate_disk_chunk *chunks;
992 	vaddr_t hibernate_chunk_table_start;
993 	size_t hibernate_chunk_table_size;
994 	int i, err;
995 
996 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
997 
998 	hibernate_chunk_table_start = hib->piglet_va +
999 	    HIBERNATE_CHUNK_SIZE;
1000 
1001 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1002 	    HIBERNATE_CHUNK_SIZE);
1003 
1004 	/* Write chunk table */
1005 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
1006 		if ((err = hib->io_func(hib->dev,
1007 		    hib->chunktable_offset + (i/DEV_BSIZE),
1008 		    (vaddr_t)(hibernate_chunk_table_start + i),
1009 		    MAXPHYS, HIB_W, hib->io_page))) {
1010 			DPRINTF("chunktable write error: %d\n", err);
1011 			return (err);
1012 		}
1013 	}
1014 
1015 	return (0);
1016 }
1017 
1018 /*
1019  * Write an empty hiber_info to the swap signature block, which is
1020  * guaranteed to not match any valid hib.
1021  */
1022 int
1023 hibernate_clear_signature(void)
1024 {
1025 	union hibernate_info blank_hiber_info;
1026 	union hibernate_info hib;
1027 
1028 	/* Zero out a blank hiber_info */
1029 	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));
1030 
1031 	/* Get the signature block location */
1032 	if (get_hibernate_info(&hib, 0))
1033 		return (1);
1034 
1035 	/* Write (zeroed) hibernate info to disk */
1036 	DPRINTF("clearing hibernate signature block location: %lld\n",
1037 		hib.sig_offset);
1038 	if (hibernate_block_io(&hib,
1039 	    hib.sig_offset,
1040 	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
1041 		printf("Warning: could not clear hibernate signature\n");
1042 
1043 	return (0);
1044 }
1045 
1046 /*
1047  * Check chunk range overlap when calculating whether or not to copy a
1048  * compressed chunk to the piglet area before decompressing.
1049  *
1050  * returns zero if the ranges do not overlap, non-zero otherwise.
1051  */
1052 int
1053 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
1054 {
1055 	/* case A : end of r1 overlaps start of r2 */
1056 	if (r1s < r2s && r1e > r2s)
1057 		return (1);
1058 
1059 	/* case B : r1 entirely inside r2 */
1060 	if (r1s >= r2s && r1e <= r2e)
1061 		return (1);
1062 
1063 	/* case C : r2 entirely inside r1 */
1064 	if (r2s >= r1s && r2e <= r1e)
1065 		return (1);
1066 
1067 	/* case D : end of r2 overlaps start of r1 */
1068 	if (r2s < r1s && r2e > r1s)
1069 		return (1);
1070 
1071 	return (0);
1072 }
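
/*
 * Examples (added for clarity, not part of the original file), using
 * hypothetical addresses:
 *
 *   hibernate_check_overlap(0x1000, 0x3000, 0x2000, 0x4000) == 1  (case A)
 *   hibernate_check_overlap(0x2000, 0x3000, 0x1000, 0x4000) == 1  (case B)
 *   hibernate_check_overlap(0x1000, 0x2000, 0x3000, 0x4000) == 0  (disjoint)
 *
 * The comparisons are exclusive at the boundaries, so two ranges that
 * merely touch (r1e == r2s) are not treated as overlapping.
 */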
1073 
1074 /*
1075  * Compare two hibernate_infos to determine if they are the same (eg,
1076  * we should be performing a hibernate resume on this machine).
1077  * Not all fields are checked - just enough to verify that the machine
1078  * has the same memory configuration and kernel as the one that
1079  * wrote the signature previously.
1080  */
1081 int
1082 hibernate_compare_signature(union hibernate_info *mine,
1083     union hibernate_info *disk)
1084 {
1085 	u_int i;
1086 
1087 	if (mine->nranges != disk->nranges) {
1088 		DPRINTF("hibernate memory range count mismatch\n");
1089 		return (1);
1090 	}
1091 
1092 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0) {
1093 		DPRINTF("hibernate kernel version mismatch\n");
1094 		return (1);
1095 	}
1096 
1097 	for (i = 0; i < mine->nranges; i++) {
1098 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
1099 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
1100 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
1101 				i,
1102 				(void *)mine->ranges[i].base,
1103 				(void *)mine->ranges[i].end,
1104 				(void *)disk->ranges[i].base,
1105 				(void *)disk->ranges[i].end);
1106 			return (1);
1107 		}
1108 	}
1109 
1110 	return (0);
1111 }
1112 
1113 /*
1114  * Transfers xfer_size bytes between the hibernate device specified in
1115  * hib_info at offset blkctr and the vaddr specified at dest.
1116  *
1117  * Separate offsets and pages are used to handle misaligned reads (reads
1118  * that span a page boundary).
1119  *
1120  * blkctr specifies a relative offset (relative to the start of swap),
1121  * not an absolute disk offset
1122  *
1123  */
1124 int
1125 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
1126     size_t xfer_size, vaddr_t dest, int iswrite)
1127 {
1128 	struct buf *bp;
1129 	struct bdevsw *bdsw;
1130 	int error;
1131 
1132 	bp = geteblk(xfer_size);
1133 	bdsw = &bdevsw[major(hib->dev)];
1134 
1135 	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
1136 	if (error) {
1137 		printf("hibernate_block_io open failed\n");
1138 		return (1);
1139 	}
1140 
1141 	if (iswrite)
1142 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
1143 
1144 	bp->b_bcount = xfer_size;
1145 	bp->b_blkno = blkctr;
1146 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1147 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
1148 	bp->b_dev = hib->dev;
1149 	(*bdsw->d_strategy)(bp);
1150 
1151 	error = biowait(bp);
1152 	if (error) {
1153 		printf("hib block_io biowait error %d blk %lld size %zu\n",
1154 			error, (long long)blkctr, xfer_size);
1155 		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
1156 		    curproc);
1157 		if (error)
1158 			printf("hibernate_block_io error close failed\n");
1159 		return (1);
1160 	}
1161 
1162 	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
1163 	if (error) {
1164 		printf("hibernate_block_io close failed\n");
1165 		return (1);
1166 	}
1167 
1168 	if (!iswrite)
1169 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
1170 
1171 	bp->b_flags |= B_INVAL;
1172 	brelse(bp);
1173 
1174 	return (0);
1175 }
1176 
1177 /*
1178  * Reads the signature block from swap, checks against the current machine's
1179  * information. If the information matches, perform a resume by reading the
1180  * saved image into the pig area, and unpacking.
1181  */
1182 void
1183 hibernate_resume(void)
1184 {
1185 	union hibernate_info hib;
1186 	int s;
1187 
1188 	/* Get current running machine's hibernate info */
1189 	memset(&hib, 0, sizeof(hib));
1190 	if (get_hibernate_info(&hib, 0)) {
1191 		DPRINTF("couldn't retrieve machine's hibernate info\n");
1192 		return;
1193 	}
1194 
1195 	/* Read hibernate info from disk */
1196 	s = splbio();
1197 
1198 	DPRINTF("reading hibernate signature block location: %lld\n",
1199 		hib.sig_offset);
1200 
1201 	if (hibernate_block_io(&hib,
1202 	    hib.sig_offset,
1203 	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
1204 		DPRINTF("error in hibernate read\n");
1205 		splx(s);
1206 		return;
1207 	}
1208 
1209 	/* Check magic number */
1210 	if (disk_hib.magic != HIBERNATE_MAGIC) {
1211 		DPRINTF("wrong magic number in hibernate signature: %x\n",
1212 			disk_hib.magic);
1213 		splx(s);
1214 		return;
1215 	}
1216 
1217 	/*
1218 	 * We (possibly) found a hibernate signature. Clear signature first,
1219 	 * to prevent accidental resume or endless resume cycles later.
1220 	 */
1221 	if (hibernate_clear_signature()) {
1222 		DPRINTF("error clearing hibernate signature block\n");
1223 		splx(s);
1224 		return;
1225 	}
1226 
1227 	/*
1228 	 * If on-disk and in-memory hibernate signatures match,
1229 	 * this means we should do a resume from hibernate.
1230 	 */
1231 	if (hibernate_compare_signature(&hib, &disk_hib)) {
1232 		DPRINTF("mismatched hibernate signature block\n");
1233 		splx(s);
1234 		return;
1235 	}
1236 
1237 #ifdef MULTIPROCESSOR
1238 	/* XXX - if we fail later, we may need to rehatch APs on some archs */
1239 	DPRINTF("hibernate: quiescing APs\n");
1240 	hibernate_quiesce_cpus();
1241 #endif /* MULTIPROCESSOR */
1242 
1243 	/* Read the image from disk into the image (pig) area */
1244 	if (hibernate_read_image(&disk_hib))
1245 		goto fail;
1246 
1247 	DPRINTF("hibernate: quiescing devices\n");
1248 	if (config_suspend(device_mainbus(), DVACT_QUIESCE) != 0)
1249 		goto fail;
1250 
1251 	(void) splhigh();
1252 	hibernate_disable_intr_machdep();
1253 	cold = 1;
1254 
1255 	DPRINTF("hibernate: suspending devices\n");
1256 	if (config_suspend(device_mainbus(), DVACT_SUSPEND) != 0) {
1257 		cold = 0;
1258 		hibernate_enable_intr_machdep();
1259 		goto fail;
1260 	}
1261 
1262 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1263 	    VM_PROT_ALL);
1264 	pmap_activate(curproc);
1265 
1266 	printf("Unpacking image...\n");
1267 
1268 	/* Switch stacks */
1269 	DPRINTF("hibernate: switching stacks\n");
1270 	hibernate_switch_stack_machdep();
1271 
1272 #ifndef NO_PROPOLICE
1273 	/* Start using suspended kernel's propolice guard */
1274 	__guard_local = disk_hib.guard;
1275 #endif /* ! NO_PROPOLICE */
1276 
1277 	/* Unpack and resume */
1278 	hibernate_unpack_image(&disk_hib);
1279 
1280 fail:
1281 	splx(s);
1282 	printf("\nUnable to resume hibernated image\n");
1283 }
1284 
1285 /*
1286  * Unpack image from pig area to original location by looping through the
1287  * list of output chunks in the order they should be restored (fchunks).
1288  *
1289  * Note that due to the stack smash protector and the fact that we have
1290  * switched stacks, it is not permitted to return from this function.
1291  */
1292 void
1293 hibernate_unpack_image(union hibernate_info *hib)
1294 {
1295 	struct hibernate_disk_chunk *chunks;
1296 	union hibernate_info local_hib;
1297 	paddr_t image_cur = global_pig_start;
1298 	short i, *fchunks;
1299 	char *pva;
1300 	struct hibernate_zlib_state *hibernate_state;
1301 
1302 	hibernate_state =
1303 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1304 
1305 	/* Piglet will be identity mapped (VA == PA) */
1306 	pva = (char *)hib->piglet_pa;
1307 
1308 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1309 
1310 	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);
1311 
1312 	/* Can't use the hib that was passed in after this point */
1313 	bcopy(hib, &local_hib, sizeof(union hibernate_info));
1314 
1315 	/* VA == PA */
1316 	local_hib.piglet_va = local_hib.piglet_pa;
1317 
1318 	/*
1319 	 * Point of no return. Once we pass this point, only kernel code can
1320 	 * be accessed. No global variables or other kernel data structures
1321 	 * are guaranteed to be coherent after unpack starts.
1322 	 *
1323 	 * The image is now in high memory (pig area), we unpack from the pig
1324 	 * to the correct location in memory. We'll eventually end up copying
1325 	 * on top of ourself, but we are assured the kernel code here is the
1326 	 * same between the hibernated and resuming kernel, and we are running
1327 	 * on our own stack, so the overwrite is ok.
1328 	 */
1329 	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
1330 	hibernate_activate_resume_pt_machdep();
1331 
1332 	for (i = 0; i < local_hib.chunk_ctr; i++) {
1333 		/* Reset zlib for inflate */
1334 		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
1335 			panic("hibernate failed to reset zlib for inflate");
1336 
1337 		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
1338 		    image_cur);
1339 
1340 		image_cur += chunks[fchunks[i]].compressed_size;
1341 
1342 	}
1343 
1344 	/*
1345 	 * Resume the loaded kernel by jumping to the MD resume vector.
1346 	 * We won't be returning from this call.
1347 	 */
1348 	hibernate_resume_machdep();
1349 }
1350 
1351 /*
1352  * Bounce a compressed image chunk to the piglet, entering mappings for the
1353  * copied pages as needed
1354  */
1355 void
1356 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1357 {
1358 	size_t ct, ofs;
1359 	paddr_t src = img_cur;
1360 	vaddr_t dest = piglet;
1361 
1362 	/* Copy first partial page */
1363 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1364 	ofs = (src & PAGE_MASK);
1365 
1366 	if (ct < PAGE_SIZE) {
1367 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1368 			(src - ofs), 0);
1369 		hibernate_flush();
1370 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1371 		src += ct;
1372 		dest += ct;
1373 	}
1374 
1375 	/* Copy remaining pages */
1376 	while (src < size + img_cur) {
1377 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1378 		hibernate_flush();
1379 		ct = PAGE_SIZE;
1380 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1381 		hibernate_flush();
1382 		src += ct;
1383 		dest += ct;
1384 	}
1385 }
1386 
1387 /*
1388  * Process a chunk by bouncing it to the piglet, followed by unpacking
1389  */
1390 void
1391 hibernate_process_chunk(union hibernate_info *hib,
1392     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1393 {
1394 	char *pva = (char *)hib->piglet_va;
1395 
1396 	hibernate_copy_chunk_to_piglet(img_cur,
1397 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1398 	hibernate_inflate_region(hib, chunk->base,
1399 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1400 	    chunk->compressed_size);
1401 }
1402 
1403 /*
1404  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1405  * inaddr and range_end.
1406  */
1407 int
1408 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1409 {
1410 	int rle;
1411 
1412 	rle = uvm_page_rle(inaddr);
1413 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1414 
1415 	/* Clamp RLE to range end */
1416 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1417 		rle = (range_end - inaddr) / PAGE_SIZE;
1418 
1419 	return (rle);
1420 }
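
/*
 * Worked example (added for clarity, not part of the original file):
 * if uvm_page_rle(inaddr) reports a run of 10 free pages but range_end
 * lies only 3 pages past inaddr, the run is clamped to
 * (range_end - inaddr) / PAGE_SIZE == 3, so an RLE never extends past
 * the end of the chunk currently being written.
 */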
1421 
1422 /*
1423  * Write the RLE byte for page at 'inaddr' to the output stream.
1424  * Returns the number of pages to be skipped at 'inaddr'.
1425  */
1426 int
1427 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1428 	paddr_t range_end, daddr_t *blkctr,
1429 	size_t *out_remaining)
1430 {
1431 	int rle, err, *rleloc;
1432 	struct hibernate_zlib_state *hibernate_state;
1433 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1434 
1435 	hibernate_state =
1436 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1437 
1438 	rle = hibernate_calc_rle(inaddr, range_end);
1439 
1440 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1441 	*rleloc = rle;
1442 
1443 	/* Deflate the RLE byte into the stream */
1444 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1445 
1446 	/* Did we fill the output page? If so, flush to disk */
1447 	if (*out_remaining == 0) {
1448 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1449 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1450 			hib->io_page))) {
1451 				DPRINTF("hib write error %d\n", err);
1452 				return (err);
1453 		}
1454 
1455 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1456 		*out_remaining = PAGE_SIZE;
1457 
1458 		/* If we didn't deflate the entire RLE byte, finish it now */
1459 		if (hibernate_state->hib_stream.avail_in != 0)
1460 			hibernate_deflate(hib,
1461 				(vaddr_t)hibernate_state->hib_stream.next_in,
1462 				out_remaining);
1463 	}
1464 
1465 	return (rle);
1466 }
1467 
1468 /*
1469  * Write a compressed version of this machine's memory to disk, at the
1470  * precalculated swap offset:
1471  *
1472  * end of swap - signature block size - chunk table size - memory size
1473  *
1474  * The function begins by looping through each phys mem range, cutting each
1475  * one into MD sized chunks. These chunks are then compressed individually
1476  * and written out to disk, in phys mem order. Some chunks might compress
1477  * more than others, and for this reason, each chunk's size is recorded
1478  * in the chunk table, which is written to disk after the image has
1479  * properly been compressed and written (in hibernate_write_chunktable).
1480  *
1481  * When this function is called, the machine is nearly suspended - most
1482  * devices are quiesced/suspended, interrupts are off, and cold has
1483  * been set. This means that there can be no side effects once the
1484  * write has started, and the write function itself can also have no
1485  * side effects. This also means no printfs are permitted (since printf
1486  * has side effects.)
1487  *
1488  * Return values :
1489  *
1490  * 0      - success
1491  * EIO    - I/O error occurred writing the chunks
1492  * EINVAL - Failed to write a complete range
1493  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1494  */
1495 int
1496 hibernate_write_chunks(union hibernate_info *hib)
1497 {
1498 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1499 	size_t nblocks, out_remaining, used;
1500 	struct hibernate_disk_chunk *chunks;
1501 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1502 	daddr_t blkctr = 0;
1503 	int i, rle, err;
1504 	struct hibernate_zlib_state *hibernate_state;
1505 
1506 	hibernate_state =
1507 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1508 
1509 	hib->chunk_ctr = 0;
1510 
1511 	/*
1512 	 * Allocate VA for the temp and copy page.
1513 	 *
1514 	 * These will become part of the suspended kernel and will
1515 	 * be freed in hibernate_free, upon resume.
1516 	 */
1517 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1518 	    &kp_none, &kd_nowait);
1519 	if (!hibernate_temp_page) {
1520 		DPRINTF("out of memory allocating hibernate_temp_page\n");
1521 		return (ENOMEM);
1522 	}
1523 
1524 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1525 	    &kp_none, &kd_nowait);
1526 	if (!hibernate_copy_page) {
1527 		DPRINTF("out of memory allocating hibernate_copy_page\n");
1528 		return (ENOMEM);
1529 	}
1530 
1531 	hibernate_rle_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1532 	    &kp_none, &kd_nowait);
1533 	if (!hibernate_rle_page) {
1534 		DPRINTF("out of memory allocating hibernate_rle_page\n");
1535 		return (ENOMEM);
1536 	}
1537 
1538 	/*
1539 	 * Map the utility VAs to the piglet. See the piglet map at the
1540 	 * top of this file for piglet layout information.
1541 	 */
1542 	pmap_kenter_pa(hibernate_copy_page,
1543 		(hib->piglet_pa + 3 * PAGE_SIZE), VM_PROT_ALL);
1544 	pmap_kenter_pa(hibernate_rle_page,
1545 		(hib->piglet_pa + 28 * PAGE_SIZE), VM_PROT_ALL);
1546 
1547 	pmap_activate(curproc);
1548 
1549 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1550 	    HIBERNATE_CHUNK_SIZE);
1551 
1552 	/* Calculate the chunk regions */
1553 	for (i = 0; i < hib->nranges; i++) {
1554 		range_base = hib->ranges[i].base;
1555 		range_end = hib->ranges[i].end;
1556 
1557 		inaddr = range_base;
1558 
1559 		while (inaddr < range_end) {
1560 			chunks[hib->chunk_ctr].base = inaddr;
1561 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1562 				chunks[hib->chunk_ctr].end = inaddr +
1563 				    HIBERNATE_CHUNK_SIZE;
1564 			else
1565 				chunks[hib->chunk_ctr].end = range_end;
1566 
1567 			inaddr += HIBERNATE_CHUNK_SIZE;
1568 			hib->chunk_ctr ++;
1569 		}
1570 	}
1571 
1572 	uvm_pmr_dirty_everything();
1573 	uvm_pmr_zero_everything();
1574 
1575 	/* Compress and write the chunks in the chunktable */
1576 	for (i = 0; i < hib->chunk_ctr; i++) {
1577 		range_base = chunks[i].base;
1578 		range_end = chunks[i].end;
1579 
1580 		chunks[i].offset = blkctr + hib->image_offset;
1581 
1582 		/* Reset zlib for deflate */
1583 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1584 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1585 			return (ENOMEM);
1586 		}
1587 
1588 		inaddr = range_base;
1589 
1590 		/*
1591 		 * For each range, loop through its phys mem region
1592 		 * and write out the chunks (the last chunk might be
1593 		 * smaller than the chunk size).
1594 		 */
1595 		while (inaddr < range_end) {
1596 			out_remaining = PAGE_SIZE;
1597 			while (out_remaining > 0 && inaddr < range_end) {
1598 				/*
1599 				 * Adjust for regions that are not evenly
1600 				 * divisible by PAGE_SIZE or overflowed
1601 				 * pages from the previous iteration.
1602 				 */
1603 				temp_inaddr = (inaddr & PAGE_MASK) +
1604 				    hibernate_copy_page;
1605 
1606 				/* Deflate from temp_inaddr to IO page */
1607 				if (inaddr != range_end) {
1608 					if (inaddr % PAGE_SIZE == 0) {
1609 						rle = hibernate_write_rle(hib,
1610 							inaddr,
1611 							range_end,
1612 							&blkctr,
1613 							&out_remaining);
1614 					}
1615 
1616 					if (rle == 0) {
1617 						pmap_kenter_pa(hibernate_temp_page,
1618 							inaddr & PMAP_PA_MASK,
1619 							VM_PROT_ALL);
1620 
1621 						pmap_activate(curproc);
1622 
1623 						bcopy((caddr_t)hibernate_temp_page,
1624 							(caddr_t)hibernate_copy_page,
1625 							PAGE_SIZE);
1626 						inaddr += hibernate_deflate(hib,
1627 							temp_inaddr,
1628 							&out_remaining);
1629 					} else {
1630 						inaddr += rle * PAGE_SIZE;
1631 						if (inaddr > range_end)
1632 							inaddr = range_end;
1633 					}
1634 
1635 				}
1636 
1637 				if (out_remaining == 0) {
1638 					/* Filled up the page */
1639 					nblocks = PAGE_SIZE / DEV_BSIZE;
1640 
1641 					if ((err = hib->io_func(hib->dev,
1642 					    blkctr + hib->image_offset,
1643 					    (vaddr_t)hibernate_io_page,
1644 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1645 						DPRINTF("hib write error %d\n",
1646 						    err);
1647 						return (err);
1648 					}
1649 
1650 					blkctr += nblocks;
1651 				}
1652 			}
1653 		}
1654 
1655 		if (inaddr != range_end) {
1656 			DPRINTF("deflate range ended prematurely\n");
1657 			return (EINVAL);
1658 		}
1659 
1660 		/*
1661 		 * End of range. Round up to the next secsize boundary
1662 		 * after finishing compression.
1663 		 */
1664 		if (out_remaining == 0)
1665 			out_remaining = PAGE_SIZE;
1666 
1667 		/* Finish compress */
1668 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1669 		hibernate_state->hib_stream.avail_in = 0;
1670 		hibernate_state->hib_stream.next_out =
1671 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1672 
1673 		/* We have an extra output page available for finalize */
1674 		hibernate_state->hib_stream.avail_out =
1675 			out_remaining + PAGE_SIZE;
1676 
1677 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1678 		    Z_STREAM_END) {
1679 			DPRINTF("deflate error in output stream: %d\n", err);
1680 			return (err);
1681 		}
1682 
1683 		out_remaining = hibernate_state->hib_stream.avail_out;
1684 
1685 		used = 2*PAGE_SIZE - out_remaining;
1686 		nblocks = used / DEV_BSIZE;
1687 
1688 		/* Round up to next block if needed */
1689 		if (used % DEV_BSIZE != 0)
1690 			nblocks ++;
1691 
1692 		/* Write final block(s) for this chunk */
1693 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1694 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1695 		    HIB_W, hib->io_page))) {
1696 			DPRINTF("hib final write error %d\n", err);
1697 			return (err);
1698 		}
1699 
1700 		blkctr += nblocks;
1701 
1702 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1703 		    chunks[i].offset) * DEV_BSIZE;
1704 	}
1705 
1706 	hib->chunktable_offset = hib->image_offset + blkctr;
1707 	return (0);
1708 }
1709 
1710 /*
1711  * Reset the zlib stream state and allocate a new hiballoc area for either
1712  * inflate or deflate. This function is called once for each hibernate chunk.
1713  * Calling hiballoc_init multiple times is acceptable since the memory it is
1714  * provided is unmanaged memory (stolen). We use the memory provided to us
1715  * by the piglet allocated via the supplied hib.
1716  */
1717 int
1718 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1719 {
1720 	vaddr_t hibernate_zlib_start;
1721 	size_t hibernate_zlib_size;
1722 	char *pva = (char *)hib->piglet_va;
1723 	struct hibernate_zlib_state *hibernate_state;
1724 
1725 	hibernate_state =
1726 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1727 
1728 	if (!deflate)
1729 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1730 
1731 	/*
1732 	 * See piglet layout information at the start of this file for
1733 	 * information on the zlib page assignments.
1734 	 */
1735 	hibernate_zlib_start = (vaddr_t)(pva + (29 * PAGE_SIZE));
1736 	hibernate_zlib_size = 80 * PAGE_SIZE;
1737 
1738 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1739 	memset(hibernate_state, 0, PAGE_SIZE);
1740 
1741 	/* Set up stream structure */
1742 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1743 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1744 
1745 	/* Initialize the hiballoc arena for zlib allocs/frees */
1746 	hiballoc_init(&hibernate_state->hiballoc_arena,
1747 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1748 
1749 	if (deflate) {
1750 		return deflateInit(&hibernate_state->hib_stream,
1751 		    Z_BEST_SPEED);
1752 	} else
1753 		return inflateInit(&hibernate_state->hib_stream);
1754 }
1755 
1756 /*
1757  * Reads the hibernated memory image from disk, whose location and
1758  * size are recorded in hib. Begin by reading the persisted
1759  * chunk table, which records the original chunk placement location
1760  * and compressed size for each. Next, allocate a pig region of
1761  * sufficient size to hold the compressed image. Next, read the
1762  * chunks into the pig area (calling hibernate_read_chunks to do this),
1763  * and finally, if all of the above succeeds, clear the hibernate signature.
1764  * The function will then return to hibernate_resume, which will proceed
1765  * to unpack the pig image to the correct place in memory.
1766  */
1767 int
1768 hibernate_read_image(union hibernate_info *hib)
1769 {
1770 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1771 	paddr_t image_start, image_end, pig_start, pig_end;
1772 	struct hibernate_disk_chunk *chunks;
1773 	daddr_t blkctr;
1774 	vaddr_t chunktable = (vaddr_t)NULL;
1775 	paddr_t piglet_chunktable = hib->piglet_pa +
1776 	    HIBERNATE_CHUNK_SIZE;
1777 	int i, status;
1778 
1779 	status = 0;
1780 	pmap_activate(curproc);
1781 
1782 	/* Calculate total chunk table size in disk blocks */
1783 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1784 
1785 	blkctr = hib->chunktable_offset;
1786 
1787 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1788 	    &kp_none, &kd_nowait);
1789 
1790 	if (!chunktable)
1791 		return (1);
1792 
1793 	/* Map chunktable pages */
1794 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1795 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1796 		    VM_PROT_ALL);
1797 	pmap_update(pmap_kernel());
1798 
1799 	/* Read the chunktable from disk into the piglet chunktable */
1800 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1801 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1802 		hibernate_block_io(hib, blkctr, MAXPHYS,
1803 		    chunktable + i, 0);
1804 
1805 	blkctr = hib->image_offset;
1806 	compressed_size = 0;
1807 
1808 	chunks = (struct hibernate_disk_chunk *)chunktable;
1809 
1810 	for (i = 0; i < hib->chunk_ctr; i++)
1811 		compressed_size += chunks[i].compressed_size;
1812 
1813 	disk_size = compressed_size;
1814 
1815 	printf("unhibernating @ block %lld length %lu bytes\n",
1816 	    hib->sig_offset - chunktable_size,
1817 	    compressed_size);
1818 
1819 	/* Allocate the pig area */
1820 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1821 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM) {
1822 		status = 1;
1823 		goto unmap;
1824 	}
1825 
1826 	pig_end = pig_start + pig_sz;
1827 
1828 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1829 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1830 	image_start = image_end - disk_size;
1831 
1832 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1833 	    chunks);
1834 
1835 	/* Prepare the resume time pmap/page table */
1836 	hibernate_populate_resume_pt(hib, image_start, image_end);
1837 
1838 unmap:
1839 	/* Unmap chunktable pages */
1840 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1841 	pmap_update(pmap_kernel());
1842 
1843 	return (status);
1844 }
1845 
1846 /*
1847  * Read the hibernated memory chunks from disk (chunk information at this
1848  * point is stored in the piglet) into the pig area specified by
1849  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1850  * only chunk with overlap possibilities.
1851  */
1852 int
1853 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1854     paddr_t pig_end, size_t image_compr_size,
1855     struct hibernate_disk_chunk *chunks)
1856 {
1857 	paddr_t img_cur, piglet_base;
1858 	daddr_t blkctr;
1859 	size_t processed, compressed_size, read_size;
1860 	int nchunks, nfchunks, num_io_pages;
1861 	vaddr_t tempva, hibernate_fchunk_area;
1862 	short *fchunks, i, j;
1863 
1864 	tempva = (vaddr_t)NULL;
1865 	hibernate_fchunk_area = (vaddr_t)NULL;
1866 	nfchunks = 0;
1867 	piglet_base = hib->piglet_pa;
1868 	global_pig_start = pig_start;
1869 
1870 	pmap_activate(curproc);
1871 
1872 	/*
1873 	 * These mappings go into the resuming kernel's page table, and are
1874 	 * used only during image read. They disappear from existence
1875 	 * when the suspended kernel is unpacked on top of us.
1876 	 */
1877 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1878 		&kd_nowait);
1879 	if (!tempva)
1880 		return (1);
1881 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1882 	    &kp_none, &kd_nowait);
1883 	if (!hibernate_fchunk_area)
1884 		return (1);
1885 
1886 	/* Final output chunk ordering VA */
1887 	fchunks = (short *)hibernate_fchunk_area;
1888 
1889 	/* Map the chunk ordering region */
1890 	for (i = 0; i < 24; i++)
1891 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1892 			piglet_base + ((4 + i) * PAGE_SIZE), VM_PROT_ALL);
1893 	pmap_update(pmap_kernel());
1894 
1895 	nchunks = hib->chunk_ctr;
1896 
1897 	/* Initially start all chunks as unplaced */
1898 	for (i = 0; i < nchunks; i++)
1899 		chunks[i].flags = 0;
1900 
1901 	/*
1902 	 * Search the list for chunks that are outside the pig area. These
1903 	 * can be placed first in the final output list.
1904 	 */
1905 	for (i = 0; i < nchunks; i++) {
1906 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1907 			fchunks[nfchunks] = i;
1908 			nfchunks++;
1909 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1910 		}
1911 	}
1912 
1913 	/*
1914 	 * Place the remaining (not yet placed) chunks in ascending memory order.
1915 	 */
1916 	for (i = 0; i < nchunks; i++) {
1917 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1918 			fchunks[nfchunks] = i;
1919 			nfchunks++;
1920 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1921 		}
1922 	}
1923 
1924 	img_cur = pig_start;
1925 
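	/*
	 * Read each chunk, in the final order, from disk into the pig,
	 * packing the compressed data contiguously starting at pig_start.
	 */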
1926 	for (i = 0; i < nfchunks; i++) {
1927 		blkctr = chunks[fchunks[i]].offset;
1928 		processed = 0;
1929 		compressed_size = chunks[fchunks[i]].compressed_size;
1930 
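		/* Read this chunk from disk in MAXPHYS-sized pieces */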
1931 		while (processed < compressed_size) {
1932 			if (compressed_size - processed >= MAXPHYS)
1933 				read_size = MAXPHYS;
1934 			else
1935 				read_size = compressed_size - processed;
1936 
1937 			/*
1938 			 * We're reading read_size bytes, offset from the
1939 			 * start of a page by img_cur % PAGE_SIZE, so the
1940 			 * end will be read_size + (img_cur % PAGE_SIZE)
1941 			 * from the start of the first page.  Round that
1942 			 * up to the next page size.
1943 			 */
1944 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1945 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1946 
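			/*
			 * E.g., with a typical MAXPHYS of 64KB and PAGE_SIZE
			 * of 4KB, a full-sized read can span at most 17 pages,
			 * which is the bound asserted below.
			 */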
1947 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1948 
1949 			/* Map pages for this read */
1950 			for (j = 0; j < num_io_pages; j++)
1951 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1952 					img_cur + j * PAGE_SIZE, VM_PROT_ALL);
1953 
1954 			pmap_update(pmap_kernel());
1955 
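			/* Read this piece into the pages just mapped */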
1956 			hibernate_block_io(hib, blkctr, read_size,
1957 			    tempva + (img_cur & PAGE_MASK), 0);
1958 
1959 			blkctr += (read_size / DEV_BSIZE);
1960 
1961 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1962 			pmap_update(pmap_kernel());
1963 
1964 			processed += read_size;
1965 			img_cur += read_size;
1966 		}
1967 	}
1968 
1969 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1970 	pmap_update(pmap_kernel());
1971 
1972 	return (0);
1973 }
1974 
1975 /*
1976  * Hibernating a machine comprises the following operations:
1977  *  1. Calculating this machine's hibernate_info information
1978  *  2. Allocating a piglet and saving the piglet's physaddr
1979  *  3. Calculating the memory chunks
1980  *  4. Writing the compressed chunks to disk
1981  *  5. Writing the chunk table
1982  *  6. Writing the signature block (hibernate_info)
1983  *
1984  * On most architectures, the function calling hibernate_suspend would
1985  * then power off the machine using some MD-specific implementation.
1986  */
1987 int
1988 hibernate_suspend(void)
1989 {
1990 	union hibernate_info hib;
1991 	u_long start, end;
1992 
1993 	/*
1994 	 * Calculate memory ranges, swap offsets, etc.
1995 	 * This also allocates a piglet whose physaddr is stored in
1996 	 * hib.piglet_pa and whose vaddr is stored in hib.piglet_va.
1997 	 */
1998 	if (get_hibernate_info(&hib, 1)) {
1999 		DPRINTF("failed to obtain hibernate info\n");
2000 		return (1);
2001 	}
2002 
2003 	/* Find a page-addressed region in swap [start,end] */
2004 	if (uvm_hibswap(hib.dev, &start, &end)) {
2005 		printf("hibernate: cannot find any swap\n");
2006 		return (1);
2007 	}
2008 
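	/* Require a minimal number of swap pages to hold the image */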
2009 	if (end - start < 1000) {
2010 		printf("hibernate: insufficient swap (%lu is too small)\n",
2011 			end - start);
2012 		return (1);
2013 	}
2014 
2015 	/* Calculate block offsets in swap */
2016 	hib.image_offset = ctod(start);
2017 
2018 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
2019 	    hib.image_offset, ctod(end) - ctod(start));
2020 
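	/* Identity map (va == pa) the hiballoc page */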
2021 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
2022 		VM_PROT_ALL);
2023 	pmap_activate(curproc);
2024 
2025 	/* Stash the piglet VA so we can free it in the resuming kernel */
2026 	global_piglet_va = hib.piglet_va;
2027 
2028 	DPRINTF("hibernate: writing chunks\n");
2029 	if (hibernate_write_chunks(&hib)) {
2030 		DPRINTF("hibernate_write_chunks failed\n");
2031 		return (1);
2032 	}
2033 
2034 	DPRINTF("hibernate: writing chunktable\n");
2035 	if (hibernate_write_chunktable(&hib)) {
2036 		DPRINTF("hibernate_write_chunktable failed\n");
2037 		return (1);
2038 	}
2039 
2040 	DPRINTF("hibernate: writing signature\n");
2041 	if (hibernate_write_signature(&hib)) {
2042 		DPRINTF("hibernate_write_signature failed\n");
2043 		return (1);
2044 	}
2045 
2046 	/* Allow the disk to settle */
2047 	delay(500000);
2048 
2049 	/*
2050 	 * Give the device-specific I/O function a notification that we're
2051 	 * done, and that it can clean up or shutdown as needed.
2052 	 * done, and that it can clean up or shut down as needed.
2053 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
2054 
2055 	return (0);
2056 }
2057 
2058 /*
2059  * Free items allocated by hibernate_suspend()
2060  */
2061 void
2062 hibernate_free(void)
2063 {
2064 	if (global_piglet_va)
2065 		uvm_pmr_free_piglet(global_piglet_va,
2066 		    4 * HIBERNATE_CHUNK_SIZE);
2067 
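	/* Unmap the utility pages before their VA ranges are freed below */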
2068 	if (hibernate_copy_page)
2069 		pmap_kremove(hibernate_copy_page, PAGE_SIZE);
2070 	if (hibernate_temp_page)
2071 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
2072 	if (hibernate_rle_page)
2073 		pmap_kremove(hibernate_rle_page, PAGE_SIZE);
2074 
2075 	pmap_update(pmap_kernel());
2076 
2077 	if (hibernate_copy_page)
2078 		km_free((void *)hibernate_copy_page, PAGE_SIZE,
2079 		    &kv_any, &kp_none);
2080 	if (hibernate_temp_page)
2081 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
2082 		    &kv_any, &kp_none);
2083 	if (hibernate_rle_page)
2084 		km_free((void *)hibernate_rle_page, PAGE_SIZE,
2085 		    &kv_any, &kp_none);
2086 
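	/* Reset the globals so a later call becomes a no-op */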
2087 	global_piglet_va = 0;
2088 	hibernate_copy_page = 0;
2089 	hibernate_temp_page = 0;
2090 	hibernate_rle_page = 0;
2091 }
2092