1 /*	$OpenBSD: subr_hibernate.c,v 1.46 2012/07/19 18:07:03 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <uvm/uvm_swap.h>
34 #include <machine/hibernate.h>
35 
36 /* Temporary vaddr ranges used during hibernate */
37 vaddr_t hibernate_temp_page;
38 vaddr_t hibernate_copy_page;
39 
40 /* Hibernate info as read from disk during resume */
41 union hibernate_info disk_hiber_info;
42 paddr_t global_pig_start;
43 vaddr_t global_piglet_va;
44 
45 /*
46  * Hib alloc enforced alignment.
47  */
48 #define HIB_ALIGN		8 /* bytes alignment */
49 
50 /*
51  * sizeof builtin operation, but with alignment constraint.
52  */
53 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
54 
55 struct hiballoc_entry {
56 	size_t			hibe_use;
57 	size_t			hibe_space;
58 	RB_ENTRY(hiballoc_entry) hibe_entry;
59 };
60 
61 /*
62  * Compare hiballoc entries based on the address they manage.
63  *
64  * Since the address is at a fixed offset relative to struct hiballoc_entry,
65  * we just compare the hiballoc_entry pointers.
66  */
67 static __inline int
68 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
69 {
70 	return l < r ? -1 : (l > r);
71 }
72 
73 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
74 
75 /*
76  * Given a hiballoc entry, return the address it manages.
77  */
78 static __inline void *
79 hib_entry_to_addr(struct hiballoc_entry *entry)
80 {
81 	caddr_t addr;
82 
83 	addr = (caddr_t)entry;
84 	addr += HIB_SIZEOF(struct hiballoc_entry);
85 	return addr;
86 }
87 
88 /*
89  * Given an address, find the hiballoc entry that corresponds to it.
90  */
91 static __inline struct hiballoc_entry*
92 hib_addr_to_entry(void *addr_param)
93 {
94 	caddr_t addr;
95 
96 	addr = (caddr_t)addr_param;
97 	addr -= HIB_SIZEOF(struct hiballoc_entry);
98 	return (struct hiballoc_entry*)addr;
99 }
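
/*
 * Illustrative layout of a single allocation, as implied by the two
 * helpers above (a sketch only, not to scale):
 *
 *	+--------------------------+-----------------+------------------+
 *	| struct hiballoc_entry    | hibe_use bytes  | hibe_space bytes |
 *	| (HIB_SIZEOF() rounded)   | in use          | free tail        |
 *	+--------------------------+-----------------+------------------+
 *	^                          ^
 *	entry                      hib_entry_to_addr(entry)
 */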
100 
101 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
102 
103 /*
104  * Allocate memory from the arena.
105  *
106  * Returns NULL if no memory is available.
107  */
108 void *
109 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
110 {
111 	struct hiballoc_entry *entry, *new_entry;
112 	size_t find_sz;
113 
114 	/*
115 	 * Enforce alignment of HIB_ALIGN bytes.
116 	 *
117 	 * Note that, because the entry is put in front of the allocation,
118 	 * 0-byte allocations are guaranteed a unique address.
119 	 */
120 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
121 
122 	/*
123 	 * Find an entry with hibe_space >= find_sz.
124 	 *
125 	 * If the root node is not large enough, we switch to tree traversal.
126 	 * Because all entries are made at the bottom of the free space,
127 	 * traversal from the end has a slightly better chance of yielding
128 	 * a sufficiently large space.
129 	 */
130 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
131 	entry = RB_ROOT(&arena->hib_addrs);
132 	if (entry != NULL && entry->hibe_space < find_sz) {
133 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
134 			if (entry->hibe_space >= find_sz)
135 				break;
136 		}
137 	}
138 
139 	/*
140 	 * Insufficient or too fragmented memory.
141 	 */
142 	if (entry == NULL)
143 		return NULL;
144 
145 	/*
146 	 * Create new entry in allocated space.
147 	 */
148 	new_entry = (struct hiballoc_entry*)(
149 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
150 	new_entry->hibe_space = entry->hibe_space - find_sz;
151 	new_entry->hibe_use = alloc_sz;
152 
153 	/*
154 	 * Insert entry.
155 	 */
156 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
157 		panic("hib_alloc: insert failure");
158 	entry->hibe_space = 0;
159 
160 	/* Return address managed by entry. */
161 	return hib_entry_to_addr(new_entry);
162 }
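
/*
 * For example (a sketch of the arithmetic only): hib_alloc(arena, 5)
 * rounds the request up to HIB_ALIGN (8 bytes) and searches for an entry
 * with at least 8 + HIB_SIZEOF(struct hiballoc_entry) bytes of
 * hibe_space; the new entry is carved out directly behind the found
 * entry's in-use region.
 */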
163 
164 /*
165  * Free a pointer previously allocated from this arena.
166  *
167  * If addr is NULL, this will be silently accepted.
168  */
169 void
170 hib_free(struct hiballoc_arena *arena, void *addr)
171 {
172 	struct hiballoc_entry *entry, *prev;
173 
174 	if (addr == NULL)
175 		return;
176 
177 	/*
178 	 * Derive entry from addr and check it is really in this arena.
179 	 */
180 	entry = hib_addr_to_entry(addr);
181 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
182 		panic("hib_free: freed item %p not in hib arena", addr);
183 
184 	/*
185 	 * Give the space in entry to its predecessor.
186 	 *
187 	 * If entry has no predecessor, change its used space into free space
188 	 * instead.
189 	 */
190 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
191 	if (prev != NULL &&
192 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
193 	    prev->hibe_use + prev->hibe_space) == entry) {
194 		/* Merge entry. */
195 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
196 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
197 		    entry->hibe_use + entry->hibe_space;
198 	} else {
199 		/* Flip used memory to free space. */
200 		entry->hibe_space += entry->hibe_use;
201 		entry->hibe_use = 0;
202 	}
203 }
204 
205 /*
206  * Initialize hiballoc.
207  *
208  * The allocator will manage memory at ptr, which is len bytes.
209  */
210 int
211 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
212 {
213 	struct hiballoc_entry *entry;
214 	caddr_t ptr;
215 	size_t len;
216 
217 	RB_INIT(&arena->hib_addrs);
218 
219 	/*
220 	 * Hib allocator enforces HIB_ALIGN alignment.
221 	 * Fixup ptr and len.
222 	 */
223 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
224 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
225 	len &= ~((size_t)HIB_ALIGN - 1);
226 
227 	/*
228 	 * Insufficient memory to be able to allocate and also do bookkeeping.
229 	 */
230 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
231 		return ENOMEM;
232 
233 	/*
234 	 * Create entry describing space.
235 	 */
236 	entry = (struct hiballoc_entry*)ptr;
237 	entry->hibe_use = 0;
238 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
239 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
240 
241 	return 0;
242 }
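
/*
 * A minimal usage sketch of the arena (illustrative only; the region and
 * sizes below are hypothetical):
 *
 *	struct hiballoc_arena arena;
 *	char region[16 * 1024];
 *	void *p;
 *
 *	if (hiballoc_init(&arena, region, sizeof(region)) == 0) {
 *		p = hib_alloc(&arena, 128);
 *		if (p != NULL)
 *			hib_free(&arena, p);
 *	}
 *
 * This is essentially the pattern hibernate_zlib_reset() uses below, with
 * the region taken from the piglet.
 */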
243 
244 /*
245  * Zero all free memory.
246  */
247 void
248 uvm_pmr_zero_everything(void)
249 {
250 	struct uvm_pmemrange	*pmr;
251 	struct vm_page		*pg;
252 	int			 i;
253 
254 	uvm_lock_fpageq();
255 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
256 		/* Zero single pages. */
257 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
258 		    != NULL) {
259 			uvm_pmr_remove(pmr, pg);
260 			uvm_pagezero(pg);
261 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
262 			uvmexp.zeropages++;
263 			uvm_pmr_insert(pmr, pg, 0);
264 		}
265 
266 		/* Zero multi page ranges. */
267 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
268 		    != NULL) {
269 			pg--; /* Size tree always has second page. */
270 			uvm_pmr_remove(pmr, pg);
271 			for (i = 0; i < pg->fpgsz; i++) {
272 				uvm_pagezero(&pg[i]);
273 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
274 				uvmexp.zeropages++;
275 			}
276 			uvm_pmr_insert(pmr, pg, 0);
277 		}
278 	}
279 	uvm_unlock_fpageq();
280 }
281 
282 /*
283  * Mark all memory as dirty.
284  *
285  * Used to inform the system that the clean memory isn't clean for some
286  * reason, for example because we just came back from hibernate.
287  */
288 void
289 uvm_pmr_dirty_everything(void)
290 {
291 	struct uvm_pmemrange	*pmr;
292 	struct vm_page		*pg;
293 	int			 i;
294 
295 	uvm_lock_fpageq();
296 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
297 		/* Dirty single pages. */
298 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
299 		    != NULL) {
300 			uvm_pmr_remove(pmr, pg);
301 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
302 			uvm_pmr_insert(pmr, pg, 0);
303 		}
304 
305 		/* Dirty multi page ranges. */
306 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
307 		    != NULL) {
308 			pg--; /* Size tree always has second page. */
309 			uvm_pmr_remove(pmr, pg);
310 			for (i = 0; i < pg->fpgsz; i++)
311 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
312 			uvm_pmr_insert(pmr, pg, 0);
313 		}
314 	}
315 
316 	uvmexp.zeropages = 0;
317 	uvm_unlock_fpageq();
318 }
319 
320 /*
321  * Allocate the highest range of physical memory that can hold sz.
322  *
323  * sz in bytes.
324  */
325 int
326 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
327 {
328 	struct uvm_pmemrange	*pmr;
329 	struct vm_page		*pig_pg, *pg;
330 
331 	/*
332 	 * Convert sz to pages, since that is what pmemrange uses internally.
333 	 */
334 	sz = atop(round_page(sz));
335 
336 	uvm_lock_fpageq();
337 
338 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
339 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
340 			if (pig_pg->fpgsz >= sz) {
341 				goto found;
342 			}
343 		}
344 	}
345 
346 	/*
347 	 * Allocation failure.
348 	 */
349 	uvm_unlock_fpageq();
350 	return ENOMEM;
351 
352 found:
353 	/* Remove page from freelist. */
354 	uvm_pmr_remove_size(pmr, pig_pg);
355 	pig_pg->fpgsz -= sz;
356 	pg = pig_pg + pig_pg->fpgsz;
357 	if (pig_pg->fpgsz == 0)
358 		uvm_pmr_remove_addr(pmr, pig_pg);
359 	else
360 		uvm_pmr_insert_size(pmr, pig_pg);
361 
362 	uvmexp.free -= sz;
363 	*addr = VM_PAGE_TO_PHYS(pg);
364 
365 	/*
366 	 * Update pg flags.
367 	 *
368 	 * Note that we trash the sz argument now.
369 	 */
370 	while (sz > 0) {
371 		KASSERT(pg->pg_flags & PQ_FREE);
372 
373 		atomic_clearbits_int(&pg->pg_flags,
374 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
375 
376 		if (pg->pg_flags & PG_ZERO)
377 			uvmexp.zeropages--;
378 		atomic_clearbits_int(&pg->pg_flags,
379 		    PG_ZERO|PQ_FREE);
380 
381 		pg->uobject = NULL;
382 		pg->uanon = NULL;
383 		pg->pg_version++;
384 
385 		/*
386 		 * Next.
387 		 */
388 		pg++;
389 		sz--;
390 	}
391 
392 	/* Return. */
393 	uvm_unlock_fpageq();
394 	return 0;
395 }
396 
397 /*
398  * Allocate a piglet area.
399  *
400  * This is as low as possible.
401  * Piglets are aligned.
402  *
403  * sz and align in bytes.
404  *
405  * The call may sleep while the pagedaemon attempts to free memory.
406  * The pagedaemon may decide it is not possible to free enough memory,
407  * causing the allocation to fail.
408  */
409 int
410 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
411 {
412 	paddr_t			 pg_addr, piglet_addr;
413 	struct uvm_pmemrange	*pmr;
414 	struct vm_page		*pig_pg, *pg;
415 	struct pglist		 pageq;
416 	int			 pdaemon_woken;
417 	vaddr_t			 piglet_va;
418 
419 	KASSERT((align & (align - 1)) == 0);
420 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
421 
422 	/*
423 	 * Fixup arguments: align must be at least PAGE_SIZE,
424 	 * sz will be converted to pagecount, since that is what
425 	 * pmemrange uses internally.
426 	 */
427 	if (align < PAGE_SIZE)
428 		align = PAGE_SIZE;
429 	sz = round_page(sz);
430 
431 	uvm_lock_fpageq();
432 
433 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
434 	    pmr_use) {
435 retry:
436 		/*
437 		 * Search for a range with enough space.
438 		 * Use the address tree, to ensure the range is as low as
439 		 * possible.
440 		 */
441 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
442 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
443 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
444 
445 			if (atop(pg_addr) + pig_pg->fpgsz >=
446 			    atop(piglet_addr) + atop(sz))
447 				goto found;
448 		}
449 	}
450 
451 	/*
452 	 * Try to coerce the pagedaemon into freeing memory
453 	 * for the piglet.
454 	 *
455 	 * pdaemon_woken is set to prevent the code from
456 	 * falling into an endless loop.
457 	 */
458 	if (!pdaemon_woken) {
459 		pdaemon_woken = 1;
460 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
461 		    sz, UVM_PLA_FAILOK) == 0)
462 			goto retry;
463 	}
464 
465 	/* Return failure. */
466 	uvm_unlock_fpageq();
467 	return ENOMEM;
468 
469 found:
470 	/*
471 	 * Extract piglet from pigpen.
472 	 */
473 	TAILQ_INIT(&pageq);
474 	uvm_pmr_extract_range(pmr, pig_pg,
475 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
476 
477 	*pa = piglet_addr;
478 	uvmexp.free -= atop(sz);
479 
480 	/*
481 	 * Update pg flags.
482 	 *
483 	 * Note that we trash the sz argument now.
484 	 */
485 	TAILQ_FOREACH(pg, &pageq, pageq) {
486 		KASSERT(pg->pg_flags & PQ_FREE);
487 
488 		atomic_clearbits_int(&pg->pg_flags,
489 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
490 
491 		if (pg->pg_flags & PG_ZERO)
492 			uvmexp.zeropages--;
493 		atomic_clearbits_int(&pg->pg_flags,
494 		    PG_ZERO|PQ_FREE);
495 
496 		pg->uobject = NULL;
497 		pg->uanon = NULL;
498 		pg->pg_version++;
499 	}
500 
501 	uvm_unlock_fpageq();
502 
503 	/*
504 	 * Now allocate a va.
505 	 * Use direct mappings for the pages.
506 	 */
507 
508 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
509 	if (!piglet_va) {
510 		uvm_pglistfree(&pageq);
511 		return ENOMEM;
512 	}
513 
514 	/*
515 	 * Map piglet to va.
516 	 */
517 	TAILQ_FOREACH(pg, &pageq, pageq) {
518 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
519 		piglet_va += PAGE_SIZE;
520 	}
521 	pmap_update(pmap_kernel());
522 
523 	return 0;
524 }
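
/*
 * For example, get_hibernate_info() below requests a piglet of
 * 3 * HIBERNATE_CHUNK_SIZE bytes aligned to HIBERNATE_CHUNK_SIZE;
 * piglet_addr is then the first align-rounded address inside a free
 * range large enough to hold the whole request.
 */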
525 
526 /*
527  * Free a piglet area.
528  */
529 void
530 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
531 {
532 	paddr_t			 pa;
533 	struct vm_page		*pg;
534 
535 	/*
536 	 * Fix parameters.
537 	 */
538 	sz = round_page(sz);
539 
540 	/*
541 	 * Find the first page in the piglet.
542 	 * Since piglets are contiguous, the first pg is all we need.
543 	 */
544 	if (!pmap_extract(pmap_kernel(), va, &pa))
545 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
546 	pg = PHYS_TO_VM_PAGE(pa);
547 	if (pg == NULL)
548 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
549 
550 	/*
551 	 * Unmap.
552 	 */
553 	pmap_kremove(va, sz);
554 	pmap_update(pmap_kernel());
555 
556 	/*
557 	 * Free the physical and virtual memory.
558 	 */
559 	uvm_pmr_freepages(pg, atop(sz));
560 	km_free((void *)va, sz, &kv_any, &kp_none);
561 }
562 
563 /*
564  * Physmem RLE compression support.
565  *
566  * Given a physical page address, return the number of consecutive free
567  * pages starting at that address.  The result is clamped to the number of
568  * pages in HIBERNATE_CHUNK_SIZE.  Returns 0 if the page at addr is not free.
569  */
570 int
571 uvm_page_rle(paddr_t addr)
572 {
573 	struct vm_page		*pg, *pg_end;
574 	struct vm_physseg	*vmp;
575 	int			 pseg_idx, off_idx;
576 
577 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
578 	if (pseg_idx == -1)
579 		return 0;
580 
581 	vmp = &vm_physmem[pseg_idx];
582 	pg = &vmp->pgs[off_idx];
583 	if (!(pg->pg_flags & PQ_FREE))
584 		return 0;
585 
586 	/*
587 	 * Search for the first non-free page after pg.
588 	 * Note that the page may not be the first page in a free pmemrange,
589 	 * therefore pg->fpgsz cannot be used.
590 	 */
591 	for (pg_end = pg; pg_end <= vmp->lastpg &&
592 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
593 		;
594 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
595 }
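
/*
 * For example (hypothetical layout): if the three pages starting at addr
 * are free and the fourth is not, uvm_page_rle(addr) returns 3; if the
 * page at addr itself is in use, it returns 0.
 */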
596 
597 /*
598  * Fills out the hibernate_info union pointed to by hiber_info
599  * with information about this machine (swap signature block
600  * offsets, number of memory ranges, kernel in use, etc)
601  */
602 int
603 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
604 {
605 	int chunktable_size;
606 	struct disklabel dl;
607 	char err_string[128], *dl_ret;
608 
609 	/* Determine I/O function to use */
610 	hiber_info->io_func = get_hibernate_io_function();
611 	if (hiber_info->io_func == NULL)
612 		return (1);
613 
614 	/* Calculate hibernate device */
615 	hiber_info->device = swdevt[0].sw_dev;
616 
617 	/* Read disklabel (used to calculate signature and image offsets) */
618 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
619 
620 	if (dl_ret) {
621 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
622 		return (1);
623 	}
624 
625 	/* Make sure we have a swap partition. */
626 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
627 	    dl.d_partitions[1].p_size == 0)
628 		return (1);
629 
630 	hiber_info->secsize = dl.d_secsize;
631 
632 	/* Make sure the signature can fit in one block */
633 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
634 
635 	/* Calculate swap offset from start of disk */
636 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
637 
638 	/* Calculate signature block location */
639 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
640 	    dl.d_partitions[1].p_size -
641 	    sizeof(union hibernate_info)/hiber_info->secsize;
642 
643 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
644 
645 	/* Stash kernel version information */
646 	bzero(&hiber_info->kernel_version, 128);
647 	bcopy(version, &hiber_info->kernel_version,
648 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
649 
650 	if (suspend) {
651 		/* Allocate piglet region */
652 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
653 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
654 		    HIBERNATE_CHUNK_SIZE)) {
655 			printf("Hibernate failed to allocate the piglet\n");
656 			return (1);
657 		}
658 		hiber_info->io_page = (void *)hiber_info->piglet_va;
659 
660 		/*
661 		 * Initialize the hibernate I/O function (for drivers that
662 		 * need it).
663 		 */
664 		if (hiber_info->io_func(hiber_info->device, 0,
665 		    (vaddr_t)NULL, 0, HIB_INIT, hiber_info->io_page))
666 			goto fail;
667 
668 	} else {
669 		/*
670 		 * Resuming kernels use a regular I/O page since we won't
671 		 * have access to the suspended kernel's piglet VA at this
672 		 * point. No need to free this I/O page as it will vanish
673 		 * as part of the resume.
674 		 */
675 		hiber_info->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
676 		if (!hiber_info->io_page)
677 			return (1);
678 	}
679 
680 
681 	if (get_hibernate_info_md(hiber_info))
682 		goto fail;
683 
684 	/* Calculate memory image location */
685 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
686 	    dl.d_partitions[1].p_size -
687 	    (hiber_info->image_size / hiber_info->secsize) -
688 	    sizeof(union hibernate_info)/hiber_info->secsize -
689 	    chunktable_size;
690 
691 	return (0);
692 fail:
693 	if (suspend)
694 		uvm_pmr_free_piglet(hiber_info->piglet_va, HIBERNATE_CHUNK_SIZE*3);
695 
696 	return (1);
697 }
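
/*
 * Resulting layout inside the swap partition, as computed above
 * (a sketch; sizes are not to scale):
 *
 *	+-- start of swap                                  end of swap --+
 *	| ...unused swap... | compressed image | chunk table | signature |
 *	                    ^ image_offset                    ^ sig_offset
 */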
698 
699 /*
700  * Allocate nitems*size bytes from the hiballoc area presently in use
701  */
702 void
703 *hibernate_zlib_alloc(void *unused, int nitems, int size)
704 {
705 	struct hibernate_zlib_state *hibernate_state;
706 
707 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
708 
709 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
710 }
711 
712 /*
713  * Free the memory pointed to by addr in the hiballoc area presently in
714  * use
715  */
716 void
717 hibernate_zlib_free(void *unused, void *addr)
718 {
719 	struct hibernate_zlib_state *hibernate_state;
720 
721 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
722 
723 	hib_free(&hibernate_state->hiballoc_arena, addr);
724 }
725 
726 /*
727  * Gets the next RLE value from the image stream
728  */
729 int
730 hibernate_get_next_rle(void)
731 {
732 	int rle, i;
733 	struct hibernate_zlib_state *hibernate_state;
734 
735 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
736 
737 	/* Read RLE code */
738 	hibernate_state->hib_stream.next_out = (char *)&rle;
739 	hibernate_state->hib_stream.avail_out = sizeof(rle);
740 
741 	i = inflate(&hibernate_state->hib_stream, Z_FULL_FLUSH);
742 	if (i != Z_OK && i != Z_STREAM_END) {
743 		/*
744 		 * XXX - this will likely reboot/hang most machines,
745 		 *       but there's not much else we can do here.
746 		 */
747 		panic("inflate rle error");
748 	}
749 
750 	/* Sanity check what RLE value we got */
751 	if (rle > HIBERNATE_CHUNK_SIZE/PAGE_SIZE || rle < 0)
752 		panic("invalid RLE code");
753 
754 	if (i == Z_STREAM_END)
755 		rle = -1;
756 
757 	return rle;
758 }
759 
760 /*
761  * Inflate next page of data from the image stream
762  */
763 int
764 hibernate_inflate_page(void)
765 {
766 	struct hibernate_zlib_state *hibernate_state;
767 	int i;
768 
769 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
770 
771 	/* Set up the stream for inflate */
772 	hibernate_state->hib_stream.next_out = (char *)HIBERNATE_INFLATE_PAGE;
773 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
774 
775 	/* Process next block of data */
776 	i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
777 	if (i != Z_OK && i != Z_STREAM_END) {
778 		/*
779 		 * XXX - this will likely reboot/hang most machines,
780 		 *       but there's not much else we can do here.
781 		 */
782 
783 		panic("inflate error");
784 	}
785 
786 	/* We should always have extracted a full page ... */
787 	if (hibernate_state->hib_stream.avail_out != 0)
788 		panic("incomplete page");
789 
790 	return (i == Z_STREAM_END);
791 }
792 
793 /*
794  * Inflate size bytes from src into dest, skipping any pages in
795  * [src..dest] that are special (see hibernate_inflate_skip)
796  *
797  * This function executes while using the resume-time stack
798  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
799  * will likely hang or reset the machine.
800  */
801 void
802 hibernate_inflate_region(union hibernate_info *hiber_info, paddr_t dest,
803     paddr_t src, size_t size)
804 {
805 	int end_stream = 0;
806 	struct hibernate_zlib_state *hibernate_state;
807 
808 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
809 
810 	hibernate_state->hib_stream.next_in = (char *)src;
811 	hibernate_state->hib_stream.avail_in = size;
812 
813 	do {
814 		/* Flush cache and TLB */
815 		hibernate_flush();
816 
817 		/*
818 		 * Is this a special page? If yes, redirect the
819 		 * inflate output to a scratch page (eg, discard it)
820 		 */
821 		if (hibernate_inflate_skip(hiber_info, dest)) {
822 			hibernate_enter_resume_mapping(
823 			    HIBERNATE_INFLATE_PAGE,
824 			    HIBERNATE_INFLATE_PAGE, 0);
825 		} else {
826 			hibernate_enter_resume_mapping(
827 			    HIBERNATE_INFLATE_PAGE, dest, 0);
828 		}
829 
830 		hibernate_flush();
831 		end_stream = hibernate_inflate_page();
832 
833 		dest += PAGE_SIZE;
834 	} while (!end_stream);
835 }
836 
837 /*
838  * Deflate from src into the I/O page, up to 'remaining' bytes.
839  *
840  * Returns the number of input bytes consumed, and updates 'remaining'
841  * to the amount of output space still available (this information is
842  * needed to know how much to write to disk).
843  */
844 size_t
845 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
846     size_t *remaining)
847 {
848 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
849 	struct hibernate_zlib_state *hibernate_state;
850 
851 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
852 
853 	/* Set up the stream for deflate */
854 	hibernate_state->hib_stream.next_in = (caddr_t)src;
855 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
856 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
857 	    (PAGE_SIZE - *remaining);
858 	hibernate_state->hib_stream.avail_out = *remaining;
859 
860 	/* Process next block of data */
861 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
862 		panic("hibernate zlib deflate error");
863 
864 	/* Update pointers and return number of bytes consumed */
865 	*remaining = hibernate_state->hib_stream.avail_out;
866 	return (PAGE_SIZE - (src & PAGE_MASK)) -
867 	    hibernate_state->hib_stream.avail_in;
868 }
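
/*
 * Typical call pattern (a sketch of how hibernate_write_chunks() below
 * drives this function): start with *remaining = PAGE_SIZE of space in
 * the I/O page, advance src by the return value after each call, and
 * write the I/O page to disk whenever *remaining reaches 0.
 */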
869 
870 /*
871  * Write the hibernation information specified in hiber_info
872  * to the location in swap previously calculated (last block of
873  * swap), called the "signature block".
877  */
878 int
879 hibernate_write_signature(union hibernate_info *hiber_info)
880 {
881 	/* Write hibernate info to disk */
882 	return (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
883 	    (vaddr_t)hiber_info, hiber_info->secsize, HIB_W,
884 	    hiber_info->io_page));
885 }
886 
887 /*
888  * Write the memory chunk table to the area in swap immediately
889  * preceding the signature block. The chunk table is stored
890  * in the piglet when this function is called.
891  */
892 int
893 hibernate_write_chunktable(union hibernate_info *hiber_info)
894 {
895 	struct hibernate_disk_chunk *chunks;
896 	vaddr_t hibernate_chunk_table_start;
897 	size_t hibernate_chunk_table_size;
898 	daddr_t chunkbase;
899 	int i;
900 
901 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
902 
903 	chunkbase = hiber_info->sig_offset -
904 	    (hibernate_chunk_table_size / hiber_info->secsize);
905 
906 	hibernate_chunk_table_start = hiber_info->piglet_va +
907 	    HIBERNATE_CHUNK_SIZE;
908 
909 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
910 	    HIBERNATE_CHUNK_SIZE);
911 
912 	/* Write chunk table */
913 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
914 		if (hiber_info->io_func(hiber_info->device,
915 		    chunkbase + (i/hiber_info->secsize),
916 		    (vaddr_t)(hibernate_chunk_table_start + i),
917 		    MAXPHYS, HIB_W, hiber_info->io_page))
918 			return (1);
919 	}
920 
921 	return (0);
922 }
923 
924 /*
925  * Write an empty hiber_info to the swap signature block, which is
926  * guaranteed to not match any valid hiber_info.
927  */
928 int
929 hibernate_clear_signature(void)
930 {
931 	union hibernate_info blank_hiber_info;
932 	union hibernate_info hiber_info;
933 
934 	/* Zero out a blank hiber_info */
935 	bzero(&blank_hiber_info, sizeof(hiber_info));
936 
937 	if (get_hibernate_info(&hiber_info, 0))
938 		return (1);
939 
940 	/* Write (zeroed) hibernate info to disk */
941 	if (hibernate_block_io(&hiber_info,
942 	    hiber_info.sig_offset - hiber_info.swap_offset,
943 	    hiber_info.secsize, (vaddr_t)&blank_hiber_info, 1))
944 		panic("error hibernate write 6");
945 
946 	return (0);
947 }
948 
949 /*
950  * Check chunk range overlap when calculating whether or not to copy a
951  * compressed chunk to the piglet area before decompressing.
952  *
953  * returns zero if the ranges do not overlap, non-zero otherwise.
954  */
955 int
956 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
957 {
958 	/* case A : end of r1 overlaps start of r2 */
959 	if (r1s < r2s && r1e > r2s)
960 		return (1);
961 
962 	/* case B : r1 entirely inside r2 */
963 	if (r1s >= r2s && r1e <= r2e)
964 		return (1);
965 
966 	/* case C : r2 entirely inside r1 */
967 	if (r2s >= r1s && r2e <= r1e)
968 		return (1);
969 
970 	/* case D : end of r2 overlaps start of r1 */
971 	if (r2s < r1s && r2e > r1s)
972 		return (1);
973 
974 	return (0);
975 }
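
/*
 * For example (hypothetical addresses):
 *
 *	hibernate_check_overlap(0x1000, 0x3000, 0x2000, 0x4000) == 1	case A
 *	hibernate_check_overlap(0x2000, 0x3000, 0x1000, 0x4000) == 1	case B
 *	hibernate_check_overlap(0x1000, 0x2000, 0x3000, 0x4000) == 0	disjoint
 */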
976 
977 /*
978  * Compare two hibernate_infos to determine if they are the same (i.e.,
979  * whether we should be performing a hibernate resume on this machine).
980  * Not all fields are checked - just enough to verify that the machine
981  * has the same memory configuration and kernel as the one that
982  * wrote the signature previously.
983  */
984 int
985 hibernate_compare_signature(union hibernate_info *mine,
986     union hibernate_info *disk)
987 {
988 	u_int i;
989 
990 	if (mine->nranges != disk->nranges)
991 		return (1);
992 
993 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
994 		return (1);
995 
996 	for (i = 0; i < mine->nranges; i++) {
997 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
998 		    (mine->ranges[i].end != disk->ranges[i].end))
999 			return (1);
1000 	}
1001 
1002 	return (0);
1003 }
1004 
1005 /*
1006  * Transfers xfer_size bytes between the hibernate device specified in
1007  * hib_info at offset blkctr and the vaddr specified at dest.
1008  *
1009  * Separate offsets and pages are used to handle misaligned reads (reads
1010  * that span a page boundary).
1011  *
1012  * blkctr specifies a relative offset (relative to the start of swap),
1013  * not an absolute disk offset.
1015  */
1016 int
1017 hibernate_block_io(union hibernate_info *hib_info, daddr_t blkctr,
1018     size_t xfer_size, vaddr_t dest, int iswrite)
1019 {
1020 	struct buf *bp;
1021 	struct bdevsw *bdsw;
1022 	int error;
1023 
1024 	bp = geteblk(xfer_size);
1025 	bdsw = &bdevsw[major(hib_info->device)];
1026 
1027 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
1028 	if (error) {
1029 		printf("hibernate_block_io open failed\n");
1030 		return (1);
1031 	}
1032 
1033 	if (iswrite)
1034 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
1035 
1036 	bp->b_bcount = xfer_size;
1037 	bp->b_blkno = blkctr;
1038 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1039 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
1040 	bp->b_dev = hib_info->device;
1041 	bp->b_cylinder = 0;
1042 	(*bdsw->d_strategy)(bp);
1043 
1044 	error = biowait(bp);
1045 	if (error) {
1046 		printf("hibernate_block_io biowait failed %d\n", error);
1047 		error = (*bdsw->d_close)(hib_info->device, 0, S_IFCHR,
1048 		    curproc);
1049 		if (error)
1050 			printf("hibernate_block_io error close failed\n");
1051 		return (1);
1052 	}
1053 
1054 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
1055 	if (error) {
1056 		printf("hibernate_block_io close failed\n");
1057 		return (1);
1058 	}
1059 
1060 	if (!iswrite)
1061 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
1062 
1063 	bp->b_flags |= B_INVAL;
1064 	brelse(bp);
1065 
1066 	return (0);
1067 }
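
/*
 * For instance, hibernate_resume() below reads the signature block with:
 *
 *	hibernate_block_io(&hiber_info,
 *	    hiber_info.sig_offset - hiber_info.swap_offset,
 *	    hiber_info.secsize, (vaddr_t)&disk_hiber_info, 0);
 *
 * i.e. a one-sector read at an offset relative to the start of swap.
 */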
1068 
1069 /*
1070  * Reads the signature block from swap, checks against the current machine's
1071  * information. If the information matches, perform a resume by reading the
1072  * saved image into the pig area, and unpacking.
1073  */
1074 void
1075 hibernate_resume(void)
1076 {
1077 	union hibernate_info hiber_info;
1078 	int s;
1079 
1080 	/* Get current running machine's hibernate info */
1081 	bzero(&hiber_info, sizeof(hiber_info));
1082 	if (get_hibernate_info(&hiber_info, 0))
1083 		return;
1084 
1085 	/* Read hibernate info from disk */
1086 	s = splbio();
1087 
1088 	if (hibernate_block_io(&hiber_info,
1089 	    hiber_info.sig_offset - hiber_info.swap_offset,
1090 	    hiber_info.secsize, (vaddr_t)&disk_hiber_info, 0))
1091 		panic("error in hibernate read");
1092 
1093 	/*
1094 	 * If the on-disk and in-memory hibernate signatures do not match,
1095 	 * this is not a resume from hibernate; bail out.
1096 	 */
1097 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info)) {
1098 		splx(s);
1099 		return;
1100 	}
1101 
1102 	printf("Unhibernating...\n");
1103 
1104 	/* Read the image from disk into the image (pig) area */
1105 	if (hibernate_read_image(&disk_hiber_info))
1106 		goto fail;
1107 
1108 	if (config_suspend(TAILQ_FIRST(&alldevs), DVACT_QUIESCE) != 0)
1109 		goto fail;
1110 
1111 	(void) splhigh();
1112 	disable_intr();
1113 	cold = 1;
1114 
1115 	if (config_suspend(TAILQ_FIRST(&alldevs), DVACT_SUSPEND) != 0) {
1116 		cold = 0;
1117 		enable_intr();
1118 		goto fail;
1119 	}
1120 
1121 	/* Point of no return ... */
1122 
1123 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1124 	    VM_PROT_ALL);
1125 	pmap_activate(curproc);
1126 
1127 	/* Switch stacks */
1128 	hibernate_switch_stack_machdep();
1129 
1130 	/*
1131 	 * Image is now in high memory (pig area), copy to correct location
1132 	 * in memory. We'll eventually end up copying on top of ourself, but
1133 	 * we are assured the kernel code here is the same between the
1134 	 * hibernated and resuming kernel, and we are running on our own
1135 	 * stack, so the overwrite is ok.
1136 	 */
1137 	hibernate_unpack_image(&disk_hiber_info);
1138 
1139 	/*
1140 	 * Resume the loaded kernel by jumping to the MD resume vector.
1141 	 * We won't be returning from this call.
1142 	 */
1143 	hibernate_resume_machdep();
1144 
1145 fail:
1146 	splx(s);
1147 	printf("Unable to resume hibernated image\n");
1148 }
1149 
1150 /*
1151  * Unpack image from pig area to original location by looping through the
1152  * list of output chunks in the order they should be restored (fchunks).
1153  * This ordering is used to avoid having inflate overwrite a chunk in the
1154  * middle of processing that chunk. This will, of course, happen during the
1155  * final output chunk, where we copy the chunk to the piglet area first,
1156  * before inflating.
1157  */
1158 void
1159 hibernate_unpack_image(union hibernate_info *hiber_info)
1160 {
1161 	struct hibernate_disk_chunk *chunks;
1162 	union hibernate_info local_hiber_info;
1163 	paddr_t image_cur = global_pig_start;
1164 	int *fchunks, i;
1165 	char *pva = (char *)hiber_info->piglet_va;
1166 	struct hibernate_zlib_state *hibernate_state;
1167 
1168 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1169 
1170 	/* Mask off based on arch-specific piglet page size */
1171 	pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1172 	fchunks = (int *)(pva + (6 * PAGE_SIZE));
1173 
1174 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1175 
1176 	/* Can't use hiber_info that's passed in after this point */
1177 	bcopy(hiber_info, &local_hiber_info, sizeof(union hibernate_info));
1178 
1179 	hibernate_activate_resume_pt_machdep();
1180 
1181 	for (i = 0; i < local_hiber_info.chunk_ctr; i++) {
1182 		/* Reset zlib for inflate */
1183 		if (hibernate_zlib_reset(&local_hiber_info, 0) != Z_OK)
1184 			panic("hibernate failed to reset zlib for inflate");
1185 
1186 		hibernate_process_chunk(&local_hiber_info, &chunks[fchunks[i]],
1187 		    image_cur);
1188 
1189 		image_cur += chunks[fchunks[i]].compressed_size;
1190 
1191 	}
1192 }
1193 
1194 /*
1195  * Process a chunk by ensuring its proper placement, followed by unpacking
1196  */
1197 void
1198 hibernate_process_chunk(union hibernate_info *hiber_info,
1199     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1200 {
1201 	char *pva = (char *)hiber_info->piglet_va;
1202 
1203 	/*
1204 	 * If there is a conflict, copy the chunk to the piglet area
1205 	 * before unpacking it to its original location.
1206 	 */
1207 	if ((chunk->flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1208 		hibernate_inflate_region(hiber_info, chunk->base,
1209 		    img_cur, chunk->compressed_size);
1210 	else {
1211 		bcopy((caddr_t)img_cur,
1212 		    pva + (HIBERNATE_CHUNK_SIZE * 2),
1213 		    chunk->compressed_size);
1214 		hibernate_inflate_region(hiber_info, chunk->base,
1215 		    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1216 		    chunk->compressed_size);
1217 	}
1218 }
1219 
1220 /*
1221  * Write a compressed version of this machine's memory to disk, at the
1222  * precalculated swap offset:
1223  *
1224  * end of swap - signature block size - chunk table size - memory size
1225  *
1226  * The function begins by looping through each phys mem range, cutting each
1227  * one into MD sized chunks. These chunks are then compressed individually
1228  * and written out to disk, in phys mem order. Some chunks might compress
1229  * more than others, and for this reason, each chunk's size is recorded
1230  * in the chunk table, which is written to disk after the image has
1231  * properly been compressed and written (in hibernate_write_chunktable).
1232  *
1233  * When this function is called, the machine is nearly suspended - most
1234  * devices are quiesced/suspended, interrupts are off, and cold has
1235  * been set. This means that there can be no side effects once the
1236  * write has started, and the write function itself can also have no
1237  * side effects. This also means no printfs are permitted (since they
1238  * have side effects).
1239  */
1240 int
1241 hibernate_write_chunks(union hibernate_info *hiber_info)
1242 {
1243 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1244 	size_t nblocks, out_remaining, used;
1245 	struct hibernate_disk_chunk *chunks;
1246 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1247 	daddr_t blkctr = hiber_info->image_offset, offset = 0;
1248 	int i;
1249 	struct hibernate_zlib_state *hibernate_state;
1250 
1251 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1252 
1253 	hiber_info->chunk_ctr = 0;
1254 
1255 	/*
1256 	 * Allocate VA for the temp and copy page.
1257 	 * These will become part of the suspended kernel and will
1258 	 * be freed in hibernate_free, upon resume.
1259 	 */
1260 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1261 	    &kp_none, &kd_nowait);
1262 	if (!hibernate_temp_page)
1263 		return (1);
1264 
1265 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1266 	    &kp_none, &kd_nowait);
1267 	if (!hibernate_copy_page)
1268 		return (1);
1269 
1270 	pmap_kenter_pa(hibernate_copy_page,
1271 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1272 
1273 	/* XXX - not needed on all archs */
1274 	pmap_activate(curproc);
1275 
1276 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1277 	    HIBERNATE_CHUNK_SIZE);
1278 
1279 	/* Calculate the chunk regions */
1280 	for (i = 0; i < hiber_info->nranges; i++) {
1281 		range_base = hiber_info->ranges[i].base;
1282 		range_end = hiber_info->ranges[i].end;
1283 
1284 		inaddr = range_base;
1285 
1286 		while (inaddr < range_end) {
1287 			chunks[hiber_info->chunk_ctr].base = inaddr;
1288 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1289 				chunks[hiber_info->chunk_ctr].end = inaddr +
1290 				    HIBERNATE_CHUNK_SIZE;
1291 			else
1292 				chunks[hiber_info->chunk_ctr].end = range_end;
1293 
1294 			inaddr += HIBERNATE_CHUNK_SIZE;
1295 			hiber_info->chunk_ctr++;
1296 		}
1297 	}
1298 
1299 	/* Compress and write the chunks in the chunktable */
1300 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1301 		range_base = chunks[i].base;
1302 		range_end = chunks[i].end;
1303 
1304 		chunks[i].offset = blkctr;
1305 
1306 		/* Reset zlib for deflate */
1307 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1308 			return (1);
1309 
1310 		inaddr = range_base;
1311 
1312 		/*
1313 		 * For each range, loop through its phys mem region
1314 		 * and write out the chunks (the last chunk might be
1315 		 * smaller than the chunk size).
1316 		 */
1317 		while (inaddr < range_end) {
1318 			out_remaining = PAGE_SIZE;
1319 			while (out_remaining > 0 && inaddr < range_end) {
1320 
1321 				/*
1322 				 * Adjust for regions that are not evenly
1323 				 * divisible by PAGE_SIZE or overflowed
1324 				 * pages from the previous iteration.
1325 				 */
1326 				temp_inaddr = (inaddr & PAGE_MASK) +
1327 				    hibernate_copy_page;
1328 
1329 				/* Deflate from temp_inaddr to IO page */
1330 				if (inaddr != range_end) {
1331 					pmap_kenter_pa(hibernate_temp_page,
1332 					    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1333 
1334 					/* XXX - not needed on all archs */
1335 					pmap_activate(curproc);
1336 
1337 					bcopy((caddr_t)hibernate_temp_page,
1338 					    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1339 					inaddr += hibernate_deflate(hiber_info,
1340 					    temp_inaddr, &out_remaining);
1341 				}
1342 
1343 				if (out_remaining == 0) {
1344 					/* Filled up the page */
1345 					nblocks = PAGE_SIZE / hiber_info->secsize;
1346 
1347 					if (hiber_info->io_func(hiber_info->device,
1348 					    blkctr, (vaddr_t)hibernate_io_page,
1349 					    PAGE_SIZE, HIB_W, hiber_info->io_page))
1350 						return (1);
1351 
1352 					blkctr += nblocks;
1353 				}
1354 			}
1355 		}
1356 
1357 		if (inaddr != range_end)
1358 			return (1);
1359 
1360 		/*
1361 		 * End of range. Round up to the next secsize boundary
1362 		 * after finishing the compression.
1363 		 */
1364 		if (out_remaining == 0)
1365 			out_remaining = PAGE_SIZE;
1366 
1367 		/* Finish compress */
1368 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1369 		hibernate_state->hib_stream.avail_in = 0;
1370 		hibernate_state->hib_stream.next_out =
1371 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1372 		hibernate_state->hib_stream.avail_out = out_remaining;
1373 
1374 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1375 		    Z_STREAM_END)
1376 			return (1);
1377 
1378 		out_remaining = hibernate_state->hib_stream.avail_out;
1379 
1380 		used = PAGE_SIZE - out_remaining;
1381 		nblocks = used / hiber_info->secsize;
1382 
1383 		/* Round up to next block if needed */
1384 		if (used % hiber_info->secsize != 0)
1385 			nblocks++;
1386 
1387 		/* Write final block(s) for this chunk */
1388 		if (hiber_info->io_func(hiber_info->device, blkctr,
1389 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1390 		    HIB_W, hiber_info->io_page))
1391 			return (1);
1392 
1393 		blkctr += nblocks;
1394 
1395 		offset = blkctr;
1396 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1397 		    hiber_info->secsize;
1398 	}
1399 
1400 	return (0);
1401 }
1402 
1403 /*
1404  * Reset the zlib stream state and allocate a new hiballoc area for either
1405  * inflate or deflate. This function is called once for each hibernate chunk.
1406  * Calling hiballoc_init multiple times is acceptable since the memory it is
1407  * provided is unmanaged memory (stolen). We use the memory provided to us
1408  * by the piglet allocated via the supplied hiber_info.
1409  */
1410 int
1411 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1412 {
1413 	vaddr_t hibernate_zlib_start;
1414 	size_t hibernate_zlib_size;
1415 	char *pva = (char *)hiber_info->piglet_va;
1416 	struct hibernate_zlib_state *hibernate_state;
1417 
1418 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1419 
1420 	if (!deflate)
1421 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1422 
1423 	hibernate_zlib_start = (vaddr_t)(pva + (8 * PAGE_SIZE));
1424 	hibernate_zlib_size = 80 * PAGE_SIZE;
1425 
1426 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1427 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1428 
1429 	/* Set up stream structure */
1430 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1431 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1432 
1433 	/* Initialize the hiballoc arena for zlib allocs/frees */
1434 	hiballoc_init(&hibernate_state->hiballoc_arena,
1435 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1436 
1437 	if (deflate) {
1438 		return deflateInit(&hibernate_state->hib_stream,
1439 		    Z_BEST_SPEED);
1440 	} else
1441 		return inflateInit(&hibernate_state->hib_stream);
1442 }
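
/*
 * Rough map of how this file carves up the 3 * HIBERNATE_CHUNK_SIZE
 * piglet (offsets gathered from the functions in this file; a sketch,
 * not a formal layout definition):
 *
 *	piglet + 0				I/O page passed to the MD io_func
 *	piglet + 1 * PAGE_SIZE			deflate output page
 *	piglet + 3 * PAGE_SIZE			copy page for chunk writing
 *	piglet + 4..6 * PAGE_SIZE		chunk ordering arrays
 *	piglet + 8 * PAGE_SIZE			80-page zlib/hiballoc arena (above)
 *	piglet + 1 * HIBERNATE_CHUNK_SIZE	chunk table
 *	piglet + 2 * HIBERNATE_CHUNK_SIZE	conflict-copy staging area
 */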
1443 
1444 /*
1445  * Reads the hibernated memory image from disk, whose location and
1446  * size are recorded in hiber_info. Begin by reading the persisted
1447  * chunk table, which records the original chunk placement location
1448  * and compressed size for each. Next, allocate a pig region of
1449  * sufficient size to hold the compressed image. Then read the
1450  * chunks into the pig area (calling hibernate_read_chunks to do this),
1451  * and finally, if all of the above succeeds, clear the hibernate signature.
1452  * The function will then return to hibernate_resume, which will proceed
1453  * to unpack the pig image to the correct place in memory.
1454  */
1455 int
1456 hibernate_read_image(union hibernate_info *hiber_info)
1457 {
1458 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1459 	paddr_t image_start, image_end, pig_start, pig_end;
1460 	struct hibernate_disk_chunk *chunks;
1461 	daddr_t blkctr;
1462 	vaddr_t chunktable = (vaddr_t)NULL;
1463 	paddr_t piglet_chunktable = hiber_info->piglet_pa +
1464 	    HIBERNATE_CHUNK_SIZE;
1465 	int i;
1466 
1467 	pmap_activate(curproc);
1468 
1469 	/* Calculate total chunk table size in disk blocks */
1470 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1471 
1472 	blkctr = hiber_info->sig_offset - chunktable_size -
1473 			hiber_info->swap_offset;
1474 
1475 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1476 	    &kp_none, &kd_nowait);
1477 
1478 	if (!chunktable)
1479 		return (1);
1480 
1481 	/* Read the chunktable from disk into the piglet chunktable */
1482 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1483 	    i += PAGE_SIZE, blkctr += PAGE_SIZE/hiber_info->secsize) {
1484 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i, VM_PROT_ALL);
1485 		pmap_update(pmap_kernel());
1486 		hibernate_block_io(hiber_info, blkctr, PAGE_SIZE,
1487 		    chunktable + i, 0);
1488 	}
1489 
1490 	blkctr = hiber_info->image_offset;
1491 	compressed_size = 0;
1492 
1493 	chunks = (struct hibernate_disk_chunk *)chunktable;
1494 
1495 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1496 		compressed_size += chunks[i].compressed_size;
1497 
1498 	disk_size = compressed_size;
1499 
1500 	/* Allocate the pig area */
1501 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1502 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1503 		return (1);
1504 
1505 	pig_end = pig_start + pig_sz;
1506 
1507 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1508 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1509 	image_start = pig_start;
1510 
1511 	image_start = image_end - disk_size;
1512 
1513 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size,
1514 	    chunks);
1515 
1516 	pmap_kremove(chunktable, PAGE_SIZE);
1517 	pmap_update(pmap_kernel());
1518 
1519 	/* Prepare the resume time pmap/page table */
1520 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1521 
1522 	/* Read complete, clear the signature and return */
1523 	return hibernate_clear_signature();
1524 }
1525 
1526 /*
1527  * Read the hibernated memory chunks from disk (chunk information at this
1528  * point is stored in the piglet) into the pig area specified by
1529  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1530  * only chunk with overlap possibilities.
1531  */
1532 int
1533 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1534     paddr_t pig_end, size_t image_compr_size,
1535     struct hibernate_disk_chunk *chunks)
1536 {
1537 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1538 	paddr_t copy_start, copy_end, piglet_cur;
1539 	paddr_t piglet_base = hib_info->piglet_pa;
1540 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1541 	daddr_t blkctr;
1542 	size_t processed, compressed_size, read_size;
1543 	int i, j, overlap, found, nchunks;
1544 	int nochunks = 0, nfchunks = 0, npchunks = 0;
1545 	int *ochunks, *pchunks, *fchunks;
1546 	vaddr_t tempva = (vaddr_t)NULL, hibernate_fchunk_area = (vaddr_t)NULL;
1547 
1548 	global_pig_start = pig_start;
1549 
1550 	/* XXX - don't need this on all archs */
1551 	pmap_activate(curproc);
1552 
1553 	/*
1554 	 * These mappings go into the resuming kernel's page table, and are
1555 	 * used only during image read. They disappear from existence
1556 	 * when the suspended kernel is unpacked on top of us.
1557 	 */
1558 	tempva = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
1559 	if (!tempva)
1560 		return (1);
1561 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1562 	    &kp_none, &kd_nowait);
1563 	if (!hibernate_fchunk_area)
1564 		return (1);
1565 
1566 	/* Temporary output chunk ordering VA */
1567 	ochunks = (int *)hibernate_fchunk_area;
1568 
1569 	/* Piglet chunk ordering VA */
1570 	pchunks = (int *)(hibernate_fchunk_area + PAGE_SIZE);
1571 
1572 	/* Final chunk ordering VA */
1573 	fchunks = (int *)(hibernate_fchunk_area + (2*PAGE_SIZE));
1574 
1575 	/* Map the chunk ordering region */
1576 	pmap_kenter_pa(hibernate_fchunk_area,
1577 	    piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1578 	pmap_update(pmap_kernel());
1579 	pmap_kenter_pa((vaddr_t)pchunks, piglet_base + (5*PAGE_SIZE),
1580 	    VM_PROT_ALL);
1581 	pmap_update(pmap_kernel());
1582 	pmap_kenter_pa((vaddr_t)fchunks, piglet_base + (6*PAGE_SIZE),
1583 	    VM_PROT_ALL);
1584 	pmap_update(pmap_kernel());
1585 
1586 	nchunks = hib_info->chunk_ctr;
1587 
1588 	/* Initially start all chunks as unplaced */
1589 	for (i = 0; i < nchunks; i++)
1590 		chunks[i].flags = 0;
1591 
1592 	/*
1593 	 * Search the list for chunks that are outside the pig area. These
1594 	 * can be placed first in the final output list.
1595 	 */
1596 	for (i = 0; i < nchunks; i++) {
1597 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1598 			ochunks[nochunks] = i;
1599 			fchunks[nfchunks] = i;
1600 			nochunks++;
1601 			nfchunks++;
1602 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1603 		}
1604 	}
1605 
1606 	/*
1607 	 * Walk the ordering, place the chunks in ascending memory order.
1608 	 * Conflicts might arise; these are handled next.
1609 	 */
1610 	do {
1611 		img_index = -1;
1612 		found = 0;
1613 		j = -1;
1614 		for (i = 0; i < nchunks; i++)
1615 			if (chunks[i].base < img_index &&
1616 			    chunks[i].flags == 0) {
1617 				j = i;
1618 				img_index = chunks[i].base;
1619 			}
1620 
1621 		if (j != -1) {
1622 			found = 1;
1623 			ochunks[nochunks] = (short)j;
1624 			nochunks++;
1625 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1626 		}
1627 	} while (found);
1628 
1629 	img_index = pig_start;
1630 
1631 	/*
1632 	 * Identify chunk output conflicts (chunks whose pig load area
1633 	 * corresponds to their original memory placement location)
1634 	 */
1635 	for (i = 0; i < nochunks ; i++) {
1636 		overlap = 0;
1637 		r1s = img_index;
1638 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1639 		r2s = chunks[ochunks[i]].base;
1640 		r2e = chunks[ochunks[i]].end;
1641 
1642 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1643 		if (overlap)
1644 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1645 		img_index += chunks[ochunks[i]].compressed_size;
1646 	}
1647 
1648 	/*
1649 	 * Prepare the final output chunk list. Calculate an output
1650 	 * inflate strategy for overlapping chunks if needed.
1651 	 */
1652 	img_index = pig_start;
1653 	for (i = 0; i < nochunks ; i++) {
1654 		/*
1655 		 * If a conflict is detected, consume enough compressed
1656 		 * output chunks to fill the piglet
1657 		 */
1658 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1659 			copy_start = piglet_base;
1660 			copy_end = piglet_end;
1661 			piglet_cur = piglet_base;
1662 			npchunks = 0;
1663 			j = i;
1664 
1665 			while (copy_start < copy_end && j < nochunks) {
1666 				piglet_cur += chunks[ochunks[j]].compressed_size;
1667 				pchunks[npchunks] = ochunks[j];
1668 				npchunks++;
1669 				copy_start += chunks[ochunks[j]].compressed_size;
1670 				img_index += chunks[ochunks[j]].compressed_size;
1671 				i++;
1672 				j++;
1673 			}
1674 
1675 			piglet_cur = piglet_base;
1676 			for (j = 0; j < npchunks; j++) {
1677 				piglet_cur += chunks[pchunks[j]].compressed_size;
1678 				fchunks[nfchunks] = pchunks[j];
1679 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1680 				nfchunks++;
1681 			}
1682 		} else {
1683 			/*
1684 			 * No conflict, chunk can be added without copying
1685 			 */
1686 			if ((chunks[ochunks[i]].flags &
1687 			    HIBERNATE_CHUNK_USED) == 0) {
1688 				fchunks[nfchunks] = ochunks[i];
1689 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1690 				nfchunks++;
1691 			}
1692 			img_index += chunks[ochunks[i]].compressed_size;
1693 		}
1694 	}
1695 
1696 	img_index = pig_start;
1697 	for (i = 0; i < nfchunks; i++) {
1698 		piglet_cur = piglet_base;
1699 		img_index += chunks[fchunks[i]].compressed_size;
1700 	}
1701 
1702 	img_cur = pig_start;
1703 
1704 	for (i = 0; i < nfchunks; i++) {
1705 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1706 		processed = 0;
1707 		compressed_size = chunks[fchunks[i]].compressed_size;
1708 
1709 		while (processed < compressed_size) {
1710 			pmap_kenter_pa(tempva, img_cur, VM_PROT_ALL);
1711 			pmap_kenter_pa(tempva + PAGE_SIZE, img_cur+PAGE_SIZE,
1712 			    VM_PROT_ALL);
1713 			pmap_update(pmap_kernel());
1714 
1715 			if (compressed_size - processed >= PAGE_SIZE)
1716 				read_size = PAGE_SIZE;
1717 			else
1718 				read_size = compressed_size - processed;
1719 
1720 			hibernate_block_io(hib_info, blkctr, read_size,
1721 			    tempva + (img_cur & PAGE_MASK), 0);
1722 
1723 			blkctr += (read_size / hib_info->secsize);
1724 
1725 			hibernate_flush();
1726 			pmap_kremove(tempva, PAGE_SIZE);
1727 			pmap_kremove(tempva + PAGE_SIZE, PAGE_SIZE);
1728 			processed += read_size;
1729 			img_cur += read_size;
1730 		}
1731 	}
1732 
1733 	pmap_kremove(hibernate_fchunk_area, PAGE_SIZE);
1734 	pmap_kremove((vaddr_t)pchunks, PAGE_SIZE);
1735 	pmap_kremove((vaddr_t)fchunks, PAGE_SIZE);
1736 	pmap_update(pmap_kernel());
1737 
1738 	return (0);
1739 }
1740 
1741 /*
1742  * Hibernating a machine comprises the following operations:
1743  *  1. Calculating this machine's hibernate_info information
1744  *  2. Allocating a piglet and saving the piglet's physaddr
1745  *  3. Calculating the memory chunks
1746  *  4. Writing the compressed chunks to disk
1747  *  5. Writing the chunk table
1748  *  6. Writing the signature block (hibernate_info)
1749  *
1750  * On most architectures, the function calling hibernate_suspend would
1751  * then power off the machine using some MD-specific implementation.
1752  */
1753 int
1754 hibernate_suspend(void)
1755 {
1756 	union hibernate_info hib_info;
1757 	size_t swap_size;
1758 
1759 	/*
1760 	 * Calculate memory ranges, swap offsets, etc.
1761 	 * This also allocates a piglet whose physaddr is stored in
1762 	 * hib_info.piglet_pa and whose vaddr is stored in hib_info.piglet_va.
1763 	 */
1764 	if (get_hibernate_info(&hib_info, 1))
1765 		return (1);
1766 
1767 	swap_size = hib_info.image_size + hib_info.secsize +
1768 		HIBERNATE_CHUNK_TABLE_SIZE;
1769 
1770 	if (uvm_swap_check_range(hib_info.device, swap_size)) {
1771 		printf("insufficient swap space for hibernate\n");
1772 		return (1);
1773 	}
1774 
1775 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1776 		VM_PROT_ALL);
1777 	pmap_activate(curproc);
1778 
1779 	/* Stash the piglet VA so we can free it in the resuming kernel */
1780 	global_piglet_va = hib_info.piglet_va;
1781 
1782 	if (hibernate_write_chunks(&hib_info))
1783 		return (1);
1784 
1785 	if (hibernate_write_chunktable(&hib_info))
1786 		return (1);
1787 
1788 	if (hibernate_write_signature(&hib_info))
1789 		return (1);
1790 
1791 	delay(500000);
1792 	return (0);
1793 }
1794 
1795 /*
1796  * Free items allocated by hibernate_suspend()
1797  */
1798 void
1799 hibernate_free(void)
1800 {
1801 	if (global_piglet_va)
1802 		uvm_pmr_free_piglet(global_piglet_va,
1803 		    3*HIBERNATE_CHUNK_SIZE);
1804 
1805 	if (hibernate_copy_page)
1806 		pmap_kremove(hibernate_copy_page, PAGE_SIZE);
1807 	if (hibernate_temp_page)
1808 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1809 
1810 	pmap_update(pmap_kernel());
1811 
1812 	if (hibernate_copy_page)
1813 		km_free((void *)hibernate_copy_page, PAGE_SIZE,
1814 		    &kv_any, &kp_none);
1815 	if (hibernate_temp_page)
1816 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1817 		    &kv_any, &kp_none);
1818 
1819 	global_piglet_va = 0;
1820 	hibernate_copy_page = 0;
1821 	hibernate_temp_page = 0;
1822 }
1823