xref: /openbsd-src/sys/kern/subr_hibernate.c (revision ca1a7c3ebed9e35986df313163e16caa6fc038df)
1 /*	$OpenBSD: subr_hibernate.c,v 1.40 2012/07/09 09:47:42 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <machine/hibernate.h>
34 
35 /* Temporary vaddr ranges used during hibernate */
36 vaddr_t hibernate_temp_page;
37 vaddr_t hibernate_copy_page;
38 
39 /* Hibernate info as read from disk during resume */
40 union hibernate_info disk_hiber_info;
41 paddr_t global_pig_start;
42 vaddr_t global_piglet_va;
43 
44 /*
45  * Hib alloc enforced alignment.
46  */
47 #define HIB_ALIGN		8 /* bytes alignment */
48 
49 /*
50  * sizeof builtin operation, but with alignment constraint.
51  */
52 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
53 
54 struct hiballoc_entry {
55 	size_t			hibe_use;
56 	size_t			hibe_space;
57 	RB_ENTRY(hiballoc_entry) hibe_entry;
58 };
59 
60 /*
61  * Compare hiballoc entries based on the address they manage.
62  *
63  * Since the managed address is at a fixed offset relative to the
64  * struct hiballoc_entry, we just compare the hiballoc_entry pointers.
65  */
66 static __inline int
67 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
68 {
69 	return l < r ? -1 : (l > r);
70 }
71 
72 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
73 
74 /*
75  * Given a hiballoc entry, return the address it manages.
76  */
77 static __inline void *
78 hib_entry_to_addr(struct hiballoc_entry *entry)
79 {
80 	caddr_t addr;
81 
82 	addr = (caddr_t)entry;
83 	addr += HIB_SIZEOF(struct hiballoc_entry);
84 	return addr;
85 }
86 
87 /*
88  * Given an address, find the hiballoc entry that corresponds to it.
89  */
90 static __inline struct hiballoc_entry*
91 hib_addr_to_entry(void *addr_param)
92 {
93 	caddr_t addr;
94 
95 	addr = (caddr_t)addr_param;
96 	addr -= HIB_SIZEOF(struct hiballoc_entry);
97 	return (struct hiballoc_entry*)addr;
98 }
99 
100 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
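
/*
 * Layout sketch of an arena entry and the space it manages (derived from
 * hib_entry_to_addr/hib_addr_to_entry above and hib_alloc below):
 *
 *	+-------------------------+----------------+------------------+
 *	| struct hiballoc_entry   | hibe_use bytes | hibe_space bytes |
 *	| (HIB_SIZEOF'd)          | (allocation)   | (free tail)      |
 *	+-------------------------+----------------+------------------+
 *	^ entry                   ^ hib_entry_to_addr(entry)
 *
 * A new allocation carves its own entry out of the free tail of an
 * existing entry, so hib_addr_to_entry() is simply the inverse of the
 * pointer arithmetic above.
 */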
101 
102 /*
103  * Allocate memory from the arena.
104  *
105  * Returns NULL if no memory is available.
106  */
107 void *
108 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
109 {
110 	struct hiballoc_entry *entry, *new_entry;
111 	size_t find_sz;
112 
113 	/*
114 	 * Enforce alignment of HIB_ALIGN bytes.
115 	 *
116 	 * Note that, because the entry is put in front of the allocation,
117 	 * 0-byte allocations are guaranteed a unique address.
118 	 */
119 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
120 
121 	/*
122 	 * Find an entry with hibe_space >= find_sz.
123 	 *
124 	 * If the root node is not large enough, we switch to tree traversal.
125 	 * Because all entries are made at the bottom of the free space,
126 	 * traversal from the end has a slightly better chance of yielding
127 	 * a sufficiently large space.
128 	 */
129 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
130 	entry = RB_ROOT(&arena->hib_addrs);
131 	if (entry != NULL && entry->hibe_space < find_sz) {
132 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
133 			if (entry->hibe_space >= find_sz)
134 				break;
135 		}
136 	}
137 
138 	/*
139 	 * Insufficient or too fragmented memory.
140 	 */
141 	if (entry == NULL)
142 		return NULL;
143 
144 	/*
145 	 * Create new entry in allocated space.
146 	 */
147 	new_entry = (struct hiballoc_entry*)(
148 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
149 	new_entry->hibe_space = entry->hibe_space - find_sz;
150 	new_entry->hibe_use = alloc_sz;
151 
152 	/*
153 	 * Insert entry.
154 	 */
155 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
156 		panic("hib_alloc: insert failure");
157 	entry->hibe_space = 0;
158 
159 	/* Return address managed by entry. */
160 	return hib_entry_to_addr(new_entry);
161 }
162 
163 /*
164  * Free a pointer previously allocated from this arena.
165  *
166  * If addr is NULL, this will be silently accepted.
167  */
168 void
169 hib_free(struct hiballoc_arena *arena, void *addr)
170 {
171 	struct hiballoc_entry *entry, *prev;
172 
173 	if (addr == NULL)
174 		return;
175 
176 	/*
177 	 * Derive entry from addr and check it is really in this arena.
178 	 */
179 	entry = hib_addr_to_entry(addr);
180 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
181 		panic("hib_free: freed item %p not in hib arena", addr);
182 
183 	/*
184 	 * Give the space in entry to its predecessor.
185 	 *
186 	 * If entry has no predecessor, change its used space into free space
187 	 * instead.
188 	 */
189 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
190 	if (prev != NULL &&
191 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
192 	    prev->hibe_use + prev->hibe_space) == entry) {
193 		/* Merge entry. */
194 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
195 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
196 		    entry->hibe_use + entry->hibe_space;
197 	} else {
198 		/* Flip used memory to free space. */
199 		entry->hibe_space += entry->hibe_use;
200 		entry->hibe_use = 0;
201 	}
202 }
203 
204 /*
205  * Initialize hiballoc.
206  *
207  * The allocator will manage memory at ptr, which is len bytes in size.
208  */
209 int
210 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
211 {
212 	struct hiballoc_entry *entry;
213 	caddr_t ptr;
214 	size_t len;
215 
216 	RB_INIT(&arena->hib_addrs);
217 
218 	/*
219 	 * Hib allocator enforces HIB_ALIGN alignment.
220 	 * Fixup ptr and len.
221 	 */
222 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
223 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
224 	len &= ~((size_t)HIB_ALIGN - 1);
225 
226 	/*
227 	 * Insufficient memory to be able to allocate and also do bookkeeping.
228 	 */
229 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
230 		return ENOMEM;
231 
232 	/*
233 	 * Create entry describing space.
234 	 */
235 	entry = (struct hiballoc_entry*)ptr;
236 	entry->hibe_use = 0;
237 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
238 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
239 
240 	return 0;
241 }
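
/*
 * Usage sketch (a minimal example; the real caller in this file is
 * hibernate_zlib_reset, which hands the arena to zlib): `scratch' is
 * assumed to point at `scratch_len' bytes of otherwise unmanaged memory.
 *
 *	struct hiballoc_arena arena;
 *	void *buf;
 *
 *	if (hiballoc_init(&arena, scratch, scratch_len) == 0) {
 *		buf = hib_alloc(&arena, 128);
 *		if (buf != NULL)
 *			hib_free(&arena, buf);
 *	}
 */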
242 
243 /*
244  * Zero all free memory.
245  */
246 void
247 uvm_pmr_zero_everything(void)
248 {
249 	struct uvm_pmemrange	*pmr;
250 	struct vm_page		*pg;
251 	int			 i;
252 
253 	uvm_lock_fpageq();
254 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
255 		/* Zero single pages. */
256 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
257 		    != NULL) {
258 			uvm_pmr_remove(pmr, pg);
259 			uvm_pagezero(pg);
260 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
261 			uvmexp.zeropages++;
262 			uvm_pmr_insert(pmr, pg, 0);
263 		}
264 
265 		/* Zero multi page ranges. */
266 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
267 		    != NULL) {
268 			pg--; /* Size tree always has second page. */
269 			uvm_pmr_remove(pmr, pg);
270 			for (i = 0; i < pg->fpgsz; i++) {
271 				uvm_pagezero(&pg[i]);
272 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
273 				uvmexp.zeropages++;
274 			}
275 			uvm_pmr_insert(pmr, pg, 0);
276 		}
277 	}
278 	uvm_unlock_fpageq();
279 }
280 
281 /*
282  * Mark all memory as dirty.
283  *
284  * Used to inform the system that pages marked as zeroed can no longer
285  * be trusted to be zeroed, e.g. because we just came back from hibernate.
286  */
287 void
288 uvm_pmr_dirty_everything(void)
289 {
290 	struct uvm_pmemrange	*pmr;
291 	struct vm_page		*pg;
292 	int			 i;
293 
294 	uvm_lock_fpageq();
295 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
296 		/* Dirty single pages. */
297 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
298 		    != NULL) {
299 			uvm_pmr_remove(pmr, pg);
300 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
301 			uvm_pmr_insert(pmr, pg, 0);
302 		}
303 
304 		/* Dirty multi page ranges. */
305 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
306 		    != NULL) {
307 			pg--; /* Size tree always has second page. */
308 			uvm_pmr_remove(pmr, pg);
309 			for (i = 0; i < pg->fpgsz; i++)
310 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
311 			uvm_pmr_insert(pmr, pg, 0);
312 		}
313 	}
314 
315 	uvmexp.zeropages = 0;
316 	uvm_unlock_fpageq();
317 }
318 
319 /*
320  * Allocate the highest address that can hold sz.
321  *
322  * sz in bytes.
323  */
324 int
325 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
326 {
327 	struct uvm_pmemrange	*pmr;
328 	struct vm_page		*pig_pg, *pg;
329 
330 	/*
331 	 * Convert sz to pages, since that is what pmemrange uses internally.
332 	 */
333 	sz = atop(round_page(sz));
334 
335 	uvm_lock_fpageq();
336 
337 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
338 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
339 			if (pig_pg->fpgsz >= sz) {
340 				goto found;
341 			}
342 		}
343 	}
344 
345 	/*
346 	 * Allocation failure.
347 	 */
348 	uvm_unlock_fpageq();
349 	return ENOMEM;
350 
351 found:
352 	/* Remove page from freelist. */
353 	uvm_pmr_remove_size(pmr, pig_pg);
354 	pig_pg->fpgsz -= sz;
355 	pg = pig_pg + pig_pg->fpgsz;
356 	if (pig_pg->fpgsz == 0)
357 		uvm_pmr_remove_addr(pmr, pig_pg);
358 	else
359 		uvm_pmr_insert_size(pmr, pig_pg);
360 
361 	uvmexp.free -= sz;
362 	*addr = VM_PAGE_TO_PHYS(pg);
363 
364 	/*
365 	 * Update pg flags.
366 	 *
367 	 * Note that we trash the sz argument now.
368 	 */
369 	while (sz > 0) {
370 		KASSERT(pg->pg_flags & PQ_FREE);
371 
372 		atomic_clearbits_int(&pg->pg_flags,
373 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
374 
375 		if (pg->pg_flags & PG_ZERO)
376 			uvmexp.zeropages -= sz;
377 		atomic_clearbits_int(&pg->pg_flags,
378 		    PG_ZERO|PQ_FREE);
379 
380 		pg->uobject = NULL;
381 		pg->uanon = NULL;
382 		pg->pg_version++;
383 
384 		/*
385 		 * Next.
386 		 */
387 		pg++;
388 		sz--;
389 	}
390 
391 	/* Return. */
392 	uvm_unlock_fpageq();
393 	return 0;
394 }
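
/*
 * Usage sketch (mirrors hibernate_read_image below, where compressed_size
 * is the total size of the image about to be read): pass the desired size
 * in bytes and receive the physical start address on success.
 *
 *	paddr_t pig_start;
 *	psize_t pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
 *
 *	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
 *		return (1);
 */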
395 
396 /*
397  * Allocate a piglet area.
398  *
399  * The piglet is allocated as low in physical memory as possible and is
400  * aligned to the requested alignment.
401  *
402  * sz and align in bytes.
403  *
404  * The call may sleep while the pagedaemon attempts to free memory.
405  * The pagedaemon may decide it is not possible to free enough memory, causing
406  * the allocation to fail.
407  */
408 int
409 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
410 {
411 	paddr_t			 pg_addr, piglet_addr;
412 	struct uvm_pmemrange	*pmr;
413 	struct vm_page		*pig_pg, *pg;
414 	struct pglist		 pageq;
415 	int			 pdaemon_woken;
416 	vaddr_t			 piglet_va;
417 
418 	KASSERT((align & (align - 1)) == 0);
419 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
420 
421 	/*
422 	 * Fixup arguments: align must be at least PAGE_SIZE,
423 	 * sz will be converted to pagecount, since that is what
424 	 * pmemrange uses internally.
425 	 */
426 	if (align < PAGE_SIZE)
427 		align = PAGE_SIZE;
428 	sz = round_page(sz);
429 
430 	uvm_lock_fpageq();
431 
432 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
433 	    pmr_use) {
434 retry:
435 		/*
436 		 * Search for a range with enough space.
437 		 * Use the address tree, to ensure the range is as low as
438 		 * possible.
439 		 */
440 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
441 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
442 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
443 
444 			if (atop(pg_addr) + pig_pg->fpgsz >=
445 			    atop(piglet_addr) + atop(sz))
446 				goto found;
447 		}
448 	}
449 
450 	/*
451 	 * Try to coerce the pagedaemon into freeing memory
452 	 * for the piglet.
453 	 *
454 	 * pdaemon_woken is set to prevent the code from
455 	 * falling into an endless loop.
456 	 */
457 	if (!pdaemon_woken) {
458 		pdaemon_woken = 1;
459 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
460 		    sz, UVM_PLA_FAILOK) == 0)
461 			goto retry;
462 	}
463 
464 	/* Return failure. */
465 	uvm_unlock_fpageq();
466 	return ENOMEM;
467 
468 found:
469 	/*
470 	 * Extract piglet from pigpen.
471 	 */
472 	TAILQ_INIT(&pageq);
473 	uvm_pmr_extract_range(pmr, pig_pg,
474 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
475 
476 	*pa = piglet_addr;
477 	uvmexp.free -= atop(sz);
478 
479 	/*
480 	 * Update pg flags.
481 	 *
482 	 * Note that we trash the sz argument now.
483 	 */
484 	TAILQ_FOREACH(pg, &pageq, pageq) {
485 		KASSERT(pg->pg_flags & PQ_FREE);
486 
487 		atomic_clearbits_int(&pg->pg_flags,
488 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
489 
490 		if (pg->pg_flags & PG_ZERO)
491 			uvmexp.zeropages--;
492 		atomic_clearbits_int(&pg->pg_flags,
493 		    PG_ZERO|PQ_FREE);
494 
495 		pg->uobject = NULL;
496 		pg->uanon = NULL;
497 		pg->pg_version++;
498 	}
499 
500 	uvm_unlock_fpageq();
501 
502 	/*
503 	 * Now allocate a va.
504 	 * Use direct mappings for the pages.
505 	 */
506 
507 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
508 	if (!piglet_va) {
509 		uvm_pglistfree(&pageq);
510 		return ENOMEM;
511 	}
512 
513 	/*
514 	 * Map piglet to va.
515 	 */
516 	TAILQ_FOREACH(pg, &pageq, pageq) {
517 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
518 		piglet_va += PAGE_SIZE;
519 	}
520 	pmap_update(pmap_kernel());
521 
522 	return 0;
523 }
524 
525 /*
526  * Free a piglet area.
527  */
528 void
529 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
530 {
531 	paddr_t			 pa;
532 	struct vm_page		*pg;
533 
534 	/*
535 	 * Fix parameters.
536 	 */
537 	sz = round_page(sz);
538 
539 	/*
540 	 * Find the first page in piglet.
541 	 * Since piglets are contiguous, the first pg is all we need.
542 	 */
543 	if (!pmap_extract(pmap_kernel(), va, &pa))
544 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
545 	pg = PHYS_TO_VM_PAGE(pa);
546 	if (pg == NULL)
547 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
548 
549 	/*
550 	 * Unmap.
551 	 */
552 	pmap_kremove(va, sz);
553 	pmap_update(pmap_kernel());
554 
555 	/*
556 	 * Free the physical and virtual memory.
557 	 */
558 	uvm_pmr_freepages(pg, atop(sz));
559 	km_free((void *)va, sz, &kv_any, &kp_none);
560 }
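
/*
 * Usage sketch (this is the pairing used by get_hibernate_info and
 * hibernate_free in this file): allocate a piglet three chunks long,
 * aligned to HIBERNATE_CHUNK_SIZE, and later free it with the same size.
 *
 *	vaddr_t piglet_va;
 *	paddr_t piglet_pa;
 *
 *	if (uvm_pmr_alloc_piglet(&piglet_va, &piglet_pa,
 *	    HIBERNATE_CHUNK_SIZE * 3, HIBERNATE_CHUNK_SIZE) != 0)
 *		return (1);
 *	...
 *	uvm_pmr_free_piglet(piglet_va, HIBERNATE_CHUNK_SIZE * 3);
 */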
561 
562 /*
563  * Physmem RLE compression support.
564  *
565  * Given a physical page address, it will return the number of pages
566  * starting at that address that are free.  Clamps to the number of pages in
567  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
568  */
569 int
570 uvm_page_rle(paddr_t addr)
571 {
572 	struct vm_page		*pg, *pg_end;
573 	struct vm_physseg	*vmp;
574 	int			 pseg_idx, off_idx;
575 
576 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
577 	if (pseg_idx == -1)
578 		return 0;
579 
580 	vmp = &vm_physmem[pseg_idx];
581 	pg = &vmp->pgs[off_idx];
582 	if (!(pg->pg_flags & PQ_FREE))
583 		return 0;
584 
585 	/*
586 	 * Search for the first non-free page after pg.
587 	 * Note that the page may not be the first page in a free pmemrange,
588 	 * therefore pg->fpgsz cannot be used.
589 	 */
590 	for (pg_end = pg; pg_end <= vmp->lastpg &&
591 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
592 		;
593 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
594 }
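
/*
 * Example: if the pages at addr, addr + PAGE_SIZE and addr + 2 * PAGE_SIZE
 * are free but the fourth page is not, uvm_page_rle(addr) returns 3.
 * hibernate_write_chunks encodes that count into the compressed stream and
 * hibernate_get_next_rle reads it back on resume, so runs of free pages
 * never need to be stored in the image itself.
 */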
595 
596 /*
597  * Fills out the hibernate_info union pointed to by hiber_info
598  * with information about this machine (swap signature block
599  * offsets, number of memory ranges, kernel in use, etc)
600  */
601 int
602 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
603 {
604 	int chunktable_size;
605 	struct disklabel dl;
606 	char err_string[128], *dl_ret;
607 
608 	/* Determine I/O function to use */
609 	hiber_info->io_func = get_hibernate_io_function();
610 	if (hiber_info->io_func == NULL)
611 		return (1);
612 
613 	/* Calculate hibernate device */
614 	hiber_info->device = swdevt[0].sw_dev;
615 
616 	/* Read disklabel (used to calculate signature and image offsets) */
617 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
618 
619 	if (dl_ret) {
620 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
621 		return (1);
622 	}
623 
624 	hiber_info->secsize = dl.d_secsize;
625 
626 	/* Make sure the signature can fit in one block */
627 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
628 
629 	/* Calculate swap offset from start of disk */
630 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
631 
632 	/* Calculate signature block location */
633 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
634 	    dl.d_partitions[1].p_size -
635 	    sizeof(union hibernate_info)/hiber_info->secsize;
636 
637 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
638 
639 	/* Stash kernel version information */
640 	bzero(&hiber_info->kernel_version, 128);
641 	bcopy(version, &hiber_info->kernel_version,
642 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
643 
644 	if (suspend) {
645 		/* Allocate piglet region */
646 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
647 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
648 		    HIBERNATE_CHUNK_SIZE)) {
649 			printf("Hibernate failed to allocate the piglet\n");
650 			return (1);
651 		}
652 		hiber_info->io_page = (void *)hiber_info->piglet_va;
653 
654 		/*
655 		 * Initialize the hibernate I/O function (for drivers which
656 		 * need it).
657 		 */
658 		if (hiber_info->io_func(hiber_info->device, 0,
659 		    (vaddr_t)NULL, 0, HIB_INIT, hiber_info->io_page))
660 			goto fail;
661 
662 	} else {
663 		/*
664 		 * Resuming kernels use a regular I/O page since we won't
665 		 * have access to the suspended kernel's piglet VA at this
666 		 * point. No need to free this I/O page as it will vanish
667 		 * as part of the resume.
668 		 */
669 		hiber_info->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
670 		if (!hiber_info->io_page)
671 			return (1);
672 	}
673 
674 
675 	if (get_hibernate_info_md(hiber_info))
676 		goto fail;
677 
678 	/* Calculate memory image location */
679 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
680 	    dl.d_partitions[1].p_size -
681 	    (hiber_info->image_size / hiber_info->secsize) -
682 	    sizeof(union hibernate_info)/hiber_info->secsize -
683 	    chunktable_size;
684 
685 	return (0);
686 fail:
687 	if (suspend)
688 		uvm_pmr_free_piglet(hiber_info->piglet_va, HIBERNATE_CHUNK_SIZE*3);
689 
690 	return (1);
691 }
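
/*
 * Resulting layout at the end of the swap partition, in disk blocks
 * (derived from the sig_offset and image_offset calculations above; the
 * signature occupies a single block, see the KASSERT above):
 *
 *	[ ...free swap... | compressed image | chunk table | signature ]
 *	                  ^                  ^             ^
 *	                  image_offset       sig_offset -  sig_offset
 *	                                     chunktable_size
 */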
692 
693 /*
694  * Allocate nitems*size bytes from the hiballoc area presently in use
695  */
696 void *
697 hibernate_zlib_alloc(void *unused, int nitems, int size)
698 {
699 	struct hibernate_zlib_state *hibernate_state;
700 
701 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
702 
703 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
704 }
705 
706 /*
707  * Free the memory pointed to by addr in the hiballoc area presently in
708  * use
709  */
710 void
711 hibernate_zlib_free(void *unused, void *addr)
712 {
713 	struct hibernate_zlib_state *hibernate_state;
714 
715 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
716 
717 	hib_free(&hibernate_state->hiballoc_arena, addr);
718 }
719 
720 /*
721  * Gets the next RLE value from the image stream
722  */
723 int
724 hibernate_get_next_rle(void)
725 {
726 	int rle, i;
727 	struct hibernate_zlib_state *hibernate_state;
728 
729 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
730 
731 	/* Read RLE code */
732 	hibernate_state->hib_stream.next_out = (char *)&rle;
733 	hibernate_state->hib_stream.avail_out = sizeof(rle);
734 
735 	i = inflate(&hibernate_state->hib_stream, Z_FULL_FLUSH);
736 	if (i != Z_OK && i != Z_STREAM_END) {
737 		/*
738 		 * XXX - this will likely reboot/hang most machines,
739 		 *       but there's not much else we can do here.
740 		 */
741 		panic("inflate rle error");
742 	}
743 
744 	/* Sanity check what RLE value we got */
745 	if (rle > HIBERNATE_CHUNK_SIZE/PAGE_SIZE || rle < 0)
746 		panic("invalid RLE code");
747 
748 	if (i == Z_STREAM_END)
749 		rle = -1;
750 
751 	return rle;
752 }
753 
754 /*
755  * Inflate next page of data from the image stream
756  */
757 int
758 hibernate_inflate_page(void)
759 {
760 	struct hibernate_zlib_state *hibernate_state;
761 	int i;
762 
763 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
764 
765 	/* Set up the stream for inflate */
766 	hibernate_state->hib_stream.next_out = (char *)HIBERNATE_INFLATE_PAGE;
767 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
768 
769 	/* Process next block of data */
770 	i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
771 	if (i != Z_OK && i != Z_STREAM_END) {
772 		/*
773 		 * XXX - this will likely reboot/hang most machines,
774 		 *       but there's not much else we can do here.
775 		 */
776 
777 		panic("inflate error");
778 	}
779 
780 	/* We should always have extracted a full page ... */
781 	if (hibernate_state->hib_stream.avail_out != 0)
782 		panic("incomplete page");
783 
784 	return (i == Z_STREAM_END);
785 }
786 
787 /*
788  * Inflate size bytes from src into dest, skipping any destination
789  * pages that are special (see hibernate_inflate_skip).
790  *
791  * This function executes while using the resume-time stack
792  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
793  * will likely hang or reset the machine.
794  */
795 void
796 hibernate_inflate_region(union hibernate_info *hiber_info, paddr_t dest,
797     paddr_t src, size_t size)
798 {
799 	int rle, end_stream = 0;
800 	struct hibernate_zlib_state *hibernate_state;
801 
802 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
803 
804 	hibernate_state->hib_stream.next_in = (char *)src;
805 	hibernate_state->hib_stream.avail_in = size;
806 
807 	do {
808 		/* Flush cache and TLB */
809 		hibernate_flush();
810 
811 		/* Consume RLE skipped pages */
812 		do {
813 			rle = hibernate_get_next_rle();
814 			if (rle == -1) {
815 				end_stream = 1;
816 				goto next_page;
817 			}
818 
819 			if (rle != 0)
820 				dest += (rle * PAGE_SIZE);
821 
822 		} while (rle != 0);
823 
824 		/*
825 		 * Is this a special page? If yes, redirect the
826 		 * inflate output to a scratch page (i.e., discard it)
827 		 */
828 		if (hibernate_inflate_skip(hiber_info, dest)) {
829 			hibernate_enter_resume_mapping(
830 			    HIBERNATE_INFLATE_PAGE,
831 			    HIBERNATE_INFLATE_PAGE, 0);
832 		} else {
833 			hibernate_enter_resume_mapping(
834 			    HIBERNATE_INFLATE_PAGE, dest, 0);
835 		}
836 
837 		hibernate_flush();
838 		end_stream = hibernate_inflate_page();
839 
840 next_page:
841 		dest += PAGE_SIZE;
842 	} while (!end_stream);
843 }
844 
845 /*
846  * Deflate from src into the I/O page, up to 'remaining' bytes.
847  *
848  * Returns the number of input bytes consumed, and updates the
849  * 'remaining' parameter to the amount of output space still free
850  * (this information is needed to know how much to write to disk).
851  */
852 size_t
853 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
854     size_t *remaining)
855 {
856 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
857 	struct hibernate_zlib_state *hibernate_state;
858 
859 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
860 
861 	/* Set up the stream for deflate */
862 	hibernate_state->hib_stream.next_in = (caddr_t)src;
863 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
864 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
865 	    (PAGE_SIZE - *remaining);
866 	hibernate_state->hib_stream.avail_out = *remaining;
867 
868 	/* Process next block of data */
869 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
870 		panic("hibernate zlib deflate error");
871 
872 	/* Update pointers and return number of bytes consumed */
873 	*remaining = hibernate_state->hib_stream.avail_out;
874 	return (PAGE_SIZE - (src & PAGE_MASK)) -
875 	    hibernate_state->hib_stream.avail_in;
876 }
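
/*
 * Usage sketch (this mirrors the inner loop of hibernate_write_chunks
 * below, using that function's local variables): the caller tracks how
 * much of the I/O page is still free and writes the page out whenever it
 * fills up.
 *
 *	inaddr += hibernate_deflate(hiber_info, temp_inaddr, &out_remaining);
 *	if (out_remaining == 0) {
 *		hiber_info->io_func(hiber_info->device, blkctr,
 *		    (vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
 *		    hiber_info->io_page);
 *		blkctr += PAGE_SIZE / hiber_info->secsize;
 *		out_remaining = PAGE_SIZE;
 *	}
 */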
877 
878 /*
879  * Write the hibernation information specified in hiber_info
880  * to the location in swap previously calculated (last block of
881  * swap), called the "signature block".
882  *
883  * Write the memory chunk table to the area in swap immediately
884  * preceding the signature block.
885  */
886 int
887 hibernate_write_signature(union hibernate_info *hiber_info)
888 {
889 	/* Write hibernate info to disk */
890 	return (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
891 	    (vaddr_t)hiber_info, hiber_info->secsize, HIB_W,
892 	    hiber_info->io_page));
893 }
894 
895 /*
896  * Write the memory chunk table to the area in swap immediately
897  * preceding the signature block. The chunk table is stored
898  * in the piglet when this function is called.
899  */
900 int
901 hibernate_write_chunktable(union hibernate_info *hiber_info)
902 {
903 	struct hibernate_disk_chunk *chunks;
904 	vaddr_t hibernate_chunk_table_start;
905 	size_t hibernate_chunk_table_size;
906 	daddr_t chunkbase;
907 	int i;
908 
909 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
910 
911 	chunkbase = hiber_info->sig_offset -
912 	    (hibernate_chunk_table_size / hiber_info->secsize);
913 
914 	hibernate_chunk_table_start = hiber_info->piglet_va +
915 	    HIBERNATE_CHUNK_SIZE;
916 
917 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
918 	    HIBERNATE_CHUNK_SIZE);
919 
920 	/* Write chunk table */
921 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
922 		if (hiber_info->io_func(hiber_info->device,
923 		    chunkbase + (i/hiber_info->secsize),
924 		    (vaddr_t)(hibernate_chunk_table_start + i),
925 		    MAXPHYS, HIB_W, hiber_info->io_page))
926 			return (1);
927 	}
928 
929 	return (0);
930 }
931 
932 /*
933  * Write an empty hiber_info to the swap signature block, which is
934  * guaranteed to not match any valid hiber_info.
935  */
936 int
937 hibernate_clear_signature(void)
938 {
939 	union hibernate_info blank_hiber_info;
940 	union hibernate_info hiber_info;
941 
942 	/* Zero out a blank hiber_info */
943 	bzero(&blank_hiber_info, sizeof(hiber_info));
944 
945 	if (get_hibernate_info(&hiber_info, 0))
946 		return (1);
947 
948 	/* Write (zeroed) hibernate info to disk */
949 	if (hibernate_block_io(&hiber_info,
950 	    hiber_info.sig_offset - hiber_info.swap_offset,
951 	    hiber_info.secsize, (vaddr_t)&blank_hiber_info, 1))
952 		panic("error hibernate write 6");
953 
954 	return (0);
955 }
956 
957 /*
958  * Check chunk range overlap when calculating whether or not to copy a
959  * compressed chunk to the piglet area before decompressing.
960  *
961  * returns zero if the ranges do not overlap, non-zero otherwise.
962  */
963 int
964 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
965 {
966 	/* case A : end of r1 overlaps start of r2 */
967 	if (r1s < r2s && r1e > r2s)
968 		return (1);
969 
970 	/* case B : r1 entirely inside r2 */
971 	if (r1s >= r2s && r1e <= r2e)
972 		return (1);
973 
974 	/* case C : r2 entirely inside r1 */
975 	if (r2s >= r1s && r2e <= r1e)
976 		return (1);
977 
978 	/* case D : end of r2 overlaps start of r1 */
979 	if (r2s < r1s && r2e > r1s)
980 		return (1);
981 
982 	return (0);
983 }
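
/*
 * Example: r1 = [0x1000, 0x3000) and r2 = [0x2000, 0x4000) overlap (case A
 * above), so hibernate_check_overlap(0x1000, 0x3000, 0x2000, 0x4000)
 * returns 1; disjoint ranges such as [0x1000, 0x2000) and [0x3000, 0x4000)
 * return 0.
 */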
984 
985 /*
986  * Compare two hibernate_infos to determine if they are the same (i.e.,
987  * whether we should be performing a hibernate resume on this machine).
988  * Not all fields are checked - just enough to verify that the machine
989  * has the same memory configuration and kernel as the one that
990  * wrote the signature previously.
991  */
992 int
993 hibernate_compare_signature(union hibernate_info *mine,
994     union hibernate_info *disk)
995 {
996 	u_int i;
997 
998 	if (mine->nranges != disk->nranges)
999 		return (1);
1000 
1001 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
1002 		return (1);
1003 
1004 	for (i = 0; i < mine->nranges; i++) {
1005 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
1006 		    (mine->ranges[i].end != disk->ranges[i].end) )
1007 			return (1);
1008 	}
1009 
1010 	return (0);
1011 }
1012 
1013 /*
1014  * Transfers xfer_size bytes between the hibernate device specified in
1015  * hib_info at offset blkctr and the vaddr specified at dest.
1016  *
1017  * Separate offsets and pages are used to handle misaligned reads (reads
1018  * that span a page boundary).
1019  *
1020  * blkctr specifies a relative offset (relative to the start of swap),
1021  * not an absolute disk offset.
1022  *
1023  */
1024 int
1025 hibernate_block_io(union hibernate_info *hib_info, daddr_t blkctr,
1026     size_t xfer_size, vaddr_t dest, int iswrite)
1027 {
1028 	struct buf *bp;
1029 	struct bdevsw *bdsw;
1030 	int error;
1031 
1032 	bp = geteblk(xfer_size);
1033 	bdsw = &bdevsw[major(hib_info->device)];
1034 
1035 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
1036 	if (error) {
1037 		printf("hibernate_block_io open failed\n");
1038 		return (1);
1039 	}
1040 
1041 	if (iswrite)
1042 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
1043 
1044 	bp->b_bcount = xfer_size;
1045 	bp->b_blkno = blkctr;
1046 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1047 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
1048 	bp->b_dev = hib_info->device;
1049 	bp->b_cylinder = 0;
1050 	(*bdsw->d_strategy)(bp);
1051 
1052 	error = biowait(bp);
1053 	if (error) {
1054 		printf("hibernate_block_io biowait failed %d\n", error);
1055 		error = (*bdsw->d_close)(hib_info->device, 0, S_IFCHR,
1056 		    curproc);
1057 		if (error)
1058 			printf("hibernate_block_io error close failed\n");
1059 		return (1);
1060 	}
1061 
1062 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
1063 	if (error) {
1064 		printf("hibernate_block_io close failed\n");
1065 		return (1);
1066 	}
1067 
1068 	if (!iswrite)
1069 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
1070 
1071 	bp->b_flags |= B_INVAL;
1072 	brelse(bp);
1073 
1074 	return (0);
1075 }
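
/*
 * Usage sketch (this is how hibernate_resume reads the signature block
 * below): callers convert the absolute disk block to a swap-relative one
 * before calling in.
 *
 *	if (hibernate_block_io(&hiber_info,
 *	    hiber_info.sig_offset - hiber_info.swap_offset,
 *	    hiber_info.secsize, (vaddr_t)&disk_hiber_info, 0))
 *		panic("error in hibernate read");
 */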
1076 
1077 /*
1078  * Reads the signature block from swap, checks against the current machine's
1079  * information. If the information matches, perform a resume by reading the
1080  * saved image into the pig area, and unpacking.
1081  */
1082 void
1083 hibernate_resume(void)
1084 {
1085 	union hibernate_info hiber_info;
1086 	int s;
1087 
1088 	/* Get current running machine's hibernate info */
1089 	bzero(&hiber_info, sizeof(hiber_info));
1090 	if (get_hibernate_info(&hiber_info, 0))
1091 		return;
1092 
1093 	/* Read hibernate info from disk */
1094 	s = splbio();
1095 
1096 	if (hibernate_block_io(&hiber_info,
1097 	    hiber_info.sig_offset - hiber_info.swap_offset,
1098 	    hiber_info.secsize, (vaddr_t)&disk_hiber_info, 0))
1099 		panic("error in hibernate read");
1100 
1101 	/*
1102 	 * If the on-disk and in-memory hibernate signatures match, we should
1103 	 * resume from hibernate; otherwise there is nothing to do here.
1104 	 */
1105 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info)) {
1106 		splx(s);
1107 		return;
1108 	}
1109 
1110 	/* Read the image from disk into the image (pig) area */
1111 	if (hibernate_read_image(&disk_hiber_info))
1112 		goto fail;
1113 
1114 	if (config_suspend(TAILQ_FIRST(&alldevs), DVACT_QUIESCE) != 0)
1115 		goto fail;
1116 
1117 	(void) splhigh();
1118 	disable_intr();
1119 	cold = 1;
1120 
1121 	if (config_suspend(TAILQ_FIRST(&alldevs), DVACT_SUSPEND) != 0) {
1122 		cold = 0;
1123 		enable_intr();
1124 		goto fail;
1125 	}
1126 
1127 	/* Point of no return ... */
1128 
1129 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1130 	    VM_PROT_ALL);
1131 	pmap_activate(curproc);
1132 
1133 	/* Switch stacks */
1134 	hibernate_switch_stack_machdep();
1135 
1136 	/*
1137 	 * Image is now in high memory (pig area), copy to correct location
1138 	 * in memory. We'll eventually end up copying on top of ourself, but
1139 	 * we are assured the kernel code here is the same between the
1140 	 * hibernated and resuming kernel, and we are running on our own
1141 	 * stack, so the overwrite is ok.
1142 	 */
1143 	hibernate_unpack_image(&disk_hiber_info);
1144 
1145 	/*
1146 	 * Resume the loaded kernel by jumping to the MD resume vector.
1147 	 * We won't be returning from this call.
1148 	 */
1149 	hibernate_resume_machdep();
1150 
1151 fail:
1152 	splx(s);
1153 	printf("Unable to resume hibernated image\n");
1154 }
1155 
1156 /*
1157  * Unpack image from pig area to original location by looping through the
1158  * list of output chunks in the order they should be restored (fchunks).
1159  * This ordering is used to avoid having inflate overwrite a chunk in the
1160  * middle of processing that chunk. This will, of course, happen during the
1161  * final output chunk, where we copy the chunk to the piglet area first,
1162  * before inflating.
1163  */
1164 void
1165 hibernate_unpack_image(union hibernate_info *hiber_info)
1166 {
1167 	struct hibernate_disk_chunk *chunks;
1168 	union hibernate_info local_hiber_info;
1169 	paddr_t image_cur = global_pig_start;
1170 	int *fchunks, i;
1171 	char *pva = (char *)hiber_info->piglet_va;
1172 	struct hibernate_zlib_state *hibernate_state;
1173 
1174 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1175 
1176 	/* Mask off based on arch-specific piglet page size */
1177 	pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1178 	fchunks = (int *)(pva + (6 * PAGE_SIZE));
1179 
1180 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1181 
1182 	/* Can't use hiber_info that's passed in after this point */
1183 	bcopy(hiber_info, &local_hiber_info, sizeof(union hibernate_info));
1184 
1185 	hibernate_activate_resume_pt_machdep();
1186 
1187 	for (i = 0; i < local_hiber_info.chunk_ctr; i++) {
1188 		/* Reset zlib for inflate */
1189 		if (hibernate_zlib_reset(&local_hiber_info, 0) != Z_OK)
1190 			panic("hibernate failed to reset zlib for inflate");
1191 
1192 		hibernate_process_chunk(&local_hiber_info, &chunks[fchunks[i]],
1193 		    image_cur);
1194 
1195 		image_cur += chunks[fchunks[i]].compressed_size;
1196 
1197 	}
1198 }
1199 
1200 /*
1201  * Process a chunk by ensuring its proper placement, followed by unpacking
1202  */
1203 void
1204 hibernate_process_chunk(union hibernate_info *hiber_info,
1205     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1206 {
1207 	char *pva = (char *)hiber_info->piglet_va;
1208 
1209 	/*
1210 	 * If there is a conflict, copy the chunk to the piglet area
1211 	 * before unpacking it to its original location.
1212 	 */
1213 	if ((chunk->flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1214 		hibernate_inflate_region(hiber_info, chunk->base,
1215 		    img_cur, chunk->compressed_size);
1216 	else {
1217 		bcopy((caddr_t)img_cur,
1218 		    pva + (HIBERNATE_CHUNK_SIZE * 2),
1219 		    chunk->compressed_size);
1220 		hibernate_inflate_region(hiber_info, chunk->base,
1221 		    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1222 		    chunk->compressed_size);
1223 	}
1224 }
1225 
1226 /*
1227  * Write a compressed version of this machine's memory to disk, at the
1228  * precalculated swap offset:
1229  *
1230  * end of swap - signature block size - chunk table size - memory size
1231  *
1232  * The function begins by looping through each phys mem range, cutting each
1233  * one into MD sized chunks. These chunks are then compressed individually
1234  * and written out to disk, in phys mem order. Some chunks might compress
1235  * more than others, and for this reason, each chunk's size is recorded
1236  * in the chunk table, which is written to disk after the image has
1237  * properly been compressed and written (in hibernate_write_chunktable).
1238  *
1239  * When this function is called, the machine is nearly suspended - most
1240  * devices are quiesced/suspended, interrupts are off, and cold has
1241  * been set. This means that there can be no side effects once the
1242  * write has started, and the write function itself can also have no
1243  * side effects. This also means no printfs are permitted (since printf
1244  * has side effects).
1245  */
1246 int
1247 hibernate_write_chunks(union hibernate_info *hiber_info)
1248 {
1249 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1250 	size_t nblocks, out_remaining, used;
1251 	struct hibernate_disk_chunk *chunks;
1252 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1253 	daddr_t blkctr = hiber_info->image_offset, offset = 0;
1254 	int i, rle;
1255 	struct hibernate_zlib_state *hibernate_state;
1256 
1257 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1258 
1259 	hiber_info->chunk_ctr = 0;
1260 
1261 	/*
1262 	 * Allocate VA for the temp and copy page.
1263 	 * These will become part of the suspended kernel and will
1264 	 * be freed in hibernate_free, upon resume.
1265 	 */
1266 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1267 	    &kp_none, &kd_nowait);
1268 	if (!hibernate_temp_page)
1269 		return (1);
1270 
1271 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1272 	    &kp_none, &kd_nowait);
1273 	if (!hibernate_copy_page)
1274 		return (1);
1275 
1276 	pmap_kenter_pa(hibernate_copy_page,
1277 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1278 
1279 	/* XXX - not needed on all archs */
1280 	pmap_activate(curproc);
1281 
1282 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1283 	    HIBERNATE_CHUNK_SIZE);
1284 
1285 	/* Calculate the chunk regions */
1286 	for (i = 0; i < hiber_info->nranges; i++) {
1287 		range_base = hiber_info->ranges[i].base;
1288 		range_end = hiber_info->ranges[i].end;
1289 
1290 		inaddr = range_base;
1291 
1292 		while (inaddr < range_end) {
1293 			chunks[hiber_info->chunk_ctr].base = inaddr;
1294 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1295 				chunks[hiber_info->chunk_ctr].end = inaddr +
1296 				    HIBERNATE_CHUNK_SIZE;
1297 			else
1298 				chunks[hiber_info->chunk_ctr].end = range_end;
1299 
1300 			inaddr += HIBERNATE_CHUNK_SIZE;
1301 			hiber_info->chunk_ctr++;
1302 		}
1303 	}
1304 
1305 	/* Compress and write the chunks in the chunktable */
1306 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1307 		range_base = chunks[i].base;
1308 		range_end = chunks[i].end;
1309 
1310 		chunks[i].offset = blkctr;
1311 
1312 		/* Reset zlib for deflate */
1313 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1314 			return (1);
1315 
1316 		inaddr = range_base;
1317 
1318 		/*
1319 		 * For each range, loop through its phys mem region
1320 		 * and write out the chunks (the last chunk might be
1321 		 * smaller than the chunk size).
1322 		 */
1323 		while (inaddr < range_end) {
1324 			out_remaining = PAGE_SIZE;
1325 			while (out_remaining > 0 && inaddr < range_end) {
1326 
1327 				/*
1328 				 * Adjust for regions that are not evenly
1329 				 * divisible by PAGE_SIZE or overflowed
1330 				 * pages from the previous iteration.
1331 				 */
1332 				temp_inaddr = (inaddr & PAGE_MASK) +
1333 				    hibernate_copy_page;
1334 
1335 				if (hibernate_inflate_skip(hiber_info, inaddr))
1336 					rle = 1;
1337 				else
1338 					rle = uvm_page_rle(inaddr);
1339 
1340 				while (rle != 0 && inaddr < range_end) {
1341 					hibernate_state->hib_stream.next_in =
1342 					    (char *)&rle;
1343 					hibernate_state->hib_stream.avail_in =
1344 					    sizeof(rle);
1345 					hibernate_state->hib_stream.next_out =
1346 					    (caddr_t)hibernate_io_page +
1347 					    (PAGE_SIZE - out_remaining);
1348 					hibernate_state->hib_stream.avail_out =
1349 					    out_remaining;
1350 
1351 					if (deflate(&hibernate_state->hib_stream,
1352 					    Z_PARTIAL_FLUSH) != Z_OK)
1353 						return (1);
1354 
1355 					out_remaining =
1356 					    hibernate_state->hib_stream.avail_out;
1357 					inaddr += (rle * PAGE_SIZE);
1358 					if (inaddr > range_end)
1359 						inaddr = range_end;
1360 					else
1361 						rle = uvm_page_rle(inaddr);
1362 				}
1363 
1364 				if (out_remaining == 0) {
1365 					/* Filled up the page */
1366 					nblocks = PAGE_SIZE / hiber_info->secsize;
1367 
1368 					if (hiber_info->io_func(hiber_info->device,
1369 					    blkctr, (vaddr_t)hibernate_io_page,
1370 					    PAGE_SIZE, HIB_W, hiber_info->io_page))
1371 						return (1);
1372 
1373 					blkctr += nblocks;
1374 					out_remaining = PAGE_SIZE;
1375 				}
1376 
1377 				/* Write '0' RLE code */
1378 				if (inaddr < range_end) {
1379 					hibernate_state->hib_stream.next_in =
1380 					    (char *)&rle;
1381 					hibernate_state->hib_stream.avail_in =
1382 					    sizeof(rle);
1383 					hibernate_state->hib_stream.next_out =
1384 				    	    (caddr_t)hibernate_io_page +
1385 					    (PAGE_SIZE - out_remaining);
1386 					hibernate_state->hib_stream.avail_out =
1387 					    out_remaining;
1388 
1389 					if (deflate(&hibernate_state->hib_stream,
1390 					    Z_PARTIAL_FLUSH) != Z_OK)
1391 						return (1);
1392 
1393 					out_remaining =
1394 					    hibernate_state->hib_stream.avail_out;
1395 				}
1396 
1397 				if (out_remaining == 0) {
1398 					/* Filled up the page */
1399 					nblocks = PAGE_SIZE / hiber_info->secsize;
1400 
1401 					if (hiber_info->io_func(hiber_info->device,
1402 					    blkctr, (vaddr_t)hibernate_io_page,
1403 					    PAGE_SIZE, HIB_W, hiber_info->io_page))
1404 						return (1);
1405 
1406 					blkctr += nblocks;
1407 					out_remaining = PAGE_SIZE;
1408 				}
1409 
1410 				/* Deflate from temp_inaddr to IO page */
1411 				if (inaddr != range_end) {
1412 					pmap_kenter_pa(hibernate_temp_page,
1413 					    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1414 
1415 					/* XXX - not needed on all archs */
1416 					pmap_activate(curproc);
1417 
1418 					bcopy((caddr_t)hibernate_temp_page,
1419 					    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1420 					inaddr += hibernate_deflate(hiber_info,
1421 					    temp_inaddr, &out_remaining);
1422 				}
1423 			}
1424 
1425 			if (out_remaining == 0) {
1426 				/* Filled up the page */
1427 				nblocks = PAGE_SIZE / hiber_info->secsize;
1428 
1429 				if (hiber_info->io_func(hiber_info->device,
1430 				    blkctr, (vaddr_t)hibernate_io_page,
1431 				    PAGE_SIZE, HIB_W, hiber_info->io_page))
1432 					return (1);
1433 
1434 				blkctr += nblocks;
1435 			}
1436 		}
1437 
1438 		if (inaddr != range_end)
1439 			return (1);
1440 
1441 		/*
1442 		 * End of range. Round up to next secsize bytes
1443 		 * after finishing compress
1444 		 */
1445 		if (out_remaining == 0)
1446 			out_remaining = PAGE_SIZE;
1447 
1448 		/* Finish compress */
1449 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1450 		hibernate_state->hib_stream.avail_in = 0;
1451 		hibernate_state->hib_stream.next_out =
1452 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1453 		hibernate_state->hib_stream.avail_out = out_remaining;
1454 
1455 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1456 		    Z_STREAM_END)
1457 			return (1);
1458 
1459 		out_remaining = hibernate_state->hib_stream.avail_out;
1460 
1461 		used = PAGE_SIZE - out_remaining;
1462 		nblocks = used / hiber_info->secsize;
1463 
1464 		/* Round up to next block if needed */
1465 		if (used % hiber_info->secsize != 0)
1466 			nblocks++;
1467 
1468 		/* Write final block(s) for this chunk */
1469 		if (hiber_info->io_func(hiber_info->device, blkctr,
1470 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1471 		    HIB_W, hiber_info->io_page))
1472 			return (1);
1473 
1474 		blkctr += nblocks;
1475 
1476 		offset = blkctr;
1477 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1478 		    hiber_info->secsize;
1479 	}
1480 
1481 	return (0);
1482 }
1483 
1484 /*
1485  * Reset the zlib stream state and allocate a new hiballoc area for either
1486  * inflate or deflate. This function is called once for each hibernate chunk.
1487  * Calling hiballoc_init multiple times is acceptable since the memory it is
1488  * provided is unmanaged memory (stolen). We use the memory provided to us
1489  * by the piglet allocated via the supplied hiber_info.
1490  */
1491 int
1492 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1493 {
1494 	vaddr_t hibernate_zlib_start;
1495 	size_t hibernate_zlib_size;
1496 	char *pva = (char *)hiber_info->piglet_va;
1497 	struct hibernate_zlib_state *hibernate_state;
1498 
1499 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1500 
1501 	if (!deflate)
1502 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1503 
1504 	hibernate_zlib_start = (vaddr_t)(pva + (8 * PAGE_SIZE));
1505 	hibernate_zlib_size = 80 * PAGE_SIZE;
1506 
1507 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1508 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1509 
1510 	/* Set up stream structure */
1511 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1512 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1513 
1514 	/* Initialize the hiballoc arena for zlib allocs/frees */
1515 	hiballoc_init(&hibernate_state->hiballoc_arena,
1516 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1517 
1518 	if (deflate) {
1519 		return deflateInit(&hibernate_state->hib_stream,
1520 		    Z_BEST_SPEED);
1521 	} else
1522 		return inflateInit(&hibernate_state->hib_stream);
1523 }
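
/*
 * How this file carves up the piglet (offsets collected from
 * get_hibernate_info, hibernate_deflate, hibernate_write_chunks,
 * hibernate_unpack_image, hibernate_read_chunks and hibernate_zlib_reset):
 *
 *	piglet + 0 pages		I/O page handed to io_func
 *	piglet + 1 page			deflate output (I/O) page
 *	piglet + 3 pages		backing for hibernate_copy_page
 *	piglet + 4..6 pages		chunk ordering arrays (o/p/fchunks)
 *	piglet + 8..87 pages		zlib scratch arena (80 pages)
 *	piglet + HIBERNATE_CHUNK_SIZE	chunk table
 *	piglet + 2*HIBERNATE_CHUNK_SIZE	conflict relocation area
 */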
1524 
1525 /*
1526  * Reads the hibernated memory image from disk, whose location and
1527  * size are recorded in hiber_info. Begin by reading the persisted
1528  * chunk table, which records the original chunk placement location
1529  * and compressed size for each. Next, allocate a pig region of
1530  * sufficient size to hold the compressed image. Next, read the
1531  * chunks into the pig area (calling hibernate_read_chunks to do this),
1532  * and finally, if all of the above succeeds, clear the hibernate signature.
1533  * The function will then return to hibernate_resume, which will proceed
1534  * to unpack the pig image to the correct place in memory.
1535  */
1536 int
1537 hibernate_read_image(union hibernate_info *hiber_info)
1538 {
1539 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1540 	paddr_t image_start, image_end, pig_start, pig_end;
1541 	struct hibernate_disk_chunk *chunks;
1542 	daddr_t blkctr;
1543 	vaddr_t chunktable = (vaddr_t)NULL;
1544 	paddr_t piglet_chunktable = hiber_info->piglet_pa +
1545 	    HIBERNATE_CHUNK_SIZE;
1546 	int i;
1547 
1548 	pmap_activate(curproc);
1549 
1550 	/* Calculate total chunk table size in disk blocks */
1551 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1552 
1553 	blkctr = hiber_info->sig_offset - chunktable_size -
1554 			hiber_info->swap_offset;
1555 
1556 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1557 	    &kp_none, &kd_nowait);
1558 
1559 	if (!chunktable)
1560 		return (1);
1561 
1562 	/* Read the chunktable from disk into the piglet chunktable */
1563 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1564 	    i += PAGE_SIZE, blkctr += PAGE_SIZE/hiber_info->secsize) {
1565 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i, VM_PROT_ALL);
1566 		pmap_update(pmap_kernel());
1567 		hibernate_block_io(hiber_info, blkctr, PAGE_SIZE,
1568 		    chunktable + i, 0);
1569 	}
1570 
1571 	blkctr = hiber_info->image_offset;
1572 	compressed_size = 0;
1573 
1574 	chunks = (struct hibernate_disk_chunk *)chunktable;
1575 
1576 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1577 		compressed_size += chunks[i].compressed_size;
1578 
1579 	disk_size = compressed_size;
1580 
1581 	/* Allocate the pig area */
1582 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1583 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1584 		return (1);
1585 
1586 	pig_end = pig_start + pig_sz;
1587 
1588 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1589 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1590 	image_start = pig_start;
1591 
1592 	image_start = image_end - disk_size;
1593 
1594 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size,
1595 	    chunks);
1596 
1597 	pmap_kremove(chunktable, PAGE_SIZE);
1598 	pmap_update(pmap_kernel());
1599 
1600 	/* Prepare the resume time pmap/page table */
1601 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1602 
1603 	/* Read complete, clear the signature and return */
1604 	return hibernate_clear_signature();
1605 }
1606 
1607 /*
1608  * Read the hibernated memory chunks from disk (chunk information at this
1609  * point is stored in the piglet) into the pig area specified by
1610  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1611  * only chunk with overlap possibilities.
1612  */
1613 int
1614 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1615     paddr_t pig_end, size_t image_compr_size,
1616     struct hibernate_disk_chunk *chunks)
1617 {
1618 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1619 	paddr_t copy_start, copy_end, piglet_cur;
1620 	paddr_t piglet_base = hib_info->piglet_pa;
1621 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1622 	daddr_t blkctr;
1623 	size_t processed, compressed_size, read_size;
1624 	int i, j, overlap, found, nchunks;
1625 	int nochunks = 0, nfchunks = 0, npchunks = 0;
1626 	int *ochunks, *pchunks, *fchunks;
1627 	vaddr_t tempva = (vaddr_t)NULL, hibernate_fchunk_area = (vaddr_t)NULL;
1628 
1629 	global_pig_start = pig_start;
1630 
1631 	/* XXX - don't need this on all archs */
1632 	pmap_activate(curproc);
1633 
1634 	/*
1635 	 * These mappings go into the resuming kernel's page table, and are
1636 	 * used only during image read. They disappear from existence
1637 	 * when the suspended kernel is unpacked on top of us.
1638 	 */
1639 	tempva = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
1640 	if (!tempva)
1641 		return (1);
1642 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1643 	    &kp_none, &kd_nowait);
1644 	if (!hibernate_fchunk_area)
1645 		return (1);
1646 
1647 	/* Temporary output chunk ordering VA */
1648 	ochunks = (int *)hibernate_fchunk_area;
1649 
1650 	/* Piglet chunk ordering VA */
1651 	pchunks = (int *)(hibernate_fchunk_area + PAGE_SIZE);
1652 
1653 	/* Final chunk ordering VA */
1654 	fchunks = (int *)(hibernate_fchunk_area + (2*PAGE_SIZE));
1655 
1656 	/* Map the chunk ordering region */
1657 	pmap_kenter_pa(hibernate_fchunk_area,
1658 	    piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1659 	pmap_update(pmap_kernel());
1660 	pmap_kenter_pa((vaddr_t)pchunks, piglet_base + (5*PAGE_SIZE),
1661 	    VM_PROT_ALL);
1662 	pmap_update(pmap_kernel());
1663 	pmap_kenter_pa((vaddr_t)fchunks, piglet_base + (6*PAGE_SIZE),
1664 	    VM_PROT_ALL);
1665 	pmap_update(pmap_kernel());
1666 
1667 	nchunks = hib_info->chunk_ctr;
1668 
1669 	/* Initially start all chunks as unplaced */
1670 	for (i = 0; i < nchunks; i++)
1671 		chunks[i].flags = 0;
1672 
1673 	/*
1674 	 * Search the list for chunks that are outside the pig area. These
1675 	 * can be placed first in the final output list.
1676 	 */
1677 	for (i = 0; i < nchunks; i++) {
1678 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1679 			ochunks[nochunks] = i;
1680 			fchunks[nfchunks] = i;
1681 			nochunks++;
1682 			nfchunks++;
1683 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1684 		}
1685 	}
1686 
1687 	/*
1688 	 * Walk the ordering, place the chunks in ascending memory order.
1689 	 * Conflicts might arise, these are handled next.
1690 	 */
1691 	do {
1692 		img_index = -1;
1693 		found = 0;
1694 		j = -1;
1695 		for (i = 0; i < nchunks; i++)
1696 			if (chunks[i].base < img_index &&
1697 			    chunks[i].flags == 0 ) {
1698 				j = i;
1699 				img_index = chunks[i].base;
1700 			}
1701 
1702 		if (j != -1) {
1703 			found = 1;
1704 			ochunks[nochunks] = (short)j;
1705 			nochunks++;
1706 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1707 		}
1708 	} while (found);
1709 
1710 	img_index = pig_start;
1711 
1712 	/*
1713 	 * Identify chunk output conflicts (chunks whose pig load area
1714 	 * corresponds to their original memory placement location)
1715 	 */
1716 	for (i = 0; i < nochunks ; i++) {
1717 		overlap = 0;
1718 		r1s = img_index;
1719 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1720 		r2s = chunks[ochunks[i]].base;
1721 		r2e = chunks[ochunks[i]].end;
1722 
1723 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1724 		if (overlap)
1725 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1726 		img_index += chunks[ochunks[i]].compressed_size;
1727 	}
1728 
1729 	/*
1730 	 * Prepare the final output chunk list. Calculate an output
1731 	 * inflate strategy for overlapping chunks if needed.
1732 	 */
1733 	img_index = pig_start;
1734 	for (i = 0; i < nochunks ; i++) {
1735 		/*
1736 		 * If a conflict is detected, consume enough compressed
1737 		 * output chunks to fill the piglet
1738 		 */
1739 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1740 			copy_start = piglet_base;
1741 			copy_end = piglet_end;
1742 			piglet_cur = piglet_base;
1743 			npchunks = 0;
1744 			j = i;
1745 
1746 			while (copy_start < copy_end && j < nochunks) {
1747 				piglet_cur += chunks[ochunks[j]].compressed_size;
1748 				pchunks[npchunks] = ochunks[j];
1749 				npchunks++;
1750 				copy_start += chunks[ochunks[j]].compressed_size;
1751 				img_index += chunks[ochunks[j]].compressed_size;
1752 				i++;
1753 				j++;
1754 			}
1755 
1756 			piglet_cur = piglet_base;
1757 			for (j = 0; j < npchunks; j++) {
1758 				piglet_cur += chunks[pchunks[j]].compressed_size;
1759 				fchunks[nfchunks] = pchunks[j];
1760 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1761 				nfchunks++;
1762 			}
1763 		} else {
1764 			/*
1765 			 * No conflict, chunk can be added without copying
1766 			 */
1767 			if ((chunks[ochunks[i]].flags &
1768 			    HIBERNATE_CHUNK_USED) == 0) {
1769 				fchunks[nfchunks] = ochunks[i];
1770 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1771 				nfchunks++;
1772 			}
1773 			img_index += chunks[ochunks[i]].compressed_size;
1774 		}
1775 	}
1776 
1777 	img_index = pig_start;
1778 	for (i = 0; i < nfchunks; i++) {
1779 		piglet_cur = piglet_base;
1780 		img_index += chunks[fchunks[i]].compressed_size;
1781 	}
1782 
1783 	img_cur = pig_start;
1784 
1785 	for (i = 0; i < nfchunks; i++) {
1786 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1787 		processed = 0;
1788 		compressed_size = chunks[fchunks[i]].compressed_size;
1789 
1790 		while (processed < compressed_size) {
1791 			pmap_kenter_pa(tempva, img_cur, VM_PROT_ALL);
1792 			pmap_kenter_pa(tempva + PAGE_SIZE, img_cur+PAGE_SIZE,
1793 			    VM_PROT_ALL);
1794 			pmap_update(pmap_kernel());
1795 
1796 			if (compressed_size - processed >= PAGE_SIZE)
1797 				read_size = PAGE_SIZE;
1798 			else
1799 				read_size = compressed_size - processed;
1800 
1801 			hibernate_block_io(hib_info, blkctr, read_size,
1802 			    tempva + (img_cur & PAGE_MASK), 0);
1803 
1804 			blkctr += (read_size / hib_info->secsize);
1805 
1806 			hibernate_flush();
1807 			pmap_kremove(tempva, PAGE_SIZE);
1808 			pmap_kremove(tempva + PAGE_SIZE, PAGE_SIZE);
1809 			processed += read_size;
1810 			img_cur += read_size;
1811 		}
1812 	}
1813 
1814 	pmap_kremove(hibernate_fchunk_area, PAGE_SIZE);
1815 	pmap_kremove((vaddr_t)pchunks, PAGE_SIZE);
1816 	pmap_kremove((vaddr_t)fchunks, PAGE_SIZE);
1817 	pmap_update(pmap_kernel());
1818 
1819 	return (0);
1820 }
1821 
1822 /*
1823  * Hibernating a machine comprises the following operations:
1824  *  1. Calculating this machine's hibernate_info information
1825  *  2. Allocating a piglet and saving the piglet's physaddr
1826  *  3. Calculating the memory chunks
1827  *  4. Writing the compressed chunks to disk
1828  *  5. Writing the chunk table
1829  *  6. Writing the signature block (hibernate_info)
1830  *
1831  * On most architectures, the function calling hibernate_suspend would
1832  * then power off the machine using some MD-specific implementation.
1833  */
1834 int
1835 hibernate_suspend(void)
1836 {
1837 	union hibernate_info hib_info;
1838 
1839 	/*
1840 	 * Calculate memory ranges, swap offsets, etc.
1841 	 * This also allocates a piglet whose physaddr is stored in
1842 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1843 	 */
1844 	if (get_hibernate_info(&hib_info, 1))
1845 		return (1);
1846 
1847 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE, VM_PROT_ALL);
1848 	pmap_activate(curproc);
1849 
1850 	/* Stash the piglet VA so we can free it in the resuming kernel */
1851 	global_piglet_va = hib_info.piglet_va;
1852 
1853 	if (hibernate_write_chunks(&hib_info))
1854 		return (1);
1855 
1856 	if (hibernate_write_chunktable(&hib_info))
1857 		return (1);
1858 
1859 	if (hibernate_write_signature(&hib_info))
1860 		return (1);
1861 
1862 	delay(500000);
1863 	return (0);
1864 }
1865 
1866 /*
1867  * Free items allocated by hibernate_suspend()
1868  */
1869 void
1870 hibernate_free(void)
1871 {
1872 	if (global_piglet_va)
1873 		uvm_pmr_free_piglet(global_piglet_va,
1874 		    3*HIBERNATE_CHUNK_SIZE);
1875 
1876 	if (hibernate_copy_page)
1877 		pmap_kremove(hibernate_copy_page, PAGE_SIZE);
1878 	if (hibernate_temp_page)
1879 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1880 
1881 	pmap_update(pmap_kernel());
1882 
1883 	if (hibernate_copy_page)
1884 		km_free((void *)hibernate_copy_page, PAGE_SIZE,
1885 		    &kv_any, &kp_none);
1886 	if (hibernate_temp_page)
1887 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1888 		    &kv_any, &kp_none);
1889 
1890 	global_piglet_va = 0;
1891 	hibernate_copy_page = 0;
1892 	hibernate_temp_page = 0;
1893 }
1894