1 /*	$OpenBSD: subr_hibernate.c,v 1.21 2011/11/13 23:13:29 mlarkin Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <machine/hibernate.h>
34 
35 struct hibernate_zlib_state *hibernate_state;
36 
37 /* Temporary vaddr ranges used during hibernate */
38 vaddr_t hibernate_temp_page;
39 vaddr_t hibernate_copy_page;
40 vaddr_t hibernate_stack_page;
41 vaddr_t hibernate_fchunk_area;
42 vaddr_t	hibernate_chunktable_area;
43 
44 /* Hibernate info as read from disk during resume */
45 union hibernate_info disk_hiber_info;
46 paddr_t global_pig_start;
47 vaddr_t global_piglet_va;
48 
49 /*
50  * Hib alloc enforced alignment.
51  */
52 #define HIB_ALIGN		8 /* bytes alignment */
53 
54 /*
55  * sizeof builtin operation, but with alignment constraint.
56  */
57 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
58 
59 struct hiballoc_entry {
60 	size_t			hibe_use;
61 	size_t			hibe_space;
62 	RB_ENTRY(hiballoc_entry) hibe_entry;
63 };
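
/*
 * Each allocation handed out by the arena is immediately preceded by its
 * struct hiballoc_entry header: hibe_use is the size of the allocation
 * itself and hibe_space is the amount of free space that follows it.
 * Because the header sits directly in front of the managed address,
 * comparing entry pointers (hibe_cmp below) orders entries by the
 * addresses they manage.
 */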
64 
65 /*
66  * Compare hiballoc entries based on the address they manage.
67  *
68  * Since the address is fixed, relative to struct hiballoc_entry,
69  * we just compare the hiballoc_entry pointers.
70  */
71 static __inline int
72 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
73 {
74 	return l < r ? -1 : (l > r);
75 }
76 
77 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
78 
79 /*
80  * Given a hiballoc entry, return the address it manages.
81  */
82 static __inline void *
83 hib_entry_to_addr(struct hiballoc_entry *entry)
84 {
85 	caddr_t addr;
86 
87 	addr = (caddr_t)entry;
88 	addr += HIB_SIZEOF(struct hiballoc_entry);
89 	return addr;
90 }
91 
92 /*
93  * Given an address, find the hiballoc that corresponds.
94  */
95 static __inline struct hiballoc_entry*
96 hib_addr_to_entry(void *addr_param)
97 {
98 	caddr_t addr;
99 
100 	addr = (caddr_t)addr_param;
101 	addr -= HIB_SIZEOF(struct hiballoc_entry);
102 	return (struct hiballoc_entry*)addr;
103 }
104 
105 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
106 
107 /*
108  * Allocate memory from the arena.
109  *
110  * Returns NULL if no memory is available.
111  */
112 void *
113 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
114 {
115 	struct hiballoc_entry *entry, *new_entry;
116 	size_t find_sz;
117 
118 	/*
119 	 * Enforce alignment of HIB_ALIGN bytes.
120 	 *
121 	 * Note that, because the entry is put in front of the allocation,
122 	 * 0-byte allocations are guaranteed a unique address.
123 	 */
124 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
125 
126 	/*
127 	 * Find an entry with hibe_space >= find_sz.
128 	 *
129 	 * If the root node is not large enough, we switch to tree traversal.
130 	 * Because all entries are made at the bottom of the free space,
131 	 * traversal from the end has a slightly better chance of yielding
132 	 * a sufficiently large space.
133 	 */
134 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
135 	entry = RB_ROOT(&arena->hib_addrs);
136 	if (entry != NULL && entry->hibe_space < find_sz) {
137 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
138 			if (entry->hibe_space >= find_sz)
139 				break;
140 		}
141 	}
142 
143 	/*
144 	 * Insufficient or too fragmented memory.
145 	 */
146 	if (entry == NULL)
147 		return NULL;
148 
149 	/*
150 	 * Create new entry in allocated space.
151 	 */
152 	new_entry = (struct hiballoc_entry*)(
153 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
154 	new_entry->hibe_space = entry->hibe_space - find_sz;
155 	new_entry->hibe_use = alloc_sz;
156 
157 	/*
158 	 * Insert entry.
159 	 */
160 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
161 		panic("hib_alloc: insert failure");
162 	entry->hibe_space = 0;
163 
164 	/* Return address managed by entry. */
165 	return hib_entry_to_addr(new_entry);
166 }
167 
168 /*
169  * Free a pointer previously allocated from this arena.
170  *
171  * If addr is NULL, this will be silently accepted.
172  */
173 void
174 hib_free(struct hiballoc_arena *arena, void *addr)
175 {
176 	struct hiballoc_entry *entry, *prev;
177 
178 	if (addr == NULL)
179 		return;
180 
181 	/*
182 	 * Derive entry from addr and check it is really in this arena.
183 	 */
184 	entry = hib_addr_to_entry(addr);
185 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
186 		panic("hib_free: freed item %p not in hib arena", addr);
187 
188 	/*
189 	 * Give the space in entry to its predecessor.
190 	 *
191 	 * If entry has no predecessor, change its used space into free space
192 	 * instead.
193 	 */
194 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
195 	if (prev != NULL &&
196 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
197 	    prev->hibe_use + prev->hibe_space) == entry) {
198 		/* Merge entry. */
199 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
200 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
201 		    entry->hibe_use + entry->hibe_space;
202 	} else {
203 		/* Flip used memory to free space. */
204 		entry->hibe_space += entry->hibe_use;
205 		entry->hibe_use = 0;
206 	}
207 }
208 
209 /*
210  * Initialize hiballoc.
211  *
212  * The allocator will manage memory at ptr, which is len bytes.
213  */
214 int
215 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
216 {
217 	struct hiballoc_entry *entry;
218 	caddr_t ptr;
219 	size_t len;
220 
221 	RB_INIT(&arena->hib_addrs);
222 
223 	/*
224 	 * Hib allocator enforces HIB_ALIGN alignment.
225 	 * Fixup ptr and len.
226 	 */
227 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
228 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
229 	len &= ~((size_t)HIB_ALIGN - 1);
230 
231 	/*
232 	 * Insufficient memory to be able to allocate and also do bookkeeping.
233 	 */
234 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
235 		return ENOMEM;
236 
237 	/*
238 	 * Create entry describing space.
239 	 */
240 	entry = (struct hiballoc_entry*)ptr;
241 	entry->hibe_use = 0;
242 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
243 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
244 
245 	return 0;
246 }
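
/*
 * Illustrative sketch (not part of the build): the arena is meant to be
 * layered on top of a pre-existing, unmanaged region of memory, which is
 * how hibernate_zlib_reset() below uses part of the piglet.  The names
 * scratch_va and scratch_len are hypothetical.
 *
 *	struct hiballoc_arena arena;
 *	void *buf;
 *
 *	if (hiballoc_init(&arena, (void *)scratch_va, scratch_len) == 0) {
 *		buf = hib_alloc(&arena, 4096);
 *		if (buf != NULL) {
 *			...
 *			hib_free(&arena, buf);
 *		}
 *	}
 */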
247 
248 /*
249  * Zero all free memory.
250  */
251 void
252 uvm_pmr_zero_everything(void)
253 {
254 	struct uvm_pmemrange	*pmr;
255 	struct vm_page		*pg;
256 	int			 i;
257 
258 	uvm_lock_fpageq();
259 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
260 		/* Zero single pages. */
261 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
262 		    != NULL) {
263 			uvm_pmr_remove(pmr, pg);
264 			uvm_pagezero(pg);
265 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
266 			uvmexp.zeropages++;
267 			uvm_pmr_insert(pmr, pg, 0);
268 		}
269 
270 		/* Zero multi page ranges. */
271 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
272 		    != NULL) {
273 			pg--; /* Size tree always has second page. */
274 			uvm_pmr_remove(pmr, pg);
275 			for (i = 0; i < pg->fpgsz; i++) {
276 				uvm_pagezero(&pg[i]);
277 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
278 				uvmexp.zeropages++;
279 			}
280 			uvm_pmr_insert(pmr, pg, 0);
281 		}
282 	}
283 	uvm_unlock_fpageq();
284 }
285 
286 /*
287  * Mark all memory as dirty.
288  *
289  * Used to inform the system that the clean memory isn't clean for some
290  * reason, for example because we just came back from hibernate.
291  */
292 void
293 uvm_pmr_dirty_everything(void)
294 {
295 	struct uvm_pmemrange	*pmr;
296 	struct vm_page		*pg;
297 	int			 i;
298 
299 	uvm_lock_fpageq();
300 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
301 		/* Dirty single pages. */
302 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
303 		    != NULL) {
304 			uvm_pmr_remove(pmr, pg);
305 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
306 			uvm_pmr_insert(pmr, pg, 0);
307 		}
308 
309 		/* Dirty multi page ranges. */
310 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
311 		    != NULL) {
312 			pg--; /* Size tree always has second page. */
313 			uvm_pmr_remove(pmr, pg);
314 			for (i = 0; i < pg->fpgsz; i++)
315 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
316 			uvm_pmr_insert(pmr, pg, 0);
317 		}
318 	}
319 
320 	uvmexp.zeropages = 0;
321 	uvm_unlock_fpageq();
322 }
323 
324 /*
325  * Allocate the highest address that can hold sz.
326  *
327  * sz in bytes.
328  */
329 int
330 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
331 {
332 	struct uvm_pmemrange	*pmr;
333 	struct vm_page		*pig_pg, *pg;
334 
335 	/*
336 	 * Convert sz to pages, since that is what pmemrange uses internally.
337 	 */
338 	sz = atop(round_page(sz));
339 
340 	uvm_lock_fpageq();
341 
342 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
343 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
344 			if (pig_pg->fpgsz >= sz) {
345 				goto found;
346 			}
347 		}
348 	}
349 
350 	/*
351 	 * Allocation failure.
352 	 */
353 	uvm_unlock_fpageq();
354 	return ENOMEM;
355 
356 found:
357 	/* Remove page from freelist. */
358 	uvm_pmr_remove_size(pmr, pig_pg);
359 	pig_pg->fpgsz -= sz;
360 	pg = pig_pg + pig_pg->fpgsz;
361 	if (pig_pg->fpgsz == 0)
362 		uvm_pmr_remove_addr(pmr, pig_pg);
363 	else
364 		uvm_pmr_insert_size(pmr, pig_pg);
365 
366 	uvmexp.free -= sz;
367 	*addr = VM_PAGE_TO_PHYS(pg);
368 
369 	/*
370 	 * Update pg flags.
371 	 *
372 	 * Note that we trash the sz argument now.
373 	 */
374 	while (sz > 0) {
375 		KASSERT(pg->pg_flags & PQ_FREE);
376 
377 		atomic_clearbits_int(&pg->pg_flags,
378 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
379 
380 		if (pg->pg_flags & PG_ZERO)
381 			uvmexp.zeropages -= sz;
382 		atomic_clearbits_int(&pg->pg_flags,
383 		    PG_ZERO|PQ_FREE);
384 
385 		pg->uobject = NULL;
386 		pg->uanon = NULL;
387 		pg->pg_version++;
388 
389 		/*
390 		 * Next.
391 		 */
392 		pg++;
393 		sz--;
394 	}
395 
396 	/* Return. */
397 	uvm_unlock_fpageq();
398 	return 0;
399 }
400 
401 /*
402  * Allocate a piglet area.
403  *
404  * This is as low as possible.
405  * Piglets are aligned.
406  *
407  * sz and align in bytes.
408  *
409  * The call may sleep to allow the pagedaemon to attempt to free memory.
410  * The pagedaemon may decide it is not possible to free enough memory,
411  * causing the allocation to fail.
412  */
413 int
414 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
415 {
416 	paddr_t			 pg_addr, piglet_addr;
417 	struct uvm_pmemrange	*pmr;
418 	struct vm_page		*pig_pg, *pg;
419 	struct pglist		 pageq;
420 	int			 pdaemon_woken;
421 	vaddr_t			 piglet_va;
422 
423 	KASSERT((align & (align - 1)) == 0);
424 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
425 
426 	/*
427 	 * Fixup arguments: align must be at least PAGE_SIZE,
428 	 * sz will be converted to pagecount, since that is what
429 	 * pmemrange uses internally.
430 	 */
431 	if (align < PAGE_SIZE)
432 		align = PAGE_SIZE;
433 	sz = round_page(sz);
434 
435 	uvm_lock_fpageq();
436 
437 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
438 	    pmr_use) {
439 retry:
440 		/*
441 		 * Search for a range with enough space.
442 		 * Use the address tree, to ensure the range is as low as
443 		 * possible.
444 		 */
445 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
446 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
447 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
448 
449 			if (atop(pg_addr) + pig_pg->fpgsz >=
450 			    atop(piglet_addr) + atop(sz))
451 				goto found;
452 		}
453 	}
454 
455 	/*
456 	 * Try to coerce the pagedaemon into freeing memory
457 	 * for the piglet.
458 	 *
459 	 * pdaemon_woken is set to prevent the code from
460 	 * falling into an endless loop.
461 	 */
462 	if (!pdaemon_woken) {
463 		pdaemon_woken = 1;
464 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
465 		    sz, UVM_PLA_FAILOK) == 0)
466 			goto retry;
467 	}
468 
469 	/* Return failure. */
470 	uvm_unlock_fpageq();
471 	return ENOMEM;
472 
473 found:
474 	/*
475 	 * Extract piglet from pigpen.
476 	 */
477 	TAILQ_INIT(&pageq);
478 	uvm_pmr_extract_range(pmr, pig_pg,
479 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
480 
481 	*pa = piglet_addr;
482 	uvmexp.free -= atop(sz);
483 
484 	/*
485 	 * Update pg flags.
486 	 *
487 	 * Note that we trash the sz argument now.
488 	 */
489 	TAILQ_FOREACH(pg, &pageq, pageq) {
490 		KASSERT(pg->pg_flags & PQ_FREE);
491 
492 		atomic_clearbits_int(&pg->pg_flags,
493 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
494 
495 		if (pg->pg_flags & PG_ZERO)
496 			uvmexp.zeropages--;
497 		atomic_clearbits_int(&pg->pg_flags,
498 		    PG_ZERO|PQ_FREE);
499 
500 		pg->uobject = NULL;
501 		pg->uanon = NULL;
502 		pg->pg_version++;
503 	}
504 
505 	uvm_unlock_fpageq();
506 
507 	/*
508 	 * Now allocate a va.
509 	 * Use direct mappings for the pages.
510 	 */
511 
512 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
513 	if (!piglet_va) {
514 		uvm_pglistfree(&pageq);
515 		return ENOMEM;
516 	}
517 
518 	/*
519 	 * Map piglet to va.
520 	 */
521 	TAILQ_FOREACH(pg, &pageq, pageq) {
522 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
523 		piglet_va += PAGE_SIZE;
524 	}
525 	pmap_update(pmap_kernel());
526 
527 	return 0;
528 }
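
/*
 * In this file the piglet is allocated by get_hibernate_info() as
 * 3 * HIBERNATE_CHUNK_SIZE bytes aligned to HIBERNATE_CHUNK_SIZE, and is
 * released again by hibernate_free() via uvm_pmr_free_piglet().
 */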
529 
530 /*
531  * Free a piglet area.
532  */
533 void
534 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
535 {
536 	paddr_t			 pa;
537 	struct vm_page		*pg;
538 
539 	/*
540 	 * Fix parameters.
541 	 */
542 	sz = round_page(sz);
543 
544 	/*
545 	 * Find the first page in piglet.
546 	 * Since piglets are contiguous, the first pg is all we need.
547 	 */
548 	if (!pmap_extract(pmap_kernel(), va, &pa))
549 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
550 	pg = PHYS_TO_VM_PAGE(pa);
551 	if (pg == NULL)
552 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
553 
554 	/*
555 	 * Unmap.
556 	 */
557 	pmap_kremove(va, sz);
558 	pmap_update(pmap_kernel());
559 
560 	/*
561 	 * Free the physical and virtual memory.
562 	 */
563 	uvm_pmr_freepages(pg, atop(sz));
564 	km_free((void *)va, sz, &kv_any, &kp_none);
565 }
566 
567 /*
568  * Physmem RLE compression support.
569  *
570  * Given a physical page address, it will return the number of pages
571  * starting at the address, that are free.
572  * Returns 0 if the page at addr is not free.
573  */
574 psize_t
575 uvm_page_rle(paddr_t addr)
576 {
577 	struct vm_page		*pg, *pg_end;
578 	struct vm_physseg	*vmp;
579 	int			 pseg_idx, off_idx;
580 
581 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
582 	if (pseg_idx == -1)
583 		return 0;
584 
585 	vmp = &vm_physmem[pseg_idx];
586 	pg = &vmp->pgs[off_idx];
587 	if (!(pg->pg_flags & PQ_FREE))
588 		return 0;
589 
590 	/*
591 	 * Search for the first non-free page after pg.
592 	 * Note that the page may not be the first page in a free pmemrange,
593 	 * therefore pg->fpgsz cannot be used.
594 	 */
595 	for (pg_end = pg; pg_end <= vmp->lastpg &&
596 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++);
597 	return pg_end - pg;
598 }
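
/*
 * For example, if the page at addr and the three pages after it are free
 * but the fifth page is not, uvm_page_rle(addr) returns 4.
 */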
599 
600 /*
601  * Fills out the hibernate_info union pointed to by hiber_info
602  * with information about this machine (swap signature block
603  * offsets, number of memory ranges, kernel in use, etc)
604  */
605 int
606 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
607 {
608 	int chunktable_size;
609 	struct disklabel dl;
610 	char err_string[128], *dl_ret;
611 
612 	/* Determine I/O function to use */
613 	hiber_info->io_func = get_hibernate_io_function();
614 	if (hiber_info->io_func == NULL)
615 		return (1);
616 
617 	/* Calculate hibernate device */
618 	hiber_info->device = swdevt[0].sw_dev;
619 
620 	/* Read disklabel (used to calculate signature and image offsets) */
621 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
622 
623 	if (dl_ret) {
624 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
625 		return (1);
626 	}
627 
628 	hiber_info->secsize = dl.d_secsize;
629 
630 	/* Make sure the signature can fit in one block */
631 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
632 
633 	/* Calculate swap offset from start of disk */
634 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
635 
636 	/* Calculate signature block location */
637 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
638 	    dl.d_partitions[1].p_size -
639 	    sizeof(union hibernate_info)/hiber_info->secsize;
640 
641 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
642 
643 	/* Stash kernel version information */
644 	bzero(&hiber_info->kernel_version, 128);
645 	bcopy(version, &hiber_info->kernel_version,
646 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
647 
648 	if (suspend) {
649 		/* Allocate piglet region */
650 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
651 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
652 		    HIBERNATE_CHUNK_SIZE)) {
653 			printf("Hibernate failed to allocate the piglet\n");
654 			return (1);
655 		}
656 	}
657 
658 	if (get_hibernate_info_md(hiber_info))
659 		return (1);
660 
661 	/* Calculate memory image location */
662 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
663 	    dl.d_partitions[1].p_size -
664 	    (hiber_info->image_size / hiber_info->secsize) -
665 	    sizeof(union hibernate_info)/hiber_info->secsize -
666 	    chunktable_size;
667 
668 	return (0);
669 }
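
/*
 * The offsets calculated above place everything at the very end of the
 * swap ('b') partition, in disk blocks:
 *
 *	image_offset ->	+---------------------------+
 *			| compressed memory image   |
 *			+---------------------------+
 *			| chunk table               | HIBERNATE_CHUNK_TABLE_SIZE
 *	sig_offset ->	+---------------------------+
 *			| signature block           | sizeof(union hibernate_info)
 *			+---------------------------+ <- end of partition
 */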
670 
671 /*
672  * Allocate nitems*size bytes from the hiballoc area presently in use
673  */
674 void
675 *hibernate_zlib_alloc(void *unused, int nitems, int size)
676 {
677 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
678 }
679 
680 /*
681  * Free the memory pointed to by addr in the hiballoc area presently in
682  * use
683  */
684 void
685 hibernate_zlib_free(void *unused, void *addr)
686 {
687 	hib_free(&hibernate_state->hiballoc_arena, addr);
688 }
689 
690 /*
691  * Inflate size bytes from src into dest, skipping any pages in
692  * [src..dest] that are special (see hibernate_inflate_skip)
693  *
694  * For each page of output data, we map HIBERNATE_TEMP_PAGE
695  * to the current output page, and tell inflate() to inflate
696  * its data there, resulting in the inflated data being placed
697  * at the proper paddr.
698  *
699  * This function executes while using the resume-time stack
700  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
701  * will likely hang or reset the machine.
702  */
703 void
704 hibernate_inflate(union hibernate_info *hiber_info, paddr_t dest,
705     paddr_t src, size_t size)
706 {
707 	int i;
708 
709 	hibernate_state->hib_stream.avail_in = size;
710 	hibernate_state->hib_stream.next_in = (char *)src;
711 
712 	do {
713 		/* Flush cache and TLB */
714 		hibernate_flush();
715 
716 		/*
717 		 * Is this a special page? If yes, redirect the
718 		 * inflate output to a scratch page (eg, discard it)
719 		 */
720 		if (hibernate_inflate_skip(hiber_info, dest))
721 			hibernate_enter_resume_mapping(
722 			    HIBERNATE_INFLATE_PAGE,
723 			    HIBERNATE_INFLATE_PAGE, 0);
724 		else
725 			hibernate_enter_resume_mapping(
726 			    HIBERNATE_INFLATE_PAGE, dest, 0);
727 
728 		/* Set up the stream for inflate */
729 		hibernate_state->hib_stream.avail_out = PAGE_SIZE;
730 		hibernate_state->hib_stream.next_out =
731 		    (char *)HIBERNATE_INFLATE_PAGE;
732 
733 		/* Process next block of data */
734 		i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
735 		if (i != Z_OK && i != Z_STREAM_END) {
736 			/*
737 			 * XXX - this will likely reboot/hang most machines,
738 			 *       but there's not much else we can do here.
739 			 */
740 			panic("inflate error");
741 		}
742 
743 		dest += PAGE_SIZE - hibernate_state->hib_stream.avail_out;
744 	} while (i != Z_STREAM_END);
745 }
746 
747 /*
748  * Deflate from src into the I/O page, up to 'remaining' bytes.
749  *
750  * Returns the number of input bytes consumed, and may reset
751  * the 'remaining' parameter if not all the output space was consumed
752  * (this information is needed to know how much to write to disk).
753  */
754 size_t
755 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
756     size_t *remaining)
757 {
758 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
759 
760 	/* Set up the stream for deflate */
761 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
762 	hibernate_state->hib_stream.avail_out = *remaining;
763 	hibernate_state->hib_stream.next_in = (caddr_t)src;
764 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
765 	    (PAGE_SIZE - *remaining);
766 
767 	/* Process next block of data */
768 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
769 		panic("hibernate zlib deflate error\n");
770 
771 	/* Update pointers and return number of bytes consumed */
772 	*remaining = hibernate_state->hib_stream.avail_out;
773 	return (PAGE_SIZE - (src & PAGE_MASK)) -
774 		hibernate_state->hib_stream.avail_in;
775 }
776 
777 /*
778  * Write the hibernation information specified in hiber_info
779  * to the location in swap previously calculated (last block of
780  * swap), called the "signature block".
781  *
782  * Write the memory chunk table to the area in swap immediately
783  * preceding the signature block.
784  */
785 int
786 hibernate_write_signature(union hibernate_info *hiber_info)
787 {
788 	u_int8_t *io_page;
789 	int result = 0;
790 
791 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
792 	if (!io_page)
793 		return (1);
794 
795 	/* Write hibernate info to disk */
796 	if (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
797 	    (vaddr_t)hiber_info, hiber_info->secsize, HIB_W, io_page))
798 		result = 1;
799 
800 	free(io_page, M_DEVBUF);
801 	return (result);
802 }
803 
804 /*
805  * Write the memory chunk table to the area in swap immediately
806  * preceding the signature block. The chunk table is stored
807  * in the piglet when this function is called.
808  */
809 int
810 hibernate_write_chunktable(union hibernate_info *hiber_info)
811 {
812 	struct hibernate_disk_chunk *chunks;
813 	vaddr_t hibernate_chunk_table_start;
814 	size_t hibernate_chunk_table_size;
815 	u_int8_t *io_page;
816 	daddr_t chunkbase;
817 	int i;
818 
819 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
820 	if (!io_page)
821 		return (1);
822 
823 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
824 
825 	chunkbase = hiber_info->sig_offset -
826 	    (hibernate_chunk_table_size / hiber_info->secsize);
827 
828 	hibernate_chunk_table_start = hiber_info->piglet_va +
829 	    HIBERNATE_CHUNK_SIZE;
830 
831 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
832 	    HIBERNATE_CHUNK_SIZE);
833 
834 	/* Write chunk table */
835 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
836 		if (hiber_info->io_func(hiber_info->device,
837 		    chunkbase + (i/hiber_info->secsize),
838 		    (vaddr_t)(hibernate_chunk_table_start + i),
839 		    MAXPHYS, HIB_W, io_page)) {
840 			free(io_page, M_DEVBUF);
841 			return (1);
842 		}
843 	}
844 
845 	free(io_page, M_DEVBUF);
846 	return (0);
847 }
848 
849 /*
850  * Write an empty hiber_info to the swap signature block, which is
851  * guaranteed to not match any valid hiber_info.
852  */
853 int
854 hibernate_clear_signature(void)
855 {
856 	union hibernate_info blank_hiber_info;
857 	union hibernate_info hiber_info;
858 	u_int8_t *io_page;
859 
860 	/* Zero out a blank hiber_info */
861 	bzero(&blank_hiber_info, sizeof(hiber_info));
862 
863 	if (get_hibernate_info(&hiber_info, 0))
864 		return (1);
865 
866 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
867 	if (!io_page)
868 		return (1);
869 
870 	/* Write (zeroed) hibernate info to disk */
871 	/* XXX - use regular kernel write routine for this */
872 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
873 	    (vaddr_t)&blank_hiber_info, hiber_info.secsize, HIB_W, io_page))
874 		panic("error hibernate write 6\n");
875 
876 	free(io_page, M_DEVBUF);
877 
878 	return (0);
879 }
880 
881 /*
882  * Check chunk range overlap when calculating whether or not to copy a
883  * compressed chunk to the piglet area before decompressing.
884  *
885  * Returns zero if the ranges do not overlap, non-zero otherwise.
886  */
887 int
888 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
889 {
890 	/* case A : end of r1 overlaps start of r2 */
891 	if (r1s < r2s && r1e > r2s)
892 		return (1);
893 
894 	/* case B : r1 entirely inside r2 */
895 	if (r1s >= r2s && r1e <= r2e)
896 		return (1);
897 
898 	/* case C : r2 entirely inside r1 */
899 	if (r2s >= r1s && r2e <= r1e)
900 		return (1);
901 
902 	/* case D : end of r2 overlaps start of r1 */
903 	if (r2s < r1s && r2e > r1s)
904 		return (1);
905 
906 	return (0);
907 }
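
/*
 * For example, with 4MB chunks, a chunk whose pig load area is
 * [0x1000000, 0x1400000) and whose original location is
 * [0x1200000, 0x1600000) overlaps under case A above, so
 * hibernate_read_chunks() would flag it HIBERNATE_CHUNK_CONFLICT.
 */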
908 
909 /*
910  * Compare two hibernate_infos to determine if they are the same (e.g.,
911  * whether we should be performing a hibernate resume on this machine).
912  * Not all fields are checked - just enough to verify that the machine
913  * has the same memory configuration and kernel as the one that
914  * wrote the signature previously.
915  */
916 int
917 hibernate_compare_signature(union hibernate_info *mine,
918     union hibernate_info *disk)
919 {
920 	u_int i;
921 
922 	if (mine->nranges != disk->nranges)
923 		return (1);
924 
925 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
926 		return (1);
927 
928 	for (i = 0; i < mine->nranges; i++) {
929 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
930 		    (mine->ranges[i].end != disk->ranges[i].end) )
931 			return (1);
932 	}
933 
934 	return (0);
935 }
936 
937 /*
938  * Reads read_size bytes from the hibernate device specified in
939  * hib_info at offset blkctr. Output is placed into the vaddr specified
940  * at dest.
941  *
942  * Separate offsets and pages are used to handle misaligned reads (reads
943  * that span a page boundary).
944  *
945  * blkctr specifies a relative offset (relative to the start of swap),
946  * not an absolute disk offset.
948  */
949 int
950 hibernate_read_block(union hibernate_info *hib_info, daddr_t blkctr,
951     size_t read_size, vaddr_t dest)
952 {
953 	struct buf *bp;
954 	struct bdevsw *bdsw;
955 	int error;
956 
957 	bp = geteblk(read_size);
958 	bdsw = &bdevsw[major(hib_info->device)];
959 
960 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
961 	if (error) {
962 		printf("hibernate_read_block open failed\n");
963 		return (1);
964 	}
965 
966 	bp->b_bcount = read_size;
967 	bp->b_blkno = blkctr;
968 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
969 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
970 	bp->b_dev = hib_info->device;
971 	bp->b_cylinder = 0;
972 	(*bdsw->d_strategy)(bp);
973 
974 	error = biowait(bp);
975 	if (error) {
976 		printf("hibernate_read_block biowait failed %d\n", error);
977 		error = (*bdsw->d_close)(hib_info->device, 0, S_IFCHR,
978 		    curproc);
979 		if (error)
980 			printf("hibernate_read_block error close failed\n");
981 		return (1);
982 	}
983 
984 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
985 	if (error) {
986 		printf("hibernate_read_block close failed\n");
987 		return (1);
988 	}
989 
990 	bcopy(bp->b_data, (caddr_t)dest, read_size);
991 
992 	bp->b_flags |= B_INVAL;
993 	brelse(bp);
994 
995 	return (0);
996 }
997 
998 /*
999  * Reads the signature block from swap, checks against the current machine's
1000  * information. If the information matches, perform a resume by reading the
1001  * saved image into the pig area, and unpacking.
1002  */
1003 void
1004 hibernate_resume(void)
1005 {
1006 	union hibernate_info hiber_info;
1007 	u_int8_t *io_page;
1008 	int s;
1009 
1010 	/* Scrub temporary vaddr ranges used during resume */
1011 	hibernate_temp_page = (vaddr_t)NULL;
1012 	hibernate_fchunk_area = (vaddr_t)NULL;
1013 	hibernate_chunktable_area = (vaddr_t)NULL;
1014 	hibernate_stack_page = (vaddr_t)NULL;
1015 
1016 	/* Get current running machine's hibernate info */
1017 	bzero(&hiber_info, sizeof(hiber_info));
1018 	if (get_hibernate_info(&hiber_info, 0))
1019 		return;
1020 
1021 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
1022 	if (!io_page)
1023 		return;
1024 
1025 	/* Read hibernate info from disk */
1026 	s = splbio();
1027 
1028 	/* XXX use regular kernel read routine here */
1029 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
1030 	    (vaddr_t)&disk_hiber_info, hiber_info.secsize, HIB_R, io_page))
1031 		panic("error in hibernate read\n");
1032 
1033 	free(io_page, M_DEVBUF);
1034 
1035 	/*
1036 	 * If the on-disk and in-memory hibernate signatures don't match,
1037 	 * there is no hibernated image to resume; bail out.
1038 	 */
1039 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info))
1040 		return;
1041 
1042 	/*
1043 	 * Allocate several regions of vaddrs for use during read.
1044 	 * These mappings go into the resuming kernel's page table, and are
1045 	 * used only during image read.
1046 	 */
1047 	hibernate_temp_page = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any,
1048 	    &kp_none, &kd_nowait);
1049 	if (!hibernate_temp_page)
1050 		goto fail;
1051 
1052 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1053 	    &kp_none, &kd_nowait);
1054 	if (!hibernate_fchunk_area)
1055 		goto fail;
1056 
1057 	/* Allocate a temporary chunktable area */
1058 	hibernate_chunktable_area = (vaddr_t)malloc(HIBERNATE_CHUNK_TABLE_SIZE,
1059 					   M_DEVBUF, M_NOWAIT);
1060 	if (!hibernate_chunktable_area)
1061 		goto fail;
1062 
1063 	/* Allocate one temporary page of VAs for the resume time stack */
1064 	hibernate_stack_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1065 	    &kp_none, &kd_nowait);
1066 	if (!hibernate_stack_page)
1067 		goto fail;
1068 
1069 	/* Read the image from disk into the image (pig) area */
1070 	if (hibernate_read_image(&disk_hiber_info))
1071 		goto fail;
1072 
1073 	/* Point of no return ... */
1074 
1075 	disable_intr();
1076 	cold = 1;
1077 
1078 	/* Switch stacks */
1079 	hibernate_switch_stack_machdep();
1080 
1081 	/*
1082 	 * Image is now in high memory (pig area), copy to correct location
1083 	 * in memory. We'll eventually end up copying on top of ourself, but
1084 	 * we are assured the kernel code here is the same between the
1085 	 * hibernated and resuming kernel, and we are running on our own
1086 	 * stack, so the overwrite is ok.
1087 	 */
1088 	hibernate_unpack_image(&disk_hiber_info);
1089 
1090 	/*
1091 	 * Resume the loaded kernel by jumping to the MD resume vector.
1092 	 * We won't be returning from this call.
1093 	 */
1094 	hibernate_resume_machdep();
1095 
1096 fail:
1097 	printf("Unable to resume hibernated image\n");
1098 
1099 	if (hibernate_temp_page)
1100 		km_free((void *)hibernate_temp_page, 2*PAGE_SIZE, &kv_any,
1101 		    &kp_none);
1102 
1103 	if (hibernate_fchunk_area)
1104 		km_free((void *)hibernate_fchunk_area, 3*PAGE_SIZE, &kv_any,
1105 		    &kp_none);
1109 
1110 	if (hibernate_chunktable_area)
1111 		free((void *)hibernate_chunktable_area, M_DEVBUF);
1112 }
1113 
1114 /*
1115  * Unpack image from pig area to original location by looping through the
1116  * list of output chunks in the order they should be restored (fchunks).
1117  * This ordering is used to avoid having inflate overwrite a chunk in the
1118  * middle of processing that chunk. This will, of course, happen during the
1119  * final output chunk, where we copy the chunk to the piglet area first,
1120  * before inflating.
1121  */
1122 void
1123 hibernate_unpack_image(union hibernate_info *hiber_info)
1124 {
1125 	struct hibernate_disk_chunk *chunks;
1126 	union hibernate_info local_hiber_info;
1127 	paddr_t image_cur = global_pig_start;
1128 	vaddr_t tempva;
1129 	int *fchunks, i;
1130 	char *pva = (char *)hiber_info->piglet_va;
1131 
1132 	/* Mask off based on arch-specific piglet page size */
1133 	pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1134 	fchunks = (int *)(pva + (6 * PAGE_SIZE));
1135 
1136 	/* Copy temporary chunktable to piglet */
1137 	tempva = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1138 	    &kp_none, &kd_nowait);
1139 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1140 		pmap_kenter_pa(tempva + i, hiber_info->piglet_pa +
1141 		    HIBERNATE_CHUNK_SIZE + i, VM_PROT_ALL);
1142 
1143 	bcopy((caddr_t)hibernate_chunktable_area, (caddr_t)tempva,
1144 	    HIBERNATE_CHUNK_TABLE_SIZE);
1145 
1146 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1147 
1148 	/* Can't use hiber_info that's passed in after here */
1149 	bcopy(hiber_info, &local_hiber_info, sizeof(union hibernate_info));
1150 
1151 	hibernate_activate_resume_pt_machdep();
1152 
1153 	for (i = 0; i < local_hiber_info.chunk_ctr; i++) {
1154 		/* Reset zlib for inflate */
1155 		if (hibernate_zlib_reset(&local_hiber_info, 0) != Z_OK)
1156 			panic("hibernate failed to reset zlib for inflate\n");
1157 
1158 		/*
1159 		 * If there is a conflict, copy the chunk to the piglet area
1160 		 * before unpacking it to its original location.
1161 		 */
1162 		if ((chunks[fchunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1163 			hibernate_inflate(&local_hiber_info,
1164 			    chunks[fchunks[i]].base, image_cur,
1165 			    chunks[fchunks[i]].compressed_size);
1166 		else {
1167 			bcopy((caddr_t)image_cur,
1168 			    pva + (HIBERNATE_CHUNK_SIZE * 2),
1169 			    chunks[fchunks[i]].compressed_size);
1170 			hibernate_inflate(&local_hiber_info,
1171 			    chunks[fchunks[i]].base,
1172 			    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1173 			    chunks[fchunks[i]].compressed_size);
1174 		}
1175 		image_cur += chunks[fchunks[i]].compressed_size;
1176 	}
1177 }
1178 
1179 /*
1180  * Write a compressed version of this machine's memory to disk, at the
1181  * precalculated swap offset:
1182  *
1183  * end of swap - signature block size - chunk table size - memory size
1184  *
1185  * The function begins by looping through each phys mem range, cutting each
1186  * one into 4MB chunks. These chunks are then compressed individually
1187  * and written out to disk, in phys mem order. Some chunks might compress
1188  * more than others, and for this reason, each chunk's size is recorded
1189  * in the chunk table, which is written to disk after the image has
1190  * properly been compressed and written (in hibernate_write_chunktable).
1191  *
1192  * When this function is called, the machine is nearly suspended - most
1193  * devices are quiesced/suspended, interrupts are off, and cold has
1194  * been set. This means that there can be no side effects once the
1195  * write has started, and the write function itself can also have no
1196  * side effects.
1197  *
1198  * This function uses the piglet area during this process as follows:
1199  *
1200  * offset from piglet base	use
1201  * -----------------------	--------------------
1202  * 0				i/o allocation area
1203  * PAGE_SIZE			i/o write area
1204  * 2*PAGE_SIZE			temp/scratch page
1205  * 3*PAGE_SIZE			temp/scratch page
1206  * 4*PAGE_SIZE			hiballoc arena
1207  * 5*PAGE_SIZE to 85*PAGE_SIZE	zlib deflate area
1208  * ...
1209  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1210  *
1211  * Some transient piglet content is saved as part of deflate,
1212  * but it is irrelevant during resume as it will be repurposed
1213  * at that time for other things.
1214  */
1215 int
1216 hibernate_write_chunks(union hibernate_info *hiber_info)
1217 {
1218 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1219 	size_t nblocks, out_remaining, used, offset = 0;
1220 	struct hibernate_disk_chunk *chunks;
1221 	vaddr_t hibernate_alloc_page = hiber_info->piglet_va;
1222 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1223 	daddr_t blkctr = hiber_info->image_offset;
1224 	int i;
1225 
1226 	hiber_info->chunk_ctr = 0;
1227 
1228 	/*
1229 	 * Allocate VA for the temp and copy page.
1230 	 */
1231 
1232 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1233 	    &kp_none, &kd_nowait);
1234 	if (!hibernate_temp_page)
1235 		return (1);
1236 
1237 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1238 	    &kp_none, &kd_nowait);
1239 	if (!hibernate_copy_page)
1240 		return (1);
1241 
1242 	pmap_kenter_pa(hibernate_copy_page,
1243 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1244 
1245 	/* XXX - not needed on all archs */
1246 	pmap_activate(curproc);
1247 
1248 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1249 	    HIBERNATE_CHUNK_SIZE);
1250 
1251 	/* Calculate the chunk regions */
1252 	for (i = 0; i < hiber_info->nranges; i++) {
1253 		range_base = hiber_info->ranges[i].base;
1254 		range_end = hiber_info->ranges[i].end;
1255 
1256 		inaddr = range_base;
1257 
1258 		while (inaddr < range_end) {
1259 			chunks[hiber_info->chunk_ctr].base = inaddr;
1260 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1261 				chunks[hiber_info->chunk_ctr].end = inaddr +
1262 				    HIBERNATE_CHUNK_SIZE;
1263 			else
1264 				chunks[hiber_info->chunk_ctr].end = range_end;
1265 
1266 			inaddr += HIBERNATE_CHUNK_SIZE;
1267 			hiber_info->chunk_ctr ++;
1268 		}
1269 	}
1270 
1271 	/* Compress and write the chunks in the chunktable */
1272 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1273 		range_base = chunks[i].base;
1274 		range_end = chunks[i].end;
1275 
1276 		chunks[i].offset = blkctr;
1277 
1278 		/* Reset zlib for deflate */
1279 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1280 			return (1);
1281 
1282 		inaddr = range_base;
1283 
1284 		/*
1285 		 * For each range, loop through its phys mem region
1286 		 * and write out the chunks (the last chunk might be
1287 		 * smaller than the chunk size).
1288 		 */
1289 		while (inaddr < range_end) {
1290 			out_remaining = PAGE_SIZE;
1291 			while (out_remaining > 0 && inaddr < range_end) {
1292 				pmap_kenter_pa(hibernate_temp_page,
1293 				    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1294 
1295 				/* XXX - not needed on all archs */
1296 				pmap_activate(curproc);
1297 
1298 				bcopy((caddr_t)hibernate_temp_page,
1299 				    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1300 
1301 				/*
1302 				 * Adjust for regions that are not evenly
1303 				 * divisible by PAGE_SIZE
1304 				 */
1305 				temp_inaddr = (inaddr & PAGE_MASK) +
1306 				    hibernate_copy_page;
1307 
1308 				/* Deflate from temp_inaddr to IO page */
1309 				inaddr += hibernate_deflate(hiber_info,
1310 				    temp_inaddr, &out_remaining);
1311 			}
1312 
1313 			if (out_remaining == 0) {
1314 				/* Filled up the page */
1315 				nblocks = PAGE_SIZE / hiber_info->secsize;
1316 
1317 				if (hiber_info->io_func(hiber_info->device,
1318 				    blkctr, (vaddr_t)hibernate_io_page,
1319 				    PAGE_SIZE, HIB_W, (void *)hibernate_alloc_page))
1320 					return (1);
1321 
1322 				blkctr += nblocks;
1323 			}
1324 		}
1325 
1326 		if (inaddr != range_end)
1327 			return (1);
1328 
1329 		/*
1330 		 * End of range. Round up to next secsize bytes
1331 		 * after finishing compress
1332 		 */
1333 		if (out_remaining == 0)
1334 			out_remaining = PAGE_SIZE;
1335 
1336 		/* Finish compress */
1337 		hibernate_state->hib_stream.avail_in = 0;
1338 		hibernate_state->hib_stream.avail_out = out_remaining;
1339 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1340 		hibernate_state->hib_stream.next_out =
1341 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1342 
1343 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1344 		    Z_STREAM_END)
1345 			return (1);
1346 
1347 		out_remaining = hibernate_state->hib_stream.avail_out;
1348 
1349 		used = PAGE_SIZE - out_remaining;
1350 		nblocks = used / hiber_info->secsize;
1351 
1352 		/* Round up to next block if needed */
1353 		if (used % hiber_info->secsize != 0)
1354 			nblocks ++;
1355 
1356 		/* Write final block(s) for this chunk */
1357 		if (hiber_info->io_func(hiber_info->device, blkctr,
1358 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1359 		    HIB_W, (void *)hibernate_alloc_page))
1360 			return (1);
1361 
1362 		blkctr += nblocks;
1363 
1364 		offset = blkctr;
1365 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1366 		    hiber_info->secsize;
1367 	}
1368 
1369 	return (0);
1370 }
1371 
1372 /*
1373  * Reset the zlib stream state and allocate a new hiballoc area for either
1374  * inflate or deflate. This function is called once for each hibernate chunk.
1375  * Calling hiballoc_init multiple times is acceptable since the memory it is
1376  * provided is unmanaged memory (stolen). We use the memory provided to us
1377  * by the piglet allocated via the supplied hiber_info.
1378  */
1379 int
1380 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1381 {
1382 	vaddr_t hibernate_zlib_start;
1383 	size_t hibernate_zlib_size;
1384 	char *pva = (char *)hiber_info->piglet_va;
1385 
1386 	hibernate_state = (struct hibernate_zlib_state *)
1387 	    (pva + (7 * PAGE_SIZE));
1388 
1389 	hibernate_zlib_start = (vaddr_t)(pva + (8 * PAGE_SIZE));
1390 	hibernate_zlib_size = 80 * PAGE_SIZE;
1391 
1392 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1393 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1394 
1395 	/* Set up stream structure */
1396 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1397 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1398 
1399 	/* Initialize the hiballoc arena for zlib allocs/frees */
1400 	hiballoc_init(&hibernate_state->hiballoc_arena,
1401 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1402 
1403 	if (deflate) {
1404 		return deflateInit(&hibernate_state->hib_stream,
1405 		    Z_DEFAULT_COMPRESSION);
1406 	} else
1407 		return inflateInit(&hibernate_state->hib_stream);
1408 }
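
/*
 * The scratch state set up above lives inside the piglet: the
 * hibernate_zlib_state page sits at piglet offset 7 * PAGE_SIZE and the
 * 80 pages starting at offset 8 * PAGE_SIZE back the hiballoc arena that
 * satisfies zlib's allocations, matching the layout documented before
 * hibernate_read_chunks().
 */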
1409 
1410 /*
1411  * Reads the hibernated memory image from disk, whose location and
1412  * size are recorded in hiber_info. Begin by reading the persisted
1413  * chunk table, which records the original chunk placement location
1414  * and compressed size for each. Next, allocate a pig region of
1415  * sufficient size to hold the compressed image. Next, read the
1416  * chunks into the pig area (calling hibernate_read_chunks to do this),
1417  * and finally, if all of the above succeeds, clear the hibernate signature.
1418  * The function will then return to hibernate_resume, which will proceed
1419  * to unpack the pig image to the correct place in memory.
1420  */
1421 int
1422 hibernate_read_image(union hibernate_info *hiber_info)
1423 {
1424 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1425 	paddr_t image_start, image_end, pig_start, pig_end;
1426 	struct hibernate_disk_chunk *chunks;
1427 	daddr_t blkctr;
1428 	int i;
1429 
1430 	/* Calculate total chunk table size in disk blocks */
1431 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1432 
1433 	blkctr = hiber_info->sig_offset - chunktable_size -
1434 			hiber_info->swap_offset;
1435 
1436 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1437 	    i += MAXPHYS, blkctr += MAXPHYS/hiber_info->secsize)
1438 		hibernate_read_block(hiber_info, blkctr, MAXPHYS,
1439 		    hibernate_chunktable_area + i);
1440 
1441 	blkctr = hiber_info->image_offset;
1442 	compressed_size = 0;
1443 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1444 
1445 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1446 		compressed_size += chunks[i].compressed_size;
1447 
1448 	disk_size = compressed_size;
1449 
1450 	/* Allocate the pig area */
1451 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1452 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1453 		return (1);
1454 
1455 	pig_end = pig_start + pig_sz;
1456 
1457 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1458 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1459 	image_start = pig_start;
1460 
1461 	image_start = image_end - disk_size;
1462 
1463 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size);
1464 
1465 	/* Prepare the resume time pmap/page table */
1466 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1467 
1468 	/* Read complete, clear the signature and return */
1469 	return hibernate_clear_signature();
1470 }
1471 
1472 /*
1473  * Read the hibernated memory chunks from disk (chunk information at this
1474  * point is stored in the piglet) into the pig area specified by
1475  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1476  * only chunk with overlap possibilities.
1477  *
1478  * This function uses the piglet area during this process as follows:
1479  *
1480  * offset from piglet base	use
1481  * -----------------------	--------------------
1482  * 0				i/o allocation area
1483  * PAGE_SIZE			i/o write area
1484  * 2*PAGE_SIZE			temp/scratch page
1485  * 3*PAGE_SIZE			temp/scratch page
1486  * 4*PAGE_SIZE to 6*PAGE_SIZE	chunk ordering area
1487  * 7*PAGE_SIZE			hiballoc arena
1488  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1489  * ...
1490  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1491  */
1492 int
1493 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1494     paddr_t pig_end, size_t image_compr_size)
1495 {
1496 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1497 	paddr_t copy_start, copy_end, piglet_cur;
1498 	paddr_t piglet_base = hib_info->piglet_pa;
1499 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1500 	daddr_t blkctr;
1501 	size_t processed, compressed_size, read_size;
1502 	int i, j, overlap, found, nchunks;
1503 	int nochunks = 0, nfchunks = 0, npchunks = 0;
1504 	struct hibernate_disk_chunk *chunks;
1505 	int *ochunks, *pchunks, *fchunks;
1506 
1507 	global_pig_start = pig_start;
1508 
1509 	/* XXX - don't need this on all archs */
1510 	pmap_activate(curproc);
1511 
1512 	/* Temporary output chunk ordering */
1513 	ochunks = (int *)hibernate_fchunk_area;
1514 
1515 	/* Piglet chunk ordering */
1516 	pchunks = (int *)(hibernate_fchunk_area + PAGE_SIZE);
1517 
1518 	/* Final chunk ordering */
1519 	fchunks = (int *)(hibernate_fchunk_area + (2*PAGE_SIZE));
1520 
1521 	/* Map the chunk ordering region */
1522 	pmap_kenter_pa(hibernate_fchunk_area,
1523 	    piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1524 	pmap_kenter_pa((vaddr_t)pchunks, piglet_base + (5*PAGE_SIZE),
1525 	    VM_PROT_ALL);
1526 	pmap_kenter_pa((vaddr_t)fchunks, piglet_base + (6*PAGE_SIZE),
1527 	    VM_PROT_ALL);
1528 
1529 	nchunks = hib_info->chunk_ctr;
1530 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1531 
1532 	/* Initially start all chunks as unplaced */
1533 	for (i = 0; i < nchunks; i++)
1534 		chunks[i].flags = 0;
1535 
1536 	/*
1537 	 * Search the list for chunks that are outside the pig area. These
1538 	 * can be placed first in the final output list.
1539 	 */
1540 	for (i = 0; i < nchunks; i++) {
1541 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1542 			ochunks[nochunks] = (u_int8_t)i;
1543 			fchunks[nfchunks] = (u_int8_t)i;
1544 			nochunks++;
1545 			nfchunks++;
1546 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1547 		}
1548 	}
1549 
1550 	/*
1551 	 * Walk the ordering, place the chunks in ascending memory order.
1552 	 * Conflicts might arise, these are handled next.
1553 	 */
1554 	do {
1555 		img_index = -1;
1556 		found = 0;
1557 		j = -1;
1558 		for (i = 0; i < nchunks; i++)
1559 			if (chunks[i].base < img_index &&
1560 			    chunks[i].flags == 0 ) {
1561 				j = i;
1562 				img_index = chunks[i].base;
1563 			}
1564 
1565 		if (j != -1) {
1566 			found = 1;
1567 			ochunks[nochunks] = (short)j;
1568 			nochunks++;
1569 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1570 		}
1571 	} while (found);
1572 
1573 	img_index = pig_start;
1574 
1575 	/*
1576 	 * Identify chunk output conflicts (chunks whose pig load area
1577 	 * corresponds to their original memory placement location)
1578 	 */
1579 	for (i = 0; i < nochunks ; i++) {
1580 		overlap = 0;
1581 		r1s = img_index;
1582 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1583 		r2s = chunks[ochunks[i]].base;
1584 		r2e = chunks[ochunks[i]].end;
1585 
1586 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1587 		if (overlap)
1588 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1589 		img_index += chunks[ochunks[i]].compressed_size;
1590 	}
1591 
1592 	/*
1593 	 * Prepare the final output chunk list. Calculate an output
1594 	 * inflate strategy for overlapping chunks if needed.
1595 	 */
1596 	img_index = pig_start;
1597 	for (i = 0; i < nochunks ; i++) {
1598 		/*
1599 		 * If a conflict is detected, consume enough compressed
1600 		 * output chunks to fill the piglet
1601 		 */
1602 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1603 			copy_start = piglet_base;
1604 			copy_end = piglet_end;
1605 			piglet_cur = piglet_base;
1606 			npchunks = 0;
1607 			j = i;
1608 			while (copy_start < copy_end && j < nochunks) {
1609 				piglet_cur += chunks[ochunks[j]].compressed_size;
1610 				pchunks[npchunks] = ochunks[j];
1611 				npchunks++;
1612 				copy_start += chunks[ochunks[j]].compressed_size;
1613 				img_index += chunks[ochunks[j]].compressed_size;
1614 				i++;
1615 				j++;
1616 			}
1617 
1618 			piglet_cur = piglet_base;
1619 			for (j = 0; j < npchunks; j++) {
1620 				piglet_cur += chunks[pchunks[j]].compressed_size;
1621 				fchunks[nfchunks] = pchunks[j];
1622 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1623 				nfchunks++;
1624 			}
1625 		} else {
1626 			/*
1627 			 * No conflict, chunk can be added without copying
1628 			 */
1629 			if ((chunks[ochunks[i]].flags &
1630 			    HIBERNATE_CHUNK_USED) == 0) {
1631 				fchunks[nfchunks] = ochunks[i];
1632 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1633 				nfchunks++;
1634 			}
1635 			img_index += chunks[ochunks[i]].compressed_size;
1636 		}
1637 	}
1638 
1639 	img_index = pig_start;
1640 	for (i = 0; i < nfchunks; i++) {
1641 		piglet_cur = piglet_base;
1642 		img_index += chunks[fchunks[i]].compressed_size;
1643 	}
1644 
1645 	img_cur = pig_start;
1646 
1647 	for (i = 0; i < nfchunks; i++) {
1648 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1649 		processed = 0;
1650 		compressed_size = chunks[fchunks[i]].compressed_size;
1651 
1652 		while (processed < compressed_size) {
1653 			pmap_kenter_pa(hibernate_temp_page, img_cur,
1654 			    VM_PROT_ALL);
1655 			pmap_kenter_pa(hibernate_temp_page + PAGE_SIZE,
1656 			    img_cur+PAGE_SIZE, VM_PROT_ALL);
1657 
1658 			/* XXX - not needed on all archs */
1659 			pmap_activate(curproc);
1660 			if (compressed_size - processed >= PAGE_SIZE)
1661 				read_size = PAGE_SIZE;
1662 			else
1663 				read_size = compressed_size - processed;
1664 
1665 			hibernate_read_block(hib_info, blkctr, read_size,
1666 			    hibernate_temp_page + (img_cur & PAGE_MASK));
1667 
1668 			blkctr += (read_size / hib_info->secsize);
1669 
1670 			hibernate_flush();
1671 			pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1672 			pmap_kremove(hibernate_temp_page + PAGE_SIZE,
1673 			    PAGE_SIZE);
1674 			processed += read_size;
1675 			img_cur += read_size;
1676 		}
1677 	}
1678 
1679 	return (0);
1680 }
1681 
1682 /*
1683  * Hibernating a machine comprises the following operations:
1684  *  1. Calculating this machine's hibernate_info information
1685  *  2. Allocating a piglet and saving the piglet's physaddr
1686  *  3. Calculating the memory chunks
1687  *  4. Writing the compressed chunks to disk
1688  *  5. Writing the chunk table
1689  *  6. Writing the signature block (hibernate_info)
1690  *
1691  * On most architectures, the function calling hibernate_suspend would
1692  * then power off the machine using some MD-specific implementation.
1693  */
1694 int
1695 hibernate_suspend(void)
1696 {
1697 	union hibernate_info hib_info;
1698 
1699 	/*
1700 	 * Calculate memory ranges, swap offsets, etc.
1701 	 * This also allocates a piglet whose physaddr is stored in
1702 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1703 	 */
1704 	if (get_hibernate_info(&hib_info, 1))
1705 		return (1);
1706 
1707 	global_piglet_va = hib_info.piglet_va;
1708 
1709 	/* XXX - Won't need to zero everything with RLE */
1710 	uvm_pmr_zero_everything();
1711 
1712 	if (hibernate_write_chunks(&hib_info))
1713 		return (1);
1714 
1715 	if (hibernate_write_chunktable(&hib_info))
1716 		return (1);
1717 
1718 	if (hibernate_write_signature(&hib_info))
1719 		return (1);
1720 
1721 	delay(500000);
1722 	return (0);
1723 }
1724 
1725 /*
1726  * Free items allocated during hibernate
1727  */
1728 void
1729 hibernate_free(void)
1730 {
1731 	uvm_pmr_free_piglet(global_piglet_va, 3*HIBERNATE_CHUNK_SIZE);
1732 
1733 	pmap_kremove(hibernate_copy_page, PAGE_SIZE);
1734 	pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1735 	pmap_update(pmap_kernel());
1736 
1737 	km_free((void *)hibernate_fchunk_area, 3*PAGE_SIZE, &kv_any, &kp_none);
1738 	km_free((void *)hibernate_copy_page, PAGE_SIZE, &kv_any, &kp_none);
1739 	km_free((void *)hibernate_temp_page, PAGE_SIZE, &kv_any, &kp_none);
1740 }
1741