1 /*	$OpenBSD: subr_hibernate.c,v 1.17 2011/09/21 06:13:39 mlarkin Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <uvm/uvm.h>
32 #include <machine/hibernate.h>
33 
34 extern char *disk_readlabel(struct disklabel *, dev_t, char *, size_t);
35 
36 struct hibernate_zlib_state *hibernate_state;
37 
38 /* Temporary vaddr ranges used during hibernate */
39 vaddr_t hibernate_temp_page;
40 vaddr_t hibernate_copy_page;
41 vaddr_t hibernate_stack_page;
42 vaddr_t hibernate_fchunk_area;
43 vaddr_t hibernate_chunktable_area;
44 vaddr_t hibernate_inflate_page;
45 
46 /* Hibernate info as read from disk during resume */
47 union hibernate_info disk_hiber_info;
48 
49 /*
50  * Hib alloc enforced alignment.
51  */
52 #define HIB_ALIGN		8 /* bytes alignment */
53 
54 /*
55  * sizeof builtin operation, but with alignment constraint.
56  */
57 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
58 
59 struct hiballoc_entry
60 {
61 	size_t			hibe_use;
62 	size_t			hibe_space;
63 	RB_ENTRY(hiballoc_entry) hibe_entry;
64 };
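
/*
 * Each hiballoc_entry sits directly in front of the memory it manages:
 *
 *	[ hiballoc_entry ][ hibe_use bytes in use ][ hibe_space bytes free ]
 *
 * hib_alloc() carves a new entry out of the free tail of an existing
 * entry; hib_free() merges an entry back into its predecessor, or turns
 * its used space back into free space if it has no adjacent predecessor.
 */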
65 
66 /*
67  * Compare hiballoc entries based on the address they manage.
68  *
69  * Since the address is fixed, relative to struct hiballoc_entry,
70  * we just compare the hiballoc_entry pointers.
71  */
72 static __inline int
73 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
74 {
75 	return l < r ? -1 : (l > r);
76 }
77 
78 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
79 
80 /*
81  * Given a hiballoc entry, return the address it manages.
82  */
83 static __inline void*
84 hib_entry_to_addr(struct hiballoc_entry *entry)
85 {
86 	caddr_t addr;
87 
88 	addr = (caddr_t)entry;
89 	addr += HIB_SIZEOF(struct hiballoc_entry);
90 	return addr;
91 }
92 
93 /*
94  * Given an address, find the hiballoc entry that corresponds.
95  */
96 static __inline struct hiballoc_entry*
97 hib_addr_to_entry(void* addr_param)
98 {
99 	caddr_t addr;
100 
101 	addr = (caddr_t)addr_param;
102 	addr -= HIB_SIZEOF(struct hiballoc_entry);
103 	return (struct hiballoc_entry*)addr;
104 }
105 
106 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
107 
108 /*
109  * Allocate memory from the arena.
110  *
111  * Returns NULL if no memory is available.
112  */
113 void*
114 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
115 {
116 	struct hiballoc_entry *entry, *new_entry;
117 	size_t find_sz;
118 
119 	/*
120 	 * Enforce alignment of HIB_ALIGN bytes.
121 	 *
122 	 * Note that, because the entry is put in front of the allocation,
123 	 * 0-byte allocations are guaranteed a unique address.
124 	 */
125 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
126 
127 	/*
128 	 * Find an entry with hibe_space >= find_sz.
129 	 *
130 	 * If the root node is not large enough, we switch to tree traversal.
131 	 * Because all entries are made at the bottom of the free space,
132 	 * traversal from the end has a slightly better chance of yielding
133 	 * a sufficiently large space.
134 	 */
135 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
136 	entry = RB_ROOT(&arena->hib_addrs);
137 	if (entry != NULL && entry->hibe_space < find_sz) {
138 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
139 			if (entry->hibe_space >= find_sz)
140 				break;
141 		}
142 	}
143 
144 	/*
145 	 * Insufficient or too fragmented memory.
146 	 */
147 	if (entry == NULL)
148 		return NULL;
149 
150 	/*
151 	 * Create new entry in allocated space.
152 	 */
153 	new_entry = (struct hiballoc_entry*)(
154 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
155 	new_entry->hibe_space = entry->hibe_space - find_sz;
156 	new_entry->hibe_use = alloc_sz;
157 
158 	/*
159 	 * Insert entry.
160 	 */
161 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
162 		panic("hib_alloc: insert failure");
163 	entry->hibe_space = 0;
164 
165 	/* Return address managed by entry. */
166 	return hib_entry_to_addr(new_entry);
167 }
168 
169 /*
170  * Free a pointer previously allocated from this arena.
171  *
172  * If addr is NULL, this will be silently accepted.
173  */
174 void
175 hib_free(struct hiballoc_arena *arena, void *addr)
176 {
177 	struct hiballoc_entry *entry, *prev;
178 
179 	if (addr == NULL)
180 		return;
181 
182 	/*
183 	 * Derive entry from addr and check it is really in this arena.
184 	 */
185 	entry = hib_addr_to_entry(addr);
186 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
187 		panic("hib_free: freed item %p not in hib arena", addr);
188 
189 	/*
190 	 * Give the space in entry to its predecessor.
191 	 *
192 	 * If entry has no predecessor, change its used space into free space
193 	 * instead.
194 	 */
195 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
196 	if (prev != NULL &&
197 	    (void*)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
198 	    prev->hibe_use + prev->hibe_space) == entry) {
199 		/* Merge entry. */
200 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
201 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
202 		    entry->hibe_use + entry->hibe_space;
203 	} else {
204 	  	/* Flip used memory to free space. */
205 		entry->hibe_space += entry->hibe_use;
206 		entry->hibe_use = 0;
207 	}
208 }
209 
210 /*
211  * Initialize hiballoc.
212  *
213  * The allocator will manage memory at ptr, which is len bytes.
214  */
215 int
216 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
217 {
218 	struct hiballoc_entry *entry;
219 	caddr_t ptr;
220 	size_t len;
221 
222 	RB_INIT(&arena->hib_addrs);
223 
224 	/*
225 	 * Hib allocator enforces HIB_ALIGN alignment.
226 	 * Fixup ptr and len.
227 	 */
228 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
229 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
230 	len &= ~((size_t)HIB_ALIGN - 1);
231 
232 	/*
233 	 * Insufficient memory to be able to allocate and also do bookkeeping.
234 	 */
235 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
236 		return ENOMEM;
237 
238 	/*
239 	 * Create entry describing space.
240 	 */
241 	entry = (struct hiballoc_entry*)ptr;
242 	entry->hibe_use = 0;
243 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
244 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
245 
246 	return 0;
247 }
248 
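/*
 * Illustrative use of the arena (a sketch only; scratch_va and scratch_len
 * are hypothetical and not part of this file):
 *
 *	struct hiballoc_arena arena;
 *	void *p;
 *
 *	if (hiballoc_init(&arena, scratch_va, scratch_len) == 0) {
 *		p = hib_alloc(&arena, 128);
 *		if (p != NULL)
 *			hib_free(&arena, p);
 *	}
 *
 * hib_alloc() returns NULL when the request cannot be satisfied;
 * hib_free() silently accepts NULL.
 */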
249 
250 /*
251  * Zero all free memory.
252  */
253 void
254 uvm_pmr_zero_everything(void)
255 {
256 	struct uvm_pmemrange	*pmr;
257 	struct vm_page		*pg;
258 	int			 i;
259 
260 	uvm_lock_fpageq();
261 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
262 		/* Zero single pages. */
263 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
264 		    != NULL) {
265 			uvm_pmr_remove(pmr, pg);
266 			uvm_pagezero(pg);
267 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
268 			uvmexp.zeropages++;
269 			uvm_pmr_insert(pmr, pg, 0);
270 		}
271 
272 		/* Zero multi page ranges. */
273 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
274 		    != NULL) {
275 			pg--; /* Size tree always has second page. */
276 			uvm_pmr_remove(pmr, pg);
277 			for (i = 0; i < pg->fpgsz; i++) {
278 				uvm_pagezero(&pg[i]);
279 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
280 				uvmexp.zeropages++;
281 			}
282 			uvm_pmr_insert(pmr, pg, 0);
283 		}
284 	}
285 	uvm_unlock_fpageq();
286 }
287 
288 /*
289  * Mark all memory as dirty.
290  *
291  * Used to inform the system that the clean memory isn't clean for some
292  * reason, for example because we just came back from hibernate.
293  */
294 void
295 uvm_pmr_dirty_everything(void)
296 {
297 	struct uvm_pmemrange	*pmr;
298 	struct vm_page		*pg;
299 	int			 i;
300 
301 	uvm_lock_fpageq();
302 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
303 		/* Dirty single pages. */
304 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
305 		    != NULL) {
306 			uvm_pmr_remove(pmr, pg);
307 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
308 			uvm_pmr_insert(pmr, pg, 0);
309 		}
310 
311 		/* Dirty multi page ranges. */
312 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
313 		    != NULL) {
314 			pg--; /* Size tree always has second page. */
315 			uvm_pmr_remove(pmr, pg);
316 			for (i = 0; i < pg->fpgsz; i++)
317 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
318 			uvm_pmr_insert(pmr, pg, 0);
319 		}
320 	}
321 
322 	uvmexp.zeropages = 0;
323 	uvm_unlock_fpageq();
324 }
325 
326 /*
327  * Allocate the highest address that can hold sz.
328  *
329  * sz in bytes.
330  */
331 int
332 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
333 {
334 	struct uvm_pmemrange	*pmr;
335 	struct vm_page		*pig_pg, *pg;
336 
337 	/*
338 	 * Convert sz to pages, since that is what pmemrange uses internally.
339 	 */
340 	sz = atop(round_page(sz));
341 
342 	uvm_lock_fpageq();
343 
344 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
345 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
346 			if (pig_pg->fpgsz >= sz) {
347 				goto found;
348 			}
349 		}
350 	}
351 
352 	/*
353 	 * Allocation failure.
354 	 */
355 	uvm_unlock_fpageq();
356 	return ENOMEM;
357 
358 found:
359 	/* Remove page from freelist. */
360 	uvm_pmr_remove_size(pmr, pig_pg);
361 	pig_pg->fpgsz -= sz;
362 	pg = pig_pg + pig_pg->fpgsz;
363 	if (pig_pg->fpgsz == 0)
364 		uvm_pmr_remove_addr(pmr, pig_pg);
365 	else
366 		uvm_pmr_insert_size(pmr, pig_pg);
367 
368 	uvmexp.free -= sz;
369 	*addr = VM_PAGE_TO_PHYS(pg);
370 
371 	/*
372 	 * Update pg flags.
373 	 *
374 	 * Note that we trash the sz argument now.
375 	 */
376 	while (sz > 0) {
377 		KASSERT(pg->pg_flags & PQ_FREE);
378 
379 		atomic_clearbits_int(&pg->pg_flags,
380 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
381 
382 		if (pg->pg_flags & PG_ZERO)
383 			uvmexp.zeropages--;
384 		atomic_clearbits_int(&pg->pg_flags,
385 		    PG_ZERO|PQ_FREE);
386 
387 		pg->uobject = NULL;
388 		pg->uanon = NULL;
389 		pg->pg_version++;
390 
391 		/*
392 		 * Next.
393 		 */
394 		pg++;
395 		sz--;
396 	}
397 
398 	/* Return. */
399 	uvm_unlock_fpageq();
400 	return 0;
401 }
402 
403 /*
404  * Allocate a piglet area.
405  *
406  * This is as low as possible.
407  * Piglets are aligned.
408  *
409  * sz and align in bytes.
410  *
411  * The call may sleep, waiting for the pagedaemon to attempt to free memory.
412  * The pagedaemon may decide it's not possible to free enough memory, causing
413  * the allocation to fail.
414  */
415 int
416 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
417 {
418 	paddr_t			 pg_addr, piglet_addr;
419 	struct uvm_pmemrange	*pmr;
420 	struct vm_page		*pig_pg, *pg;
421 	struct pglist		 pageq;
422 	int			 pdaemon_woken;
423 	vaddr_t			 piglet_va;
424 
425 	KASSERT((align & (align - 1)) == 0);
426 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
427 
428 	/*
429 	 * Fixup arguments: align must be at least PAGE_SIZE,
430 	 * sz will be converted to pagecount, since that is what
431 	 * pmemrange uses internally.
432 	 */
433 	if (align < PAGE_SIZE)
434 		align = PAGE_SIZE;
435 	sz = round_page(sz);
436 
437 	uvm_lock_fpageq();
438 
439 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
440 	    pmr_use) {
441 retry:
442 		/*
443 		 * Search for a range with enough space.
444 		 * Use the address tree, to ensure the range is as low as
445 		 * possible.
446 		 */
447 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
448 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
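			/*
			 * Round pg_addr up to the requested alignment
			 * (align is a power of two); e.g. with a 4MB
			 * alignment, 0x123000 rounds up to 0x400000.
			 */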
449 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
450 
451 			if (atop(pg_addr) + pig_pg->fpgsz >=
452 			    atop(piglet_addr) + atop(sz)) {
453 				goto found;
454 			}
455 		}
456 	}
457 
458 	/*
459 	 * Try to coerce the pagedaemon into freeing memory
460 	 * for the piglet.
461 	 *
462 	 * pdaemon_woken is set to prevent the code from
463 	 * falling into an endless loop.
464 	 */
465 	if (!pdaemon_woken) {
466 		pdaemon_woken = 1;
467 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
468 		    sz, UVM_PLA_FAILOK) == 0)
469 			goto retry;
470 	}
471 
472 	/* Return failure. */
473 	uvm_unlock_fpageq();
474 	return ENOMEM;
475 
476 found:
477 	/*
478 	 * Extract piglet from pigpen.
479 	 */
480 	TAILQ_INIT(&pageq);
481 	uvm_pmr_extract_range(pmr, pig_pg,
482 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
483 
484 	*pa = piglet_addr;
485 	uvmexp.free -= atop(sz);
486 
487 	/*
488 	 * Update pg flags.
489 	 *
490 	 * Note that we trash the sz argument now.
491 	 */
492 	TAILQ_FOREACH(pg, &pageq, pageq) {
493 		KASSERT(pg->pg_flags & PQ_FREE);
494 
495 		atomic_clearbits_int(&pg->pg_flags,
496 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
497 
498 		if (pg->pg_flags & PG_ZERO)
499 			uvmexp.zeropages--;
500 		atomic_clearbits_int(&pg->pg_flags,
501 		    PG_ZERO|PQ_FREE);
502 
503 		pg->uobject = NULL;
504 		pg->uanon = NULL;
505 		pg->pg_version++;
506 	}
507 
508 	uvm_unlock_fpageq();
509 
510 
511 	/*
512 	 * Now allocate a va.
513 	 * Use direct mappings for the pages.
514 	 */
515 
516 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
517 	if (!piglet_va) {
518 		uvm_pglistfree(&pageq);
519 		return ENOMEM;
520 	}
521 
522 	/*
523 	 * Map piglet to va.
524 	 */
525 	TAILQ_FOREACH(pg, &pageq, pageq) {
526 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
527 		piglet_va += PAGE_SIZE;
528 	}
529 	pmap_update(pmap_kernel());
530 
531 	return 0;
532 }
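
/*
 * In this file the piglet is allocated by get_hibernate_info() as a
 * 3 * HIBERNATE_CHUNK_SIZE region aligned to HIBERNATE_CHUNK_SIZE; see
 * the piglet layout tables above hibernate_write_chunks() and
 * hibernate_read_chunks() for how its pages are used.
 */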
533 
534 /*
535  * Free a piglet area.
536  */
537 void
538 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
539 {
540 	paddr_t			 pa;
541 	struct vm_page		*pg;
542 
543 	/*
544 	 * Fix parameters.
545 	 */
546 	sz = round_page(sz);
547 
548 	/*
549 	 * Find the first page in piglet.
550 	 * Since piglets are contiguous, the first pg is all we need.
551 	 */
552 	if (!pmap_extract(pmap_kernel(), va, &pa))
553 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
554 	pg = PHYS_TO_VM_PAGE(pa);
555 	if (pg == NULL)
556 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
557 
558 	/*
559 	 * Unmap.
560 	 */
561 	pmap_kremove(va, sz);
562 	pmap_update(pmap_kernel());
563 
564 	/*
565 	 * Free the physical and virtual memory.
566 	 */
567 	uvm_pmr_freepages(pg, atop(sz));
568 	km_free((void*)va, sz, &kv_any, &kp_none);
569 }
570 
571 /*
572  * Physmem RLE compression support.
573  *
574  * Given a physical page address, return the number of consecutive free
575  * pages starting at that address.
576  * Returns 0 if the page at addr is not free.
577  */
578 psize_t
579 uvm_page_rle(paddr_t addr)
580 {
581 	struct vm_page		*pg, *pg_end;
582 	struct vm_physseg	*vmp;
583 	int			 pseg_idx, off_idx;
584 
585 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
586 	if (pseg_idx == -1)
587 		return 0;
588 
589 	vmp = &vm_physmem[pseg_idx];
590 	pg = &vmp->pgs[off_idx];
591 	if (!(pg->pg_flags & PQ_FREE))
592 		return 0;
593 
594 	/*
595 	 * Search for the first non-free page after pg.
596 	 * Note that the page may not be the first page in a free pmemrange,
597 	 * therefore pg->fpgsz cannot be used.
598 	 */
599 	for (pg_end = pg; pg_end <= vmp->lastpg &&
600 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++);
601 	return pg_end - pg;
602 }
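
/*
 * Example (a sketch only, not code from this file): a caller could walk
 * physical memory in runs of free pages, handling in-use pages one at a
 * time and skipping whole free runs; 'start' and 'end' are hypothetical:
 *
 *	paddr_t addr;
 *	psize_t run;
 *
 *	for (addr = start; addr < end; ) {
 *		run = uvm_page_rle(addr);
 *		if (run == 0) {
 *			(page at addr is in use; save or compress it)
 *			addr += PAGE_SIZE;
 *		} else
 *			addr += ptoa(run);
 *	}
 */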
603 
604 /*
605  * Fills out the hibernate_info union pointed to by hiber_info
606  * with information about this machine (swap signature block
607  * offsets, number of memory ranges, kernel in use, etc.).
608  *
609  */
610 int
611 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
612 {
613 	int chunktable_size;
614 	struct disklabel dl;
615 	char err_string[128], *dl_ret;
616 
617 	/* Determine I/O function to use */
618 	hiber_info->io_func = get_hibernate_io_function();
619 	if (hiber_info->io_func == NULL)
620 		return (1);
621 
622 	/* Calculate hibernate device */
623 	hiber_info->device = swdevt[0].sw_dev;
624 
625 	/* Read disklabel (used to calculate signature and image offsets) */
626 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
627 
628 	if (dl_ret) {
629 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
630 		return (1);
631 	}
632 
633 	hiber_info->secsize = dl.d_secsize;
634 
635 	/* Make sure the signature can fit in one block */
636 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
637 
638 	/* Calculate swap offset from start of disk */
639 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
640 
641 	/* Calculate signature block location (last block of the 'b' partition) */
642 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
643 		dl.d_partitions[1].p_size -
644 		sizeof(union hibernate_info)/hiber_info->secsize;
645 
646 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
647 
648 	/* Stash kernel version information */
649 	bzero(&hiber_info->kernel_version, 128);
650 	bcopy(version, &hiber_info->kernel_version,
651 		min(strlen(version), sizeof(hiber_info->kernel_version)-1));
652 
653 	if (suspend) {
654 		/* Allocate piglet region */
655 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
656 					&hiber_info->piglet_pa,
657 					HIBERNATE_CHUNK_SIZE*3,
658 					HIBERNATE_CHUNK_SIZE)) {
659 			printf("Hibernate failed to allocate the piglet\n");
660 			return (1);
661 		}
662 	}
663 
664 	if (get_hibernate_info_md(hiber_info))
665 		return (1);
666 
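	/*
	 * On-disk layout at the end of the swap partition (low to high):
	 *
	 *	[ compressed image ][ chunk table ][ signature block ]
	 *
	 * so the image starts (image + chunk table + signature) blocks
	 * before the end of the partition.
	 */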
667 	/* Calculate memory image location */
668 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
669 		dl.d_partitions[1].p_size -
670 		(hiber_info->image_size / hiber_info->secsize) -
671 		sizeof(union hibernate_info)/hiber_info->secsize -
672 		chunktable_size;
673 
674 	return (0);
675 }
676 
677 /*
678  * Allocate nitems*size bytes from the hiballoc area presently in use
679  */
680 void
681 *hibernate_zlib_alloc(void *unused, int nitems, int size)
682 {
683 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
684 }
685 
686 /*
687  * Free the memory pointed to by addr in the hiballoc area presently in
688  * use
689  */
690 void
691 hibernate_zlib_free(void *unused, void *addr)
692 {
693 	hib_free(&hibernate_state->hiballoc_arena, addr);
694 }
695 
696 /*
697  * Inflate size bytes from src into dest, skipping any pages in
698  * [src..dest] that are special (see hibernate_inflate_skip)
699  *
700  * For each page of output data, we map HIBERNATE_TEMP_PAGE
701  * to the current output page, and tell inflate() to inflate
702  * its data there, resulting in the inflated data being placed
703  * at the proper paddr.
704  *
705  * This function executes while using the resume-time stack
706  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
707  * will likely hang or reset the machine.
708  */
709 void
710 hibernate_inflate(union hibernate_info *hiber_info,
711 	paddr_t dest, paddr_t src, size_t size)
712 {
713 	int i;
714 
715 	hibernate_state->hib_stream.avail_in = size;
716 	hibernate_state->hib_stream.next_in = (char *)src;
717 
718 	hibernate_inflate_page = hiber_info->piglet_va + 2 * PAGE_SIZE;
719 
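	/*
	 * Note: inflate output is always written through the VA
	 * hibernate_inflate_page (piglet_va + 2 * PAGE_SIZE); the temporary
	 * mapping installed below determines whether those writes land in
	 * the real destination page or in a throwaway piglet scratch page.
	 */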
720 	do {
721 		/* Flush cache and TLB */
722 		hibernate_flush();
723 
724 		/*
725 		 * Is this a special page? If yes, redirect the
726 		 * inflate output to a scratch page (eg, discard it)
727 		 */
728 		if (hibernate_inflate_skip(hiber_info, dest))
729 			hibernate_enter_resume_mapping(
730 				hibernate_inflate_page,
731 				hiber_info->piglet_pa + 2 * PAGE_SIZE,
732 				0);
733 		else
734 			hibernate_enter_resume_mapping(
735 				hibernate_inflate_page,
736 				dest, 0);
737 
738 		/* Set up the stream for inflate */
739 		hibernate_state->hib_stream.avail_out = PAGE_SIZE;
740 		hibernate_state->hib_stream.next_out =
741 			(char *)hiber_info->piglet_va + 2 * PAGE_SIZE;
742 
743 		/* Process next block of data */
744 		i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
745 		if (i != Z_OK && i != Z_STREAM_END) {
746 			/*
747 			 * XXX - this will likely reboot/hang most machines,
748 			 *       but there's not much else we can do here.
749 			 */
750 			panic("inflate error");
751 		}
752 
753 		dest += PAGE_SIZE - hibernate_state->hib_stream.avail_out;
754 	} while (i != Z_STREAM_END);
755 }
756 
757 /*
758  * deflate from src into the I/O page, up to 'remaining' bytes
759  *
760  * Returns the number of input bytes consumed, and sets 'remaining' to
761  * the amount of output space left in the I/O page (this information is
762  * needed to know how much to write to disk).
763  */
764 size_t
765 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
766 	size_t *remaining)
767 {
768 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
769 
770 	/* Set up the stream for deflate */
771 	hibernate_state->hib_stream.avail_in = PAGE_SIZE -
772 		(src & PAGE_MASK);
773 	hibernate_state->hib_stream.avail_out = *remaining;
774 	hibernate_state->hib_stream.next_in = (caddr_t)src;
775 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
776 		(PAGE_SIZE - *remaining);
777 
778 	/* Process next block of data */
779 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
780 		panic("hibernate zlib deflate error\n");
781 
782 	/* Update pointers and return number of bytes consumed */
783 	*remaining = hibernate_state->hib_stream.avail_out;
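	/*
	 * The source page held PAGE_SIZE - (src & PAGE_MASK) bytes from src
	 * to the end of the page; subtracting what deflate left unread
	 * (avail_in) gives the number of input bytes actually consumed.
	 */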
784 	return (PAGE_SIZE - (src & PAGE_MASK)) -
785 		hibernate_state->hib_stream.avail_in;
786 }
787 
788 /*
789  * Write the hibernation information specified in hiber_info
790  * to the location in swap previously calculated (last block of
791  * swap), called the "signature block".
792  *
793  * Write the memory chunk table to the area in swap immediately
794  * preceding the signature block.
795  */
796 int
797 hibernate_write_signature(union hibernate_info *hiber_info)
798 {
799 	u_int8_t *io_page;
800 	int result = 0;
801 
802 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
803 	if (!io_page)
804 		return (1);
805 
806 	/* Write hibernate info to disk */
807 	if (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
808 		(vaddr_t)hiber_info, hiber_info->secsize, 1, io_page)) {
809 			result = 1;
810 	}
811 
812 	free(io_page, M_DEVBUF);
813 	return (result);
814 }
815 
816 /*
817  * Write the memory chunk table to the area in swap immediately
818  * preceding the signature block. The chunk table is stored
819  * in the piglet when this function is called.
820  */
821 int
822 hibernate_write_chunktable(union hibernate_info *hiber_info)
823 {
824 	u_int8_t *io_page;
825 	int i;
826 	daddr_t chunkbase;
827 	vaddr_t hibernate_chunk_table_start;
828 	size_t hibernate_chunk_table_size;
829 	struct hibernate_disk_chunk *chunks;
830 
831 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
832 	if (!io_page)
833 		return (1);
834 
835 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
836 
837 	chunkbase = hiber_info->sig_offset -
838 		    (hibernate_chunk_table_size / hiber_info->secsize);
839 
840 	hibernate_chunk_table_start = hiber_info->piglet_va +
841 					HIBERNATE_CHUNK_SIZE;
842 
843 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
844 		HIBERNATE_CHUNK_SIZE);
845 
846 	/* Write chunk table */
847 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
848 		if (hiber_info->io_func(hiber_info->device,
849 		    chunkbase + (i / hiber_info->secsize),
850 		    (vaddr_t)(hibernate_chunk_table_start + i),
851 		    MAXPHYS,
852 		    1,
853 		    io_page)) {
854 			free(io_page, M_DEVBUF);
855 			return (1);
856 		}
857 	}
858 
859 	free(io_page, M_DEVBUF);
860 
861 	return (0);
862 }
863 
864 /*
865  * Write an empty hiber_info to the swap signature block, which is
866  * guaranteed to not match any valid hiber_info.
867  */
868 int
869 hibernate_clear_signature()
870 {
871 	union hibernate_info blank_hiber_info;
872 	union hibernate_info hiber_info;
873 	u_int8_t *io_page;
874 
875 	/* Zero out a blank hiber_info */
876 	bzero(&blank_hiber_info, sizeof(blank_hiber_info));
877 
878 	if (get_hibernate_info(&hiber_info, 0))
879 		return (1);
880 
881 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
882 	if (!io_page)
883 		return (1);
884 
885 	/* Write (zeroed) hibernate info to disk */
886 	/* XXX - use regular kernel write routine for this */
887 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
888 	    (vaddr_t)&blank_hiber_info, hiber_info.secsize, 1, io_page))
889 		panic("error hibernate write 6\n");
890 
891 	free(io_page, M_DEVBUF);
892 
893 	return (0);
894 }
895 
896 /*
897  * Check chunk range overlap when calculating whether or not to copy a
898  * compressed chunk to the piglet area before decompressing.
899  *
900  * returns zero if the ranges do not overlap, non-zero otherwise.
901  */
902 int
903 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
904 {
905 	/* case A : end of r1 overlaps start of r2 */
906 	if (r1s < r2s && r1e > r2s)
907 		return (1);
908 
909 	/* case B : r1 entirely inside r2 */
910 	if (r1s >= r2s && r1e <= r2e)
911 		return (1);
912 
913 	/* case C : r2 entirely inside r1 */
914 	if (r2s >= r1s && r2e <= r1e)
915 		return (1);
916 
917 	/* case D : end of r2 overlaps start of r1 */
918 	if (r2s < r1s && r2e > r1s)
919 		return (1);
920 
921 	return (0);
922 }
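
/*
 * Note: for non-empty ranges, the four cases above are equivalent to the
 * usual interval overlap test (r1s < r2e && r2s < r1e).
 */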
923 
924 /*
925  * Compare two hibernate_infos to determine if they are the same (e.g.,
926  * whether we should be performing a hibernate resume on this machine).
927  * Not all fields are checked - just enough to verify that the machine
928  * has the same memory configuration and kernel as the one that
929  * wrote the signature previously.
930  */
931 int
932 hibernate_compare_signature(union hibernate_info *mine,
933 	union hibernate_info *disk)
934 {
935 	u_int i;
936 
937 	if (mine->nranges != disk->nranges)
938 		return (1);
939 
940 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
941 		return (1);
942 
943 	for (i = 0; i < mine->nranges; i++) {
944 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
945 		    (mine->ranges[i].end != disk->ranges[i].end))
946 			return (1);
947 	}
948 
949 	return (0);
950 }
951 
952 /*
953  * Reads read_size bytes from the hibernate device specified in
954  * hib_info at offset blkctr. Output is placed into the vaddr specified
955  * at dest.
956  *
957  * Separate offsets and pages are used to handle misaligned reads (reads
958  * that span a page boundary).
959  *
960  * blkctr specifies a relative offset (relative to the start of swap),
961  * not an absolute disk offset
962  *
963  */
964 int
965 hibernate_read_block(union hibernate_info *hib_info, daddr_t blkctr,
966 	size_t read_size, vaddr_t dest)
967 {
968 	struct buf *bp;
969 	struct bdevsw *bdsw;
970 	int error;
971 
972 	bp = geteblk(read_size);
973 	bdsw = &bdevsw[major(hib_info->device)];
974 
975 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
976 	if (error) {
977 		printf("hibernate_read_block open failed\n");
978 		bp->b_flags |= B_INVAL;
979 		brelse(bp);
980 		return (1);
981 	}
980 
981 	bp->b_bcount = read_size;
982 	bp->b_blkno = blkctr;
983 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
984 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
985 	bp->b_dev = hib_info->device;
986 	bp->b_cylinder = 0;
987 	(*bdsw->d_strategy)(bp);
988 
989 	error = biowait(bp);
990 	if (error) {
991 		printf("hibernate_read_block biowait failed %d\n", error);
992 		error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR,
993 		    curproc);
994 		if (error)
995 			printf("hibernate_read_block error close failed\n");
996 		bp->b_flags |= B_INVAL;
997 		brelse(bp);
998 		return (1);
999 	}
998 
999 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
1000 	if (error) {
1001 		printf("hibernate_read_block close failed\n");
1002 		return (1);
1003 	}
1004 
1005 	bcopy(bp->b_data, (caddr_t)dest, read_size);
1006 
1007 	bp->b_flags |= B_INVAL;
1008 	brelse(bp);
1009 
1010 	return (0);
1011 }
1012 
1013 /*
1014  * Reads the signature block from swap, checks against the current machine's
1015  * information. If the information matches, perform a resume by reading the
1016  * saved image into the pig area, and unpacking.
1017  */
1018 void
1019 hibernate_resume()
1020 {
1021 	union hibernate_info hiber_info;
1022 	u_int8_t *io_page;
1023 	int s;
1024 
1025 	/* Scrub temporary vaddr ranges used during resume */
1026 	hibernate_temp_page = (vaddr_t)NULL;
1027 	hibernate_fchunk_area = (vaddr_t)NULL;
1028 	hibernate_chunktable_area = (vaddr_t)NULL;
1029 	hibernate_stack_page = (vaddr_t)NULL;
1030 
1031 	/* Get current running machine's hibernate info */
1032 	bzero(&hiber_info, sizeof(hiber_info));
1033 	if (get_hibernate_info(&hiber_info, 0))
1034 		return;
1035 
1036 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
1037 	if (!io_page)
1038 		return;
1039 
1040 	/* Read hibernate info from disk */
1041 	s = splbio();
1042 
1043 	/* XXX use regular kernel read routine here */
1044 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
1045 	    (vaddr_t)&disk_hiber_info, hiber_info.secsize, 0, io_page))
1046 		panic("error in hibernate read\n");
1048 
1049 	free(io_page, M_DEVBUF);
1050 
1051 	/*
1052 	 * If on-disk and in-memory hibernate signatures match,
1053 	 * this means we should do a resume from hibernate.
1054 	 */
1055 	if (hibernate_compare_signature(&hiber_info,
1056 		&disk_hiber_info))
1057 		return;
1058 
1059 	/*
1060 	 * Allocate several regions of vaddrs for use during read.
1061 	 * These mappings go into the resuming kernel's page table, and are
1062 	 * used only during image read.
1063 	 */
1064 	hibernate_temp_page = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any,
1065 						&kp_none, &kd_nowait);
1066 	if (!hibernate_temp_page)
1067 		goto fail;
1068 
1069 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1070 						&kp_none, &kd_nowait);
1071 	if (!hibernate_fchunk_area)
1072 		goto fail;
1073 
1074 	/* Allocate a temporary chunktable area */
1075 	hibernate_chunktable_area = (vaddr_t)malloc(HIBERNATE_CHUNK_TABLE_SIZE,
1076 					   M_DEVBUF, M_NOWAIT);
1077 	if (!hibernate_chunktable_area)
1078 		goto fail;
1079 
1080 	/* Allocate one temporary page of VAs for the resume time stack */
1081 	hibernate_stack_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1082 						&kp_none, &kd_nowait);
1083 	if (!hibernate_stack_page)
1084 		goto fail;
1085 
1086 	/* Read the image from disk into the image (pig) area */
1087 	if (hibernate_read_image(&disk_hiber_info))
1088 		goto fail;
1089 
1090 	/* Point of no return ... */
1091 
1092 	disable_intr();
1093 	cold = 1;
1094 
1095 	/* Switch stacks */
1096 	hibernate_switch_stack_machdep();
1097 
1098 	/*
1099 	 * Image is now in high memory (pig area), copy to correct location
1100  * in memory. We'll eventually end up copying on top of ourselves, but
1101 	 * we are assured the kernel code here is the same between the
1102 	 * hibernated and resuming kernel, and we are running on our own
1103 	 * stack, so the overwrite is ok.
1104 	 */
1105 	hibernate_unpack_image(&disk_hiber_info);
1106 
1107 	/*
1108 	 * Resume the loaded kernel by jumping to the MD resume vector.
1109 	 * We won't be returning from this call.
1110 	 */
1111 	hibernate_resume_machdep();
1112 
1113 fail:
1114 	printf("Unable to resume hibernated image\n");
1115 
1116 	if (hibernate_temp_page)
1117 		km_free((void *)hibernate_temp_page, 2*PAGE_SIZE, &kv_any,
1118 			&kp_none);
1119 
1120 	if (hibernate_fchunk_area)
1121 		km_free((void *)hibernate_fchunk_area, 3*PAGE_SIZE, &kv_any,
1122 			&kp_none);
1123 
1127 	if (hibernate_chunktable_area)
1128 		free((void *)hibernate_chunktable_area, M_DEVBUF);
1129 }
1130 
1131 /*
1132  * Unpack image from pig area to original location by looping through the
1133  * list of output chunks in the order they should be restored (fchunks).
1134  * This ordering is used to avoid having inflate overwrite a chunk in the
1135  * middle of processing that chunk. This will, of course, happen during the
1136  * final output chunk, where we copy the chunk to the piglet area first,
1137  * before inflating.
1138  */
1139 void
1140 hibernate_unpack_image(union hibernate_info *hiber_info)
1141 {
1142 	int i;
1143 	paddr_t image_cur;
1144 	vaddr_t tempva;
1145 	struct hibernate_disk_chunk *chunks;
1146 	char *pva;
1147 	int *fchunks;
1148 
1149 	pva = (char *)hiber_info->piglet_va;
1150 
1151 	fchunks = (int *)(pva + (4 * PAGE_SIZE));
1152 
1153 	/* Copy temporary chunktable to piglet */
1154 	tempva = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1155 			&kp_none, &kd_nowait);
1156 	for (i=0; i<HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1157 		pmap_kenter_pa(tempva + i, hiber_info->piglet_pa +
1158 			HIBERNATE_CHUNK_SIZE + i, VM_PROT_ALL);
1159 
1160 	bcopy((caddr_t)hibernate_chunktable_area, (caddr_t)tempva,
1161 		HIBERNATE_CHUNK_TABLE_SIZE);
1162 
1163 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1164 
1165 	hibernate_activate_resume_pt_machdep();
1166 
1167 	for (i=0; i<hiber_info->chunk_ctr; i++) {
1168 		/* Reset zlib for inflate */
1169 		if (hibernate_zlib_reset(hiber_info, 0) != Z_OK)
1170 			panic("hibernate failed to reset zlib for inflate\n");
1171 
1172 		/*
1173 		 * If there is a conflict, copy the chunk to the piglet area
1174 		 * before unpacking it to its original location.
1175 		 */
1176 		if((chunks[fchunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1177 			hibernate_inflate(hiber_info,
1178 				chunks[fchunks[i]].base, image_cur,
1179 				chunks[fchunks[i]].compressed_size);
1180 		else {
1181 			bcopy((caddr_t)image_cur,
1182 				(caddr_t)hiber_info->piglet_va +
1183 				HIBERNATE_CHUNK_SIZE * 2,
1184 				chunks[fchunks[i]].compressed_size);
1185 			hibernate_inflate(hiber_info,
1186 				chunks[fchunks[i]].base,
1187 				hiber_info->piglet_va +
1188 				HIBERNATE_CHUNK_SIZE * 2,
1189 				chunks[fchunks[i]].compressed_size);
1190 		}
1191 		image_cur += chunks[fchunks[i]].compressed_size;
1192 	}
1193 }
1194 
1195 /*
1196  * Write a compressed version of this machine's memory to disk, at the
1197  * precalculated swap offset:
1198  *
1199  * end of swap - signature block size - chunk table size - memory size
1200  *
1201  * The function begins by looping through each phys mem range, cutting each
1202  * one into 4MB chunks. These chunks are then compressed individually
1203  * and written out to disk, in phys mem order. Some chunks might compress
1204  * more than others, and for this reason, each chunk's size is recorded
1205  * in the chunk table, which is written to disk after the image has
1206  * properly been compressed and written (in hibernate_write_chunktable).
1207  *
1208  * When this function is called, the machine is nearly suspended - most
1209  * devices are quiesced/suspended, interrupts are off, and cold has
1210  * been set. This means that there can be no side effects once the
1211  * write has started, and the write function itself can also have no
1212  * side effects.
1213  *
1214  * This function uses the piglet area during this process as follows:
1215  *
1216  * offset from piglet base	use
1217  * -----------------------	--------------------
1218  * 0				i/o allocation area
1219  * PAGE_SIZE			i/o write area
1220  * 2*PAGE_SIZE			temp/scratch page
1221  * 3*PAGE_SIZE			temp/scratch page
1222  * 4*PAGE_SIZE			hiballoc arena
1223  * 5*PAGE_SIZE to 85*PAGE_SIZE	zlib deflate area
1224  * ...
1225  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1226  *
1227  * Some transient piglet content is saved as part of deflate,
1228  * but it is irrelevant during resume as it will be repurposed
1229  * at that time for other things.
1230  */
1231 int
1232 hibernate_write_chunks(union hibernate_info *hiber_info)
1233 {
1234 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1235 	daddr_t blkctr;
1236 	int i;
1237 	size_t nblocks, out_remaining, used, offset;
1238 	struct hibernate_disk_chunk *chunks;
1239 	vaddr_t hibernate_alloc_page = hiber_info->piglet_va;
1240 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1241 
1242 	blkctr = hiber_info->image_offset;
1243 	hiber_info->chunk_ctr = 0;
1244 	offset = 0;
1245 
1246 	/*
1247 	 * Allocate VA for the temp and copy page.
1248 	 */
1249 
1250 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1251 						&kp_none, &kd_nowait);
1252 	if (!hibernate_temp_page)
1253 		return (1);
1254 
1255 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1256 						&kp_none, &kd_nowait);
1257 	if (!hibernate_copy_page)
1258 		return (1);
1259 
1260 	pmap_kenter_pa(hibernate_copy_page,
1261 			(hiber_info->piglet_pa + 3*PAGE_SIZE),
1262 			VM_PROT_ALL);
1263 
1264 	/* XXX - needed on i386. check other archs */
1265 	pmap_activate(curproc);
1266 
1267 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1268 			HIBERNATE_CHUNK_SIZE);
1269 
1270 	/* Calculate the chunk regions */
1271 	for (i = 0; i < hiber_info->nranges; i++) {
1272 		range_base = hiber_info->ranges[i].base;
1273 		range_end = hiber_info->ranges[i].end;
1274 
1275 		inaddr = range_base;
1276 
1277 		while (inaddr < range_end) {
1278 			chunks[hiber_info->chunk_ctr].base = inaddr;
1279 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1280 				chunks[hiber_info->chunk_ctr].end = inaddr +
1281 					HIBERNATE_CHUNK_SIZE;
1282 			else
1283 				chunks[hiber_info->chunk_ctr].end = range_end;
1284 
1285 			inaddr += HIBERNATE_CHUNK_SIZE;
1286 			hiber_info->chunk_ctr ++;
1287 		}
1288 	}
1289 
1290 	/* Compress and write the chunks in the chunktable */
1291 	for (i=0; i < hiber_info->chunk_ctr; i++) {
1292 		range_base = chunks[i].base;
1293 		range_end = chunks[i].end;
1294 
1295 		chunks[i].offset = blkctr;
1296 
1297 		/* Reset zlib for deflate */
1298 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1299 			return (1);
1300 
1301 		inaddr = range_base;
1302 
1303 		/*
1304 		 * For each range, loop through its phys mem region
1305 		 * and write out the chunks (the last chunk might be
1306 		 * smaller than the chunk size).
1307 		 */
1308 		while (inaddr < range_end) {
1309 			out_remaining = PAGE_SIZE;
1310 			while (out_remaining > 0 && inaddr < range_end) {
1311 				pmap_kenter_pa(hibernate_temp_page,
1312 					inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1313 				pmap_activate(curproc);
1314 
1315 				bcopy((caddr_t)hibernate_temp_page,
1316 					(caddr_t)hibernate_copy_page, PAGE_SIZE);
1317 
1318 				/* Adjust for non page-sized regions */
1319 				temp_inaddr = (inaddr & PAGE_MASK) +
1320 					hibernate_copy_page;
1321 
1322 				/* Deflate from temp_inaddr to IO page */
1323 				inaddr += hibernate_deflate(hiber_info,
1324 						temp_inaddr,
1325 						&out_remaining);
1326 			}
1327 
1328 			if (out_remaining == 0) {
1329 				/* Filled up the page */
1330 				nblocks = PAGE_SIZE / hiber_info->secsize;
1331 
1332 				if (hiber_info->io_func(hiber_info->device,
1333 				    blkctr, (vaddr_t)hibernate_io_page,
1334 				    PAGE_SIZE, 1, (void *)hibernate_alloc_page))
1335 					return (1);
1336 
1337 				blkctr += nblocks;
1338 			}
1339 
1340 		}
1341 
1342 		if (inaddr != range_end)
1343 			return (1);
1344 
1345 		/*
1346 		 * End of range. Round up to next secsize bytes
1347 		 * after finishing compress
1348 		 */
1349 		if (out_remaining == 0)
1350 			out_remaining = PAGE_SIZE;
1351 
1352 		/* Finish compress */
1353 		hibernate_state->hib_stream.avail_in = 0;
1354 		hibernate_state->hib_stream.avail_out = out_remaining;
1355 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1356 		hibernate_state->hib_stream.next_out =
1357 			(caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1358 
1359 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1360 			Z_STREAM_END)
1361 				return (1);
1362 
1363 		out_remaining = hibernate_state->hib_stream.avail_out;
1364 
1365 		used = PAGE_SIZE - out_remaining;
1366 		nblocks = used / hiber_info->secsize;
1367 
1368 		/* Round up to next block if needed */
1369 		if (used % hiber_info->secsize != 0)
1370 			nblocks ++;
1371 
1372 		/* Write final block(s) for this chunk */
1373 		if (hiber_info->io_func(hiber_info->device, blkctr,
1374 		    (vaddr_t)hibernate_io_page, nblocks * hiber_info->secsize,
1375 		    1, (void *)hibernate_alloc_page))
1376 			return (1);
1377 
1378 		blkctr += nblocks;
1379 
1380 		offset = blkctr;
1381 		chunks[i].compressed_size =
1382 		    (offset - chunks[i].offset) * hiber_info->secsize;
1383 
1384 	}
1385 
1386 	return (0);
1387 }
1388 
1389 /*
1390  * Reset the zlib stream state and allocate a new hiballoc area for either
1391  * inflate or deflate. This function is called once for each hibernate chunk.
1392  * Calling hiballoc_init multiple times is acceptable since the memory it is
1393  * provided is unmanaged memory (stolen). We use the memory provided to us
1394  * by the piglet allocated via the supplied hiber_info.
1395  */
1396 int
1397 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1398 {
1399 	vaddr_t hibernate_zlib_start;
1400 	size_t hibernate_zlib_size;
1401 
1402 	hibernate_state = (struct hibernate_zlib_state *)
1403 	    (hiber_info->piglet_va + (4 * PAGE_SIZE));
1404 
1405 	hibernate_zlib_start = hiber_info->piglet_va + (5 * PAGE_SIZE);
1406 	hibernate_zlib_size = 80 * PAGE_SIZE;
1407 
1408 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1409 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1410 
1411 	/* Set up stream structure */
1412 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1413 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1414 
1415 	/* Initialize the hiballoc arena for zlib allocs/frees */
1416 	hiballoc_init(&hibernate_state->hiballoc_arena,
1417 		(caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1418 
1419 	if (deflate) {
1420 		return deflateInit(&hibernate_state->hib_stream,
1421 			Z_DEFAULT_COMPRESSION);
1422 	} else
1424 		return inflateInit(&hibernate_state->hib_stream);
1425 }
1426 
1427 /*
1428  * Reads the hibernated memory image from disk, whose location and
1429  * size are recorded in hiber_info. Begin by reading the persisted
1430  * chunk table, which records the original chunk placement location
1431  * and compressed size for each. Next, allocate a pig region of
1432  * sufficient size to hold the compressed image. Next, read the
1433  * chunks into the pig area (calling hibernate_read_chunks to do this),
1434  * and finally, if all of the above succeeds, clear the hibernate signature.
1435  * The function will then return to hibernate_resume, which will proceed
1436  * to unpack the pig image to the correct place in memory.
1437  */
1438 int
1439 hibernate_read_image(union hibernate_info *hiber_info)
1440 {
1441 	int i;
1442 	paddr_t image_start, image_end, pig_start, pig_end;
1443 	daddr_t blkctr;
1444 	struct hibernate_disk_chunk *chunks;
1445 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1446 
1447 	/* Calculate total chunk table size in disk blocks */
1448 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1449 
1450 	blkctr = hiber_info->sig_offset - chunktable_size -
1451 			hiber_info->swap_offset;
1452 
1453 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1454 	    i += MAXPHYS, blkctr += MAXPHYS / hiber_info->secsize)
1455 		hibernate_read_block(hiber_info, blkctr, MAXPHYS,
1456 		    hibernate_chunktable_area + i);
1457 
1458 	blkctr = hiber_info->image_offset;
1459 	compressed_size = 0;
1460 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1461 
1462 	for (i=0; i<hiber_info->chunk_ctr; i++)
1463 		compressed_size += chunks[i].compressed_size;
1464 
1465 	disk_size = compressed_size;
1466 
1467 	/* Allocate the pig area */
1468 	pig_sz =  compressed_size + HIBERNATE_CHUNK_SIZE;
1469 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1470 		return (1);
1471 
1472 	pig_end = pig_start + pig_sz;
1473 
1474 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1475 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1476 	image_start = image_end - disk_size;
1479 
1480 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size);
1481 
1482 	/* Prepare the resume time pmap/page table */
1483 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1484 
1485 	/* Read complete, clear the signature and return */
1486 	return hibernate_clear_signature();
1487 }
1488 
1489 /*
1490  * Read the hibernated memory chunks from disk (chunk information at this
1491  * point is stored in the piglet) into the pig area specified by
1492  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1493  * only chunk with overlap possibilities.
1494  *
1495  * This function uses the piglet area during this process as follows:
1496  *
1497  * offset from piglet base	use
1498  * -----------------------	--------------------
1499  * 0				i/o allocation area
1500  * PAGE_SIZE			i/o write area
1501  * 2*PAGE_SIZE			temp/scratch page
1502  * 3*PAGE_SIZE			temp/scratch page
1503  * 4*PAGE_SIZE to 6*PAGE_SIZE	chunk ordering area
1504  * 7*PAGE_SIZE			hiballoc arena
1505  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1506  * ...
1507  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1508  */
1509 int
1510 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1511 			paddr_t pig_end, size_t image_compr_size)
1512 {
1513 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1514 	paddr_t copy_start, copy_end, piglet_cur;
1515 	paddr_t piglet_base = hib_info->piglet_pa;
1516 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1517 	daddr_t blkctr;
1518 	size_t processed, compressed_size, read_size;
1519 	int i, j, overlap, found, nchunks, nochunks=0, nfchunks=0, npchunks=0;
1520 	struct hibernate_disk_chunk *chunks;
1521 	u_int8_t *ochunks, *pchunks, *fchunks;
1522 
1523 	/* Map the chunk ordering region */
1524 	pmap_kenter_pa(hibernate_fchunk_area,
1525 		piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1526 	pmap_kenter_pa(hibernate_fchunk_area + PAGE_SIZE,
1527 		piglet_base + (5*PAGE_SIZE), VM_PROT_ALL);
1528 	pmap_kenter_pa(hibernate_fchunk_area + 2*PAGE_SIZE,
1529 		piglet_base + (6*PAGE_SIZE),
1530 	 	VM_PROT_ALL);
1531 
1532 	/* Temporary output chunk ordering */
1533 	ochunks = (u_int8_t *)hibernate_fchunk_area;
1534 
1535 	/* Piglet chunk ordering */
1536 	pchunks = (u_int8_t *)hibernate_fchunk_area + PAGE_SIZE;
1537 
1538 	/* Final chunk ordering */
1539 	fchunks = (u_int8_t *)hibernate_fchunk_area + 2*PAGE_SIZE;
1540 
1541 	nchunks = hib_info->chunk_ctr;
1542 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1543 
1544 	/* Initially start all chunks as unplaced */
1545 	for (i = 0; i < nchunks; i++)
1546 		chunks[i].flags = 0;
1547 
1548 	/*
1549 	 * Search the list for chunks that are outside the pig area. These
1550 	 * can be placed first in the final output list.
1551 	 */
1552 	for (i = 0; i < nchunks; i++) {
1553 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1554 			ochunks[nochunks] = (u_int8_t)i;
1555 			fchunks[nfchunks] = (u_int8_t)i;
1556 			nochunks++;
1557 			nfchunks++;
1558 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1559 		}
1560 	}
1561 
1562 	/*
1563 	 * Walk the ordering, place the chunks in ascending memory order.
1564 	 * Conflicts might arise, these are handled next.
1565 	 */
1566 	do {
1567 		img_index = -1;
1568 		found = 0;
1569 		j = -1;
1570 		for (i = 0; i < nchunks; i++)
1571 			if (chunks[i].base < img_index &&
1572 			    chunks[i].flags == 0) {
1573 				j = i;
1574 				img_index = chunks[i].base;
1575 			}
1576 
1577 		if (j != -1) {
1578 			found = 1;
1579 			ochunks[nochunks] = (u_int8_t)j;
1580 			nochunks++;
1581 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1582 		}
1583 	} while (found);
1584 
1585 	img_index=pig_start;
1586 
1587 	/*
1588 	 * Identify chunk output conflicts (chunks whose pig load area
1589 	 * corresponds to their original memory placement location)
1590 	 */
1591 	for (i = 0; i < nochunks; i++) {
1592 		overlap = 0;
1593 		r1s = img_index;
1594 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1595 		r2s = chunks[ochunks[i]].base;
1596 		r2e = chunks[ochunks[i]].end;
1597 
1598 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1599 		if (overlap)
1600  			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1601 
1602 		img_index += chunks[ochunks[i]].compressed_size;
1603 	}
1604 
1605 	/*
1606 	 * Prepare the final output chunk list. Calculate an output
1607 	 * inflate strategy for overlapping chunks if needed.
1608 	 */
1609 	img_index=pig_start;
1610 	for (i=0; i < nochunks ; i++) {
1611 		/*
1612 		 * If a conflict is detected, consume enough compressed
1613 		 * output chunks to fill the piglet
1614 		 */
1615 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1616 			copy_start = piglet_base;
1617 			copy_end = piglet_end;
1618 			piglet_cur = piglet_base;
1619 			npchunks = 0;
1620 			j=i;
1621 			while (copy_start < copy_end && j < nochunks) {
1622 				piglet_cur += chunks[ochunks[j]].compressed_size;
1623 				pchunks[npchunks] = ochunks[j];
1624 				npchunks++;
1625 				copy_start += chunks[ochunks[j]].compressed_size;
1626 				img_index += chunks[ochunks[j]].compressed_size;
1627 				i++;
1628 				j++;
1629 			}
1630 
1631 			piglet_cur = piglet_base;
1632 			for (j=0; j < npchunks; j++) {
1633 				piglet_cur += chunks[pchunks[j]].compressed_size;
1634 				fchunks[nfchunks] = pchunks[j];
1635 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1636 				nfchunks++;
1637 			}
1638 		} else {
1639 			/*
1640 			 * No conflict, chunk can be added without copying
1641 			 */
1642 			if ((chunks[ochunks[i]].flags &
1643 			    HIBERNATE_CHUNK_USED) == 0) {
1644 				fchunks[nfchunks] = ochunks[i];
1645 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1646 				nfchunks++;
1647 			}
1648 
1649 			img_index += chunks[ochunks[i]].compressed_size;
1650 		}
1651 	}
1652 
1653 	img_index = pig_start;
1654 	for(i=0 ; i< nfchunks; i++) {
1655 		piglet_cur = piglet_base;
1656 		img_index += chunks[fchunks[i]].compressed_size;
1657 	}
1658 
1659 	img_cur = pig_start;
1660 
1661 	for(i=0; i<nfchunks; i++) {
1662 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1663 		processed = 0;
1664 		compressed_size = chunks[fchunks[i]].compressed_size;
1665 
1666 		while (processed < compressed_size) {
1667 			pmap_kenter_pa(hibernate_temp_page, img_cur,
1668 				VM_PROT_ALL);
1669 			pmap_kenter_pa(hibernate_temp_page + PAGE_SIZE,
1670 				img_cur+PAGE_SIZE, VM_PROT_ALL);
1671 
1672 			/* XXX - needed on i386. check other archs */
1673 			pmap_activate(curproc);
1674 			if (compressed_size - processed >= PAGE_SIZE)
1675 				read_size = PAGE_SIZE;
1676 			else
1677 				read_size = compressed_size - processed;
1678 
1679 			hibernate_read_block(hib_info, blkctr, read_size,
1680 				hibernate_temp_page + (img_cur & PAGE_MASK));
1681 
1682 			blkctr += (read_size / hib_info->secsize);
1683 
1684 			hibernate_flush();
1685 			pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1686 			pmap_kremove(hibernate_temp_page + PAGE_SIZE,
1687 				PAGE_SIZE);
1688 			processed += read_size;
1689 			img_cur += read_size;
1690 		}
1691 	}
1692 
1693 	return (0);
1694 }
1695 
1696 /*
1697  * Hibernating a machine comprises the following operations:
1698  *  1. Calculating this machine's hibernate_info information
1699  *  2. Allocating a piglet and saving the piglet's physaddr
1700  *  3. Calculating the memory chunks
1701  *  4. Writing the compressed chunks to disk
1702  *  5. Writing the chunk table
1703  *  6. Writing the signature block (hibernate_info)
1704  *
1705  * On most architectures, the function calling hibernate_suspend would
1706  * then power off the machine using some MD-specific implementation.
1707  */
1708 int
1709 hibernate_suspend()
1710 {
1711 	union hibernate_info hib_info;
1712 
1713 	/*
1714 	 * Calculate memory ranges, swap offsets, etc.
1715 	 * This also allocates a piglet whose physaddr is stored in
1716 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1717 	 */
1718 	if (get_hibernate_info(&hib_info, 1))
1719 		return (1);
1720 
1721 	/* XXX - Won't need to zero everything with RLE */
1722 	uvm_pmr_zero_everything();
1723 
1724 	if (hibernate_write_chunks(&hib_info))
1725 		return (1);
1726 
1727 	if (hibernate_write_chunktable(&hib_info))
1728 		return (1);
1729 
1730 	return hibernate_write_signature(&hib_info);
1731 }
1732
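/*
 * Illustrative only: a machine-dependent suspend path would quiesce
 * devices and disable interrupts before calling this, then do roughly
 * the following (the powerdown step is MD and not part of this file):
 *
 *	if (hibernate_suspend() == 0) {
 *		(machine-dependent powerdown)
 *	}
 */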