xref: /openbsd-src/sys/kern/subr_hibernate.c (revision cffc25bbcbb09a36c18dec82cf433c1fb8b97b46)
1 /*	$OpenBSD: subr_hibernate.c,v 1.48 2013/01/17 01:28:01 mlarkin Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <uvm/uvm_swap.h>
34 #include <machine/hibernate.h>
35 
36 /* Temporary vaddr ranges used during hibernate */
37 vaddr_t hibernate_temp_page;
38 vaddr_t hibernate_copy_page;
39 
40 /* Hibernate info as read from disk during resume */
41 union hibernate_info disk_hiber_info;
42 paddr_t global_pig_start;
43 vaddr_t global_piglet_va;
44 
45 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
46 
47 /*
48  * Hib alloc enforced alignment.
49  */
50 #define HIB_ALIGN		8 /* bytes alignment */
51 
52 /*
53  * The sizeof builtin, but with the result rounded up to HIB_ALIGN.
54  */
55 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
56 
57 struct hiballoc_entry {
58 	size_t			hibe_use;
59 	size_t			hibe_space;
60 	RB_ENTRY(hiballoc_entry) hibe_entry;
61 };
62 
63 /*
64  * Compare hiballoc entries based on the address they manage.
65  *
66  * Since the address is fixed, relative to struct hiballoc_entry,
67  * we just compare the hiballoc_entry pointers.
68  */
69 static __inline int
70 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
71 {
72 	return l < r ? -1 : (l > r);
73 }
74 
75 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
76 
77 /*
78  * Given a hiballoc entry, return the address it manages.
79  */
80 static __inline void *
81 hib_entry_to_addr(struct hiballoc_entry *entry)
82 {
83 	caddr_t addr;
84 
85 	addr = (caddr_t)entry;
86 	addr += HIB_SIZEOF(struct hiballoc_entry);
87 	return addr;
88 }
89 
90 /*
91  * Given an address, find the hiballoc_entry that manages it.
92  */
93 static __inline struct hiballoc_entry*
94 hib_addr_to_entry(void *addr_param)
95 {
96 	caddr_t addr;
97 
98 	addr = (caddr_t)addr_param;
99 	addr -= HIB_SIZEOF(struct hiballoc_entry);
100 	return (struct hiballoc_entry*)addr;
101 }
102 
103 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
104 
105 /*
106  * Allocate memory from the arena.
107  *
108  * Returns NULL if no memory is available.
109  */
110 void *
111 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
112 {
113 	struct hiballoc_entry *entry, *new_entry;
114 	size_t find_sz;
115 
116 	/*
117 	 * Enforce alignment of HIB_ALIGN bytes.
118 	 *
119 	 * Note that, because the entry is put in front of the allocation,
120 	 * 0-byte allocations are guaranteed a unique address.
121 	 */
122 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
123 
124 	/*
125 	 * Find an entry with hibe_space >= find_sz.
126 	 *
127 	 * If the root node is not large enough, we switch to tree traversal.
128 	 * Because all entries are made at the bottom of the free space,
129 	 * traversal from the end has a slightly better chance of yielding
130 	 * a sufficiently large space.
131 	 */
132 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
133 	entry = RB_ROOT(&arena->hib_addrs);
134 	if (entry != NULL && entry->hibe_space < find_sz) {
135 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
136 			if (entry->hibe_space >= find_sz)
137 				break;
138 		}
139 	}
140 
141 	/*
142 	 * Insufficient or too fragmented memory.
143 	 */
144 	if (entry == NULL)
145 		return NULL;
146 
147 	/*
148 	 * Create new entry in allocated space.
149 	 */
150 	new_entry = (struct hiballoc_entry*)(
151 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
152 	new_entry->hibe_space = entry->hibe_space - find_sz;
153 	new_entry->hibe_use = alloc_sz;
154 
155 	/*
156 	 * Insert entry.
157 	 */
158 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
159 		panic("hib_alloc: insert failure");
160 	entry->hibe_space = 0;
161 
162 	/* Return address managed by entry. */
163 	return hib_entry_to_addr(new_entry);
164 }
165 
166 /*
167  * Free a pointer previously allocated from this arena.
168  *
169  * If addr is NULL, this will be silently accepted.
170  */
171 void
172 hib_free(struct hiballoc_arena *arena, void *addr)
173 {
174 	struct hiballoc_entry *entry, *prev;
175 
176 	if (addr == NULL)
177 		return;
178 
179 	/*
180 	 * Derive entry from addr and check it is really in this arena.
181 	 */
182 	entry = hib_addr_to_entry(addr);
183 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
184 		panic("hib_free: freed item %p not in hib arena", addr);
185 
186 	/*
187 	 * Give the space in entry to its predecessor.
188 	 *
189 	 * If entry has no predecessor, change its used space into free space
190 	 * instead.
191 	 */
192 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
193 	if (prev != NULL &&
194 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
195 	    prev->hibe_use + prev->hibe_space) == entry) {
196 		/* Merge entry. */
197 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
198 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
199 		    entry->hibe_use + entry->hibe_space;
200 	} else {
201 		/* Flip used memory to free space. */
202 		entry->hibe_space += entry->hibe_use;
203 		entry->hibe_use = 0;
204 	}
205 }
206 
207 /*
208  * Initialize hiballoc.
209  *
210  * The allocator will manage memory at ptr, which is len bytes.
211  */
212 int
213 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
214 {
215 	struct hiballoc_entry *entry;
216 	caddr_t ptr;
217 	size_t len;
218 
219 	RB_INIT(&arena->hib_addrs);
220 
221 	/*
222 	 * Hib allocator enforces HIB_ALIGN alignment.
223 	 * Fixup ptr and len.
224 	 */
225 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
226 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
227 	len &= ~((size_t)HIB_ALIGN - 1);
228 
229 	/*
230 	 * Insufficient memory to be able to allocate and also do bookkeeping.
231 	 */
232 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
233 		return ENOMEM;
234 
235 	/*
236 	 * Create entry describing space.
237 	 */
238 	entry = (struct hiballoc_entry*)ptr;
239 	entry->hibe_use = 0;
240 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
241 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
242 
243 	return 0;
244 }
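
/*
 * Illustrative sketch (compiled out, not part of the build): how a
 * consumer ties hiballoc_init, hib_alloc and hib_free together.  The
 * backing buffer and function below are hypothetical and exist only
 * for this example.
 */
#if 0
static char hib_example_backing[8192];

void
hib_example_usage(void)
{
	struct hiballoc_arena	 arena;
	void			*p;

	/* Hand the arena an unmanaged chunk of memory to carve up. */
	if (hiballoc_init(&arena, hib_example_backing,
	    sizeof(hib_example_backing)) != 0)
		return;

	/* Allocations are rounded up to HIB_ALIGN; NULL means no space. */
	p = hib_alloc(&arena, 100);
	if (p != NULL)
		hib_free(&arena, p);
}
#endif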
245 
246 /*
247  * Zero all free memory.
248  */
249 void
250 uvm_pmr_zero_everything(void)
251 {
252 	struct uvm_pmemrange	*pmr;
253 	struct vm_page		*pg;
254 	int			 i;
255 
256 	uvm_lock_fpageq();
257 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
258 		/* Zero single pages. */
259 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
260 		    != NULL) {
261 			uvm_pmr_remove(pmr, pg);
262 			uvm_pagezero(pg);
263 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
264 			uvmexp.zeropages++;
265 			uvm_pmr_insert(pmr, pg, 0);
266 		}
267 
268 		/* Zero multi page ranges. */
269 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
270 		    != NULL) {
271 			pg--; /* Size tree always has second page. */
272 			uvm_pmr_remove(pmr, pg);
273 			for (i = 0; i < pg->fpgsz; i++) {
274 				uvm_pagezero(&pg[i]);
275 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
276 				uvmexp.zeropages++;
277 			}
278 			uvm_pmr_insert(pmr, pg, 0);
279 		}
280 	}
281 	uvm_unlock_fpageq();
282 }
283 
284 /*
285  * Mark all memory as dirty.
286  *
287  * Used to inform the system that the clean memory isn't clean for some
288  * reason, for example because we just came back from hibernate.
289  */
290 void
291 uvm_pmr_dirty_everything(void)
292 {
293 	struct uvm_pmemrange	*pmr;
294 	struct vm_page		*pg;
295 	int			 i;
296 
297 	uvm_lock_fpageq();
298 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
299 		/* Dirty single pages. */
300 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
301 		    != NULL) {
302 			uvm_pmr_remove(pmr, pg);
303 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
304 			uvm_pmr_insert(pmr, pg, 0);
305 		}
306 
307 		/* Dirty multi page ranges. */
308 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
309 		    != NULL) {
310 			pg--; /* Size tree always has second page. */
311 			uvm_pmr_remove(pmr, pg);
312 			for (i = 0; i < pg->fpgsz; i++)
313 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
314 			uvm_pmr_insert(pmr, pg, 0);
315 		}
316 	}
317 
318 	uvmexp.zeropages = 0;
319 	uvm_unlock_fpageq();
320 }
321 
322 /*
323  * Allocate the highest address that can hold sz.
324  *
325  * sz in bytes.
326  */
327 int
328 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
329 {
330 	struct uvm_pmemrange	*pmr;
331 	struct vm_page		*pig_pg, *pg;
332 
333 	/*
334 	 * Convert sz to pages, since that is what pmemrange uses internally.
335 	 */
336 	sz = atop(round_page(sz));
337 
338 	uvm_lock_fpageq();
339 
340 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
341 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
342 			if (pig_pg->fpgsz >= sz) {
343 				goto found;
344 			}
345 		}
346 	}
347 
348 	/*
349 	 * Allocation failure.
350 	 */
351 	uvm_unlock_fpageq();
352 	return ENOMEM;
353 
354 found:
355 	/* Remove page from freelist. */
356 	uvm_pmr_remove_size(pmr, pig_pg);
357 	pig_pg->fpgsz -= sz;
358 	pg = pig_pg + pig_pg->fpgsz;
359 	if (pig_pg->fpgsz == 0)
360 		uvm_pmr_remove_addr(pmr, pig_pg);
361 	else
362 		uvm_pmr_insert_size(pmr, pig_pg);
363 
364 	uvmexp.free -= sz;
365 	*addr = VM_PAGE_TO_PHYS(pg);
366 
367 	/*
368 	 * Update pg flags.
369 	 *
370 	 * Note that we trash the sz argument now.
371 	 */
372 	while (sz > 0) {
373 		KASSERT(pg->pg_flags & PQ_FREE);
374 
375 		atomic_clearbits_int(&pg->pg_flags,
376 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
377 
378 		if (pg->pg_flags & PG_ZERO)
379 			uvmexp.zeropages -= sz;
380 		atomic_clearbits_int(&pg->pg_flags,
381 		    PG_ZERO|PQ_FREE);
382 
383 		pg->uobject = NULL;
384 		pg->uanon = NULL;
385 		pg->pg_version++;
386 
387 		/*
388 		 * Next.
389 		 */
390 		pg++;
391 		sz--;
392 	}
393 
394 	/* Return. */
395 	uvm_unlock_fpageq();
396 	return 0;
397 }
398 
399 /*
400  * Allocate a piglet area.
401  *
402  * This is as low as possible.
403  * Piglets are aligned.
404  *
405  * sz and align in bytes.
406  *
407  * The call will sleep, waiting for the pagedaemon to attempt to free memory.
408  * The pagedaemon may decide it is not possible to free enough memory, causing
409  * the allocation to fail.
410  */
411 int
412 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
413 {
414 	paddr_t			 pg_addr, piglet_addr;
415 	struct uvm_pmemrange	*pmr;
416 	struct vm_page		*pig_pg, *pg;
417 	struct pglist		 pageq;
418 	int			 pdaemon_woken;
419 	vaddr_t			 piglet_va;
420 
421 	KASSERT((align & (align - 1)) == 0);
422 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
423 
424 	/*
425 	 * Fixup arguments: align must be at least PAGE_SIZE,
426 	 * sz will be converted to pagecount, since that is what
427 	 * pmemrange uses internally.
428 	 */
429 	if (align < PAGE_SIZE)
430 		align = PAGE_SIZE;
431 	sz = round_page(sz);
432 
433 	uvm_lock_fpageq();
434 
435 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
436 	    pmr_use) {
437 retry:
438 		/*
439 		 * Search for a range with enough space.
440 		 * Use the address tree, to ensure the range is as low as
441 		 * possible.
442 		 */
443 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
444 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
445 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
446 
447 			if (atop(pg_addr) + pig_pg->fpgsz >=
448 			    atop(piglet_addr) + atop(sz))
449 				goto found;
450 		}
451 	}
452 
453 	/*
454 	 * Try to coerce the pagedaemon into freeing memory
455 	 * for the piglet.
456 	 *
457 	 * pdaemon_woken is set to prevent the code from
458 	 * falling into an endless loop.
459 	 */
460 	if (!pdaemon_woken) {
461 		pdaemon_woken = 1;
462 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
463 		    sz, UVM_PLA_FAILOK) == 0)
464 			goto retry;
465 	}
466 
467 	/* Return failure. */
468 	uvm_unlock_fpageq();
469 	return ENOMEM;
470 
471 found:
472 	/*
473 	 * Extract piglet from pigpen.
474 	 */
475 	TAILQ_INIT(&pageq);
476 	uvm_pmr_extract_range(pmr, pig_pg,
477 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
478 
479 	*pa = piglet_addr;
480 	uvmexp.free -= atop(sz);
481 
482 	/*
483 	 * Update pg flags.
484 	 *
485 	 * Note that we trash the sz argument now.
486 	 */
487 	TAILQ_FOREACH(pg, &pageq, pageq) {
488 		KASSERT(pg->pg_flags & PQ_FREE);
489 
490 		atomic_clearbits_int(&pg->pg_flags,
491 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
492 
493 		if (pg->pg_flags & PG_ZERO)
494 			uvmexp.zeropages--;
495 		atomic_clearbits_int(&pg->pg_flags,
496 		    PG_ZERO|PQ_FREE);
497 
498 		pg->uobject = NULL;
499 		pg->uanon = NULL;
500 		pg->pg_version++;
501 	}
502 
503 	uvm_unlock_fpageq();
504 
505 	/*
506 	 * Now allocate a va.
507 	 * Use direct mappings for the pages.
508 	 */
509 
510 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
511 	if (!piglet_va) {
512 		uvm_pglistfree(&pageq);
513 		return ENOMEM;
514 	}
515 
516 	/*
517 	 * Map piglet to va.
518 	 */
519 	TAILQ_FOREACH(pg, &pageq, pageq) {
520 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
521 		piglet_va += PAGE_SIZE;
522 	}
523 	pmap_update(pmap_kernel());
524 
525 	return 0;
526 }
527 
528 /*
529  * Free a piglet area.
530  */
531 void
532 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
533 {
534 	paddr_t			 pa;
535 	struct vm_page		*pg;
536 
537 	/*
538 	 * Fix parameters.
539 	 */
540 	sz = round_page(sz);
541 
542 	/*
543 	 * Find the first page in piglet.
544 	 * Since piglets are contiguous, the first pg is all we need.
545 	 */
546 	if (!pmap_extract(pmap_kernel(), va, &pa))
547 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
548 	pg = PHYS_TO_VM_PAGE(pa);
549 	if (pg == NULL)
550 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
551 
552 	/*
553 	 * Unmap.
554 	 */
555 	pmap_kremove(va, sz);
556 	pmap_update(pmap_kernel());
557 
558 	/*
559 	 * Free the physical and virtual memory.
560 	 */
561 	uvm_pmr_freepages(pg, atop(sz));
562 	km_free((void *)va, sz, &kv_any, &kp_none);
563 }
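
/*
 * Illustrative sketch (compiled out): pairing uvm_pmr_alloc_piglet with
 * uvm_pmr_free_piglet.  The sizes mirror the 3 * HIBERNATE_CHUNK_SIZE,
 * HIBERNATE_CHUNK_SIZE-aligned piglet used elsewhere in this file; the
 * function name is hypothetical.
 */
#if 0
int
piglet_example(void)
{
	vaddr_t	piglet_va;
	paddr_t	piglet_pa;

	if (uvm_pmr_alloc_piglet(&piglet_va, &piglet_pa,
	    HIBERNATE_CHUNK_SIZE * 3, HIBERNATE_CHUNK_SIZE))
		return (ENOMEM);

	/* ... use the physically contiguous, aligned region ... */

	uvm_pmr_free_piglet(piglet_va, HIBERNATE_CHUNK_SIZE * 3);
	return (0);
}
#endif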
564 
565 /*
566  * Physmem RLE compression support.
567  *
568  * Given a physical page address, return the number of contiguous free
569  * pages starting at that address.  Clamps to the number of pages in
570  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
571  */
572 int
573 uvm_page_rle(paddr_t addr)
574 {
575 	struct vm_page		*pg, *pg_end;
576 	struct vm_physseg	*vmp;
577 	int			 pseg_idx, off_idx;
578 
579 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
580 	if (pseg_idx == -1)
581 		return 0;
582 
583 	vmp = &vm_physmem[pseg_idx];
584 	pg = &vmp->pgs[off_idx];
585 	if (!(pg->pg_flags & PQ_FREE))
586 		return 0;
587 
588 	/*
589 	 * Search for the first non-free page after pg.
590 	 * Note that the page may not be the first page in a free pmemrange,
591 	 * therefore pg->fpgsz cannot be used.
592 	 */
593 	for (pg_end = pg; pg_end <= vmp->lastpg &&
594 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
595 		;
596 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
597 }
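
/*
 * Illustrative sketch (compiled out): using uvm_page_rle to total the
 * free pages in a physical range, skipping over each free run in one
 * step.  The function name is hypothetical; the real consumers of
 * uvm_page_rle live in the MD hibernate code.
 */
#if 0
psize_t
count_free_pages_example(paddr_t start, paddr_t end)
{
	paddr_t	addr;
	psize_t	free_pages = 0;
	int	rle;

	for (addr = start; addr < end; addr += PAGE_SIZE) {
		rle = uvm_page_rle(addr);
		if (rle > 0) {
			free_pages += rle;
			/* Advance past the rest of the free run. */
			addr += ptoa(rle - 1);
		}
	}
	return (free_pages);
}
#endif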
598 
599 /*
600  * Fills out the hibernate_info union pointed to by hiber_info
601  * with information about this machine (swap signature block
602  * offsets, number of memory ranges, kernel in use, etc.).
603  */
604 int
605 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
606 {
607 	int chunktable_size;
608 	struct disklabel dl;
609 	char err_string[128], *dl_ret;
610 
611 	/* Determine I/O function to use */
612 	hiber_info->io_func = get_hibernate_io_function();
613 	if (hiber_info->io_func == NULL)
614 		return (1);
615 
616 	/* Calculate hibernate device */
617 	hiber_info->device = swdevt[0].sw_dev;
618 
619 	/* Read disklabel (used to calculate signature and image offsets) */
620 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
621 
622 	if (dl_ret) {
623 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
624 		return (1);
625 	}
626 
627 	/* Make sure we have a swap partition. */
628 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
629 	    dl.d_partitions[1].p_size == 0)
630 		return (1);
631 
632 	hiber_info->secsize = dl.d_secsize;
633 
634 	/* Make sure the signature can fit in one block */
635 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
636 
637 	/* Calculate swap offset from start of disk */
638 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
639 
640 	/* Calculate signature block location */
641 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
642 	    dl.d_partitions[1].p_size -
643 	    sizeof(union hibernate_info)/hiber_info->secsize;
644 
645 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
646 
647 	/* Stash kernel version information */
648 	bzero(&hiber_info->kernel_version, 128);
649 	bcopy(version, &hiber_info->kernel_version,
650 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
651 
652 	if (suspend) {
653 		/* Allocate piglet region */
654 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
655 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
656 		    HIBERNATE_CHUNK_SIZE)) {
657 			printf("Hibernate failed to allocate the piglet\n");
658 			return (1);
659 		}
660 		hiber_info->io_page = (void *)hiber_info->piglet_va;
661 
662 		/*
663 		 * Initialize the hibernate I/O function (for drivers that
664 		 * need it).
665 		 */
666 		if (hiber_info->io_func(hiber_info->device, 0,
667 		    (vaddr_t)NULL, 0, HIB_INIT, hiber_info->io_page))
668 			goto fail;
669 
670 	} else {
671 		/*
672 		 * Resuming kernels use a regular I/O page since we won't
673 		 * have access to the suspended kernel's piglet VA at this
674 		 * point. No need to free this I/O page as it will vanish
675 		 * as part of the resume.
676 		 */
677 		hiber_info->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
678 		if (!hiber_info->io_page)
679 			return (1);
680 	}
681 
682 
683 	if (get_hibernate_info_md(hiber_info))
684 		goto fail;
685 
686 	/* Calculate memory image location */
687 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
688 	    dl.d_partitions[1].p_size -
689 	    (hiber_info->image_size / hiber_info->secsize) -
690 	    sizeof(union hibernate_info)/hiber_info->secsize -
691 	    chunktable_size;
692 
693 	return (0);
694 fail:
695 	if (suspend)
696 		uvm_pmr_free_piglet(hiber_info->piglet_va, HIBERNATE_CHUNK_SIZE*3);
697 
698 	return (1);
699 }
700 
701 /*
702  * Allocate nitems*size bytes from the hiballoc area presently in use
703  */
704 void
705 *hibernate_zlib_alloc(void *unused, int nitems, int size)
706 {
707 	struct hibernate_zlib_state *hibernate_state;
708 
709 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
710 
711 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
712 }
713 
714 /*
715  * Free the memory pointed to by addr in the hiballoc area presently in
716  * use
717  */
718 void
719 hibernate_zlib_free(void *unused, void *addr)
720 {
721 	struct hibernate_zlib_state *hibernate_state;
722 
723 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
724 
725 	hib_free(&hibernate_state->hiballoc_arena, addr);
726 }
727 
728 /*
729  * Gets the next RLE value from the image stream
730  */
731 int
732 hibernate_get_next_rle(void)
733 {
734 	int rle, i;
735 	struct hibernate_zlib_state *hibernate_state;
736 
737 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
738 
739 	/* Read RLE code */
740 	hibernate_state->hib_stream.next_out = (char *)&rle;
741 	hibernate_state->hib_stream.avail_out = sizeof(rle);
742 
743 	i = inflate(&hibernate_state->hib_stream, Z_FULL_FLUSH);
744 	if (i != Z_OK && i != Z_STREAM_END) {
745 		/*
746 		 * XXX - this will likely reboot/hang most machines,
747 		 *       but there's not much else we can do here.
748 		 */
749 		panic("inflate rle error");
750 	}
751 
752 	/* Sanity check what RLE value we got */
753 	if (rle > HIBERNATE_CHUNK_SIZE/PAGE_SIZE || rle < 0)
754 		panic("invalid RLE code");
755 
756 	if (i == Z_STREAM_END)
757 		rle = -1;
758 
759 	return rle;
760 }
761 
762 /*
763  * Inflate next page of data from the image stream
764  */
765 int
766 hibernate_inflate_page(void)
767 {
768 	struct hibernate_zlib_state *hibernate_state;
769 	int i;
770 
771 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
772 
773 	/* Set up the stream for inflate */
774 	hibernate_state->hib_stream.next_out = (char *)HIBERNATE_INFLATE_PAGE;
775 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
776 
777 	/* Process next block of data */
778 	i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
779 	if (i != Z_OK && i != Z_STREAM_END) {
780 		/*
781 		 * XXX - this will likely reboot/hang most machines,
782 		 *       but there's not much else we can do here.
783 		 */
784 
785 		panic("inflate error");
786 	}
787 
788 	/* We should always have extracted a full page ... */
789 	if (hibernate_state->hib_stream.avail_out != 0)
790 		panic("incomplete page");
791 
792 	return (i == Z_STREAM_END);
793 }
794 
795 /*
796  * Inflate size bytes from src into dest, skipping any pages in
797  * [src..dest] that are special (see hibernate_inflate_skip)
798  *
799  * This function executes while using the resume-time stack
800  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
801  * will likely hang or reset the machine.
802  */
803 void
804 hibernate_inflate_region(union hibernate_info *hiber_info, paddr_t dest,
805     paddr_t src, size_t size)
806 {
807 	int end_stream = 0;
808 	struct hibernate_zlib_state *hibernate_state;
809 
810 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
811 
812 	hibernate_state->hib_stream.next_in = (char *)src;
813 	hibernate_state->hib_stream.avail_in = size;
814 
815 	do {
816 		/* Flush cache and TLB */
817 		hibernate_flush();
818 
819 		/*
820 		 * Is this a special page? If yes, redirect the
821 		 * inflate output to a scratch page (eg, discard it)
822 		 */
823 		if (hibernate_inflate_skip(hiber_info, dest)) {
824 			hibernate_enter_resume_mapping(
825 			    HIBERNATE_INFLATE_PAGE,
826 			    HIBERNATE_INFLATE_PAGE, 0);
827 		} else {
828 			hibernate_enter_resume_mapping(
829 			    HIBERNATE_INFLATE_PAGE, dest, 0);
830 		}
831 
832 		hibernate_flush();
833 		end_stream = hibernate_inflate_page();
834 
835 		dest += PAGE_SIZE;
836 	} while (!end_stream);
837 }
838 
839 /*
840  * Deflate from src into the I/O page, up to 'remaining' bytes.
841  *
842  * Returns number of input bytes consumed, and may reset
843  * the 'remaining' parameter if not all the output space was consumed
844  * (this information is needed to know how much to write to disk
845  */
846 size_t
847 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
848     size_t *remaining)
849 {
850 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
851 	struct hibernate_zlib_state *hibernate_state;
852 
853 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
854 
855 	/* Set up the stream for deflate */
856 	hibernate_state->hib_stream.next_in = (caddr_t)src;
857 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
858 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
859 	    (PAGE_SIZE - *remaining);
860 	hibernate_state->hib_stream.avail_out = *remaining;
861 
862 	/* Process next block of data */
863 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
864 		panic("hibernate zlib deflate error");
865 
866 	/* Update pointers and return number of bytes consumed */
867 	*remaining = hibernate_state->hib_stream.avail_out;
868 	return (PAGE_SIZE - (src & PAGE_MASK)) -
869 	    hibernate_state->hib_stream.avail_in;
870 }
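
/*
 * Illustrative sketch (compiled out): the caller-side pattern for
 * hibernate_deflate.  'out_remaining' tracks the unused space in the
 * I/O page across calls; once it reaches 0 the page is full and must
 * be written to disk before compressing more input.
 * hibernate_write_chunks() below is the real consumer; this outline
 * uses a hypothetical write_io_page() helper.
 */
#if 0
void
deflate_loop_example(union hibernate_info *hiber_info, paddr_t src,
    paddr_t src_end)
{
	size_t out_remaining = PAGE_SIZE;

	while (src < src_end) {
		src += hibernate_deflate(hiber_info, src, &out_remaining);
		if (out_remaining == 0) {
			write_io_page(hiber_info);	/* hypothetical */
			out_remaining = PAGE_SIZE;
		}
	}
}
#endif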
871 
872 /*
873  * Write the hibernation information specified in hiber_info
874  * to the location in swap previously calculated (last block of
875  * swap), called the "signature block".
876  *
877  * The memory chunk table is written separately (by
878  * hibernate_write_chunktable) to the area immediately preceding it.
879  */
880 int
881 hibernate_write_signature(union hibernate_info *hiber_info)
882 {
883 	/* Write hibernate info to disk */
884 	return (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
885 	    (vaddr_t)hiber_info, hiber_info->secsize, HIB_W,
886 	    hiber_info->io_page));
887 }
888 
889 /*
890  * Write the memory chunk table to the area in swap immediately
891  * preceding the signature block. The chunk table is stored
892  * in the piglet when this function is called.
893  */
894 int
895 hibernate_write_chunktable(union hibernate_info *hiber_info)
896 {
897 	struct hibernate_disk_chunk *chunks;
898 	vaddr_t hibernate_chunk_table_start;
899 	size_t hibernate_chunk_table_size;
900 	daddr_t chunkbase;
901 	int i;
902 
903 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
904 
905 	chunkbase = hiber_info->sig_offset -
906 	    (hibernate_chunk_table_size / hiber_info->secsize);
907 
908 	hibernate_chunk_table_start = hiber_info->piglet_va +
909 	    HIBERNATE_CHUNK_SIZE;
910 
911 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
912 	    HIBERNATE_CHUNK_SIZE);
913 
914 	/* Write chunk table */
915 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
916 		if (hiber_info->io_func(hiber_info->device,
917 		    chunkbase + (i/hiber_info->secsize),
918 		    (vaddr_t)(hibernate_chunk_table_start + i),
919 		    MAXPHYS, HIB_W, hiber_info->io_page))
920 			return (1);
921 	}
922 
923 	return (0);
924 }
925 
926 /*
927  * Write an empty hiber_info to the swap signature block, which is
928  * guaranteed to not match any valid hiber_info.
929  */
930 int
931 hibernate_clear_signature(void)
932 {
933 	union hibernate_info blank_hiber_info;
934 	union hibernate_info hiber_info;
935 
936 	/* Zero out a blank hiber_info */
937 	bzero(&blank_hiber_info, sizeof(hiber_info));
938 
939 	if (get_hibernate_info(&hiber_info, 0))
940 		return (1);
941 
942 	/* Write (zeroed) hibernate info to disk */
943 	if (hibernate_block_io(&hiber_info,
944 	    hiber_info.sig_offset - hiber_info.swap_offset,
945 	    hiber_info.secsize, (vaddr_t)&blank_hiber_info, 1))
946 		panic("error hibernate write 6");
947 
948 	return (0);
949 }
950 
951 /*
952  * Check chunk range overlap when calculating whether or not to copy a
953  * compressed chunk to the piglet area before decompressing.
954  *
955  * returns zero if the ranges do not overlap, non-zero otherwise.
956  */
957 int
958 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
959 {
960 	/* case A : end of r1 overlaps start of r2 */
961 	if (r1s < r2s && r1e > r2s)
962 		return (1);
963 
964 	/* case B : r1 entirely inside r2 */
965 	if (r1s >= r2s && r1e <= r2e)
966 		return (1);
967 
968 	/* case C : r2 entirely inside r1 */
969 	if (r2s >= r1s && r2e <= r1e)
970 		return (1);
971 
972 	/* case D : end of r2 overlaps start of r1 */
973 	if (r2s < r1s && r2e > r1s)
974 		return (1);
975 
976 	return (0);
977 }
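
/*
 * Worked example (compiled out): with the ranges expressed in any
 * consistent unit, [4,8) and [6,10) overlap (case A or case D depending
 * on argument order), [5,7) sits entirely inside [4,8) (case B), and
 * [0,2) is disjoint from [6,10).  The assertions only illustrate the
 * contract of hibernate_check_overlap.
 */
#if 0
void
overlap_example(void)
{
	KASSERT(hibernate_check_overlap(4, 8, 6, 10) != 0);	/* case A */
	KASSERT(hibernate_check_overlap(6, 10, 4, 8) != 0);	/* case D */
	KASSERT(hibernate_check_overlap(5, 7, 4, 8) != 0);	/* case B */
	KASSERT(hibernate_check_overlap(0, 2, 6, 10) == 0);	/* disjoint */
}
#endif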
978 
979 /*
980  * Compare two hibernate_infos to determine if they are the same (e.g.,
981  * whether we should be performing a hibernate resume on this machine).
982  * Not all fields are checked - just enough to verify that the machine
983  * has the same memory configuration and kernel as the one that
984  * wrote the signature previously.
985  */
986 int
987 hibernate_compare_signature(union hibernate_info *mine,
988     union hibernate_info *disk)
989 {
990 	u_int i;
991 
992 	if (mine->nranges != disk->nranges)
993 		return (1);
994 
995 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
996 		return (1);
997 
998 	for (i = 0; i < mine->nranges; i++) {
999 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
1000 		    (mine->ranges[i].end != disk->ranges[i].end) )
1001 			return (1);
1002 	}
1003 
1004 	return (0);
1005 }
1006 
1007 /*
1008  * Transfers xfer_size bytes between the hibernate device specified in
1009  * hib_info at offset blkctr and the vaddr specified at dest.
1010  *
1011  * Separate offsets and pages are used to handle misaligned reads (reads
1012  * that span a page boundary).
1013  *
1014  * blkctr specifies a relative offset (relative to the start of swap),
1015  * not an absolute disk offset.
1016  *
1017  */
1018 int
1019 hibernate_block_io(union hibernate_info *hib_info, daddr_t blkctr,
1020     size_t xfer_size, vaddr_t dest, int iswrite)
1021 {
1022 	struct buf *bp;
1023 	struct bdevsw *bdsw;
1024 	int error;
1025 
1026 	bp = geteblk(xfer_size);
1027 	bdsw = &bdevsw[major(hib_info->device)];
1028 
1029 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
1030 	if (error) {
1031 		printf("hibernate_block_io open failed\n");
1032 		return (1);
1033 	}
1034 
1035 	if (iswrite)
1036 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
1037 
1038 	bp->b_bcount = xfer_size;
1039 	bp->b_blkno = blkctr;
1040 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1041 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
1042 	bp->b_dev = hib_info->device;
1043 	bp->b_cylinder = 0;
1044 	(*bdsw->d_strategy)(bp);
1045 
1046 	error = biowait(bp);
1047 	if (error) {
1048 		printf("hibernate_block_io biowait failed %d\n", error);
1049 		error = (*bdsw->d_close)(hib_info->device, 0, S_IFCHR,
1050 		    curproc);
1051 		if (error)
1052 			printf("hibernate_block_io error close failed\n");
1053 		return (1);
1054 	}
1055 
1056 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
1057 	if (error) {
1058 		printf("hibernate_block_io close failed\n");
1059 		return (1);
1060 	}
1061 
1062 	if (!iswrite)
1063 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
1064 
1065 	bp->b_flags |= B_INVAL;
1066 	brelse(bp);
1067 
1068 	return (0);
1069 }
1070 
1071 /*
1072  * Reads the signature block from swap, checks against the current machine's
1073  * information. If the information matches, perform a resume by reading the
1074  * saved image into the pig area, and unpacking.
1075  */
1076 void
1077 hibernate_resume(void)
1078 {
1079 	union hibernate_info hiber_info;
1080 	int s;
1081 
1082 	/* Get current running machine's hibernate info */
1083 	bzero(&hiber_info, sizeof(hiber_info));
1084 	if (get_hibernate_info(&hiber_info, 0))
1085 		return;
1086 
1087 	/* Read hibernate info from disk */
1088 	s = splbio();
1089 
1090 	if (hibernate_block_io(&hiber_info,
1091 	    hiber_info.sig_offset - hiber_info.swap_offset,
1092 	    hiber_info.secsize, (vaddr_t)&disk_hiber_info, 0))
1093 		panic("error in hibernate read");
1094 
1095 	/*
1096 	 * If on-disk and in-memory hibernate signatures match,
1097 	 * this means we should do a resume from hibernate.
1098 	 */
1099 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info)) {
1100 		splx(s);
1101 		return;
1102 	}
1103 
1104 	printf("Unhibernating...\n");
1105 
1106 	/* Read the image from disk into the image (pig) area */
1107 	if (hibernate_read_image(&disk_hiber_info))
1108 		goto fail;
1109 
1110 	if (config_suspend(TAILQ_FIRST(&alldevs), DVACT_QUIESCE) != 0)
1111 		goto fail;
1112 
1113 	(void) splhigh();
1114 	disable_intr();
1115 	cold = 1;
1116 
1117 	if (config_suspend(TAILQ_FIRST(&alldevs), DVACT_SUSPEND) != 0) {
1118 		cold = 0;
1119 		enable_intr();
1120 		goto fail;
1121 	}
1122 
1123 	/* Point of no return ... */
1124 
1125 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1126 	    VM_PROT_ALL);
1127 	pmap_activate(curproc);
1128 
1129 	/* Switch stacks */
1130 	hibernate_switch_stack_machdep();
1131 
1132 	/*
1133 	 * Image is now in high memory (pig area), copy to correct location
1134 	 * in memory. We'll eventually end up copying on top of ourself, but
1135 	 * we are assured the kernel code here is the same between the
1136 	 * hibernated and resuming kernel, and we are running on our own
1137 	 * stack, so the overwrite is ok.
1138 	 */
1139 	hibernate_unpack_image(&disk_hiber_info);
1140 
1141 	/*
1142 	 * Resume the loaded kernel by jumping to the MD resume vector.
1143 	 * We won't be returning from this call.
1144 	 */
1145 	hibernate_resume_machdep();
1146 
1147 fail:
1148 	splx(s);
1149 	printf("Unable to resume hibernated image\n");
1150 }
1151 
1152 /*
1153  * Unpack image from pig area to original location by looping through the
1154  * list of output chunks in the order they should be restored (fchunks).
1155  * This ordering is used to avoid having inflate overwrite a chunk in the
1156  * middle of processing that chunk. This will, of course, happen during the
1157  * final output chunk, where we copy the chunk to the piglet area first,
1158  * before inflating.
1159  */
1160 void
1161 hibernate_unpack_image(union hibernate_info *hiber_info)
1162 {
1163 	struct hibernate_disk_chunk *chunks;
1164 	union hibernate_info local_hiber_info;
1165 	paddr_t image_cur = global_pig_start;
1166 	short i, *fchunks;
1167 	char *pva = (char *)hiber_info->piglet_va;
1168 	struct hibernate_zlib_state *hibernate_state;
1169 
1170 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1171 
1172 	/* Mask off based on arch-specific piglet page size */
1173 	pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1174 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1175 
1176 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1177 
1178 	/* Can't use hiber_info that's passed in after this point */
1179 	bcopy(hiber_info, &local_hiber_info, sizeof(union hibernate_info));
1180 
1181 	hibernate_activate_resume_pt_machdep();
1182 
1183 	for (i = 0; i < local_hiber_info.chunk_ctr; i++) {
1184 		/* Reset zlib for inflate */
1185 		if (hibernate_zlib_reset(&local_hiber_info, 0) != Z_OK)
1186 			panic("hibernate failed to reset zlib for inflate");
1187 
1188 		hibernate_process_chunk(&local_hiber_info, &chunks[fchunks[i]],
1189 		    image_cur);
1190 
1191 		image_cur += chunks[fchunks[i]].compressed_size;
1192 
1193 	}
1194 }
1195 
1196 /*
1197  * Bounce a compressed image chunk to the piglet, entering mappings for the
1198  * copied pages as needed
1199  */
1200 void
1201 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1202 {
1203 	size_t ct, ofs;
1204 	paddr_t src = img_cur;
1205 	vaddr_t dest = piglet;
1206 
1207 	/* Copy first partial page */
1208 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1209 	ofs = (src & PAGE_MASK);
1210 
1211 	if (ct < PAGE_SIZE) {
1212 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1213 			(src - ofs), 0);
1214 		hibernate_flush();
1215 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1216 		src += ct;
1217 		dest += ct;
1218 	}
1219 
1220 	/* Copy remaining pages */
1221 	while (src < size + img_cur) {
1222 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1223 		hibernate_flush();
1224 		ct = PAGE_SIZE;
1225 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1226 		hibernate_flush();
1227 		src += ct;
1228 		dest += ct;
1229 	}
1230 }
1231 
1232 /*
1233  * Process a chunk by bouncing it to the piglet, followed by unpacking
1234  */
1235 void
1236 hibernate_process_chunk(union hibernate_info *hiber_info,
1237     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1238 {
1239 	char *pva = (char *)hiber_info->piglet_va;
1240 
1241 	hibernate_copy_chunk_to_piglet(img_cur,
1242 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1243 
1244 	hibernate_inflate_region(hiber_info, chunk->base,
1245 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1246 	    chunk->compressed_size);
1247 }
1248 
1249 /*
1250  * Write a compressed version of this machine's memory to disk, at the
1251  * precalculated swap offset:
1252  *
1253  * end of swap - signature block size - chunk table size - memory size
1254  *
1255  * The function begins by looping through each phys mem range, cutting each
1256  * one into MD sized chunks. These chunks are then compressed individually
1257  * and written out to disk, in phys mem order. Some chunks might compress
1258  * more than others, and for this reason, each chunk's size is recorded
1259  * in the chunk table, which is written to disk after the image has
1260  * properly been compressed and written (in hibernate_write_chunktable).
1261  *
1262  * When this function is called, the machine is nearly suspended - most
1263  * devices are quiesced/suspended, interrupts are off, and cold has
1264  * been set. This means that there can be no side effects once the
1265  * write has started, and the write function itself can also have no
1266  * side effects. This also means no printfs are permitted (since it
1267  * has side effects.)
1268  */
1269 int
1270 hibernate_write_chunks(union hibernate_info *hiber_info)
1271 {
1272 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1273 	size_t nblocks, out_remaining, used;
1274 	struct hibernate_disk_chunk *chunks;
1275 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1276 	daddr_t blkctr = hiber_info->image_offset, offset = 0;
1277 	int i;
1278 	struct hibernate_zlib_state *hibernate_state;
1279 
1280 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1281 
1282 	hiber_info->chunk_ctr = 0;
1283 
1284 	/*
1285 	 * Allocate VA for the temp and copy page.
1286 	 * These will become part of the suspended kernel and will
1287 	 * be freed in hibernate_free, upon resume.
1288 	 */
1289 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1290 	    &kp_none, &kd_nowait);
1291 	if (!hibernate_temp_page)
1292 		return (1);
1293 
1294 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1295 	    &kp_none, &kd_nowait);
1296 	if (!hibernate_copy_page)
1297 		return (1);
1298 
1299 	pmap_kenter_pa(hibernate_copy_page,
1300 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1301 
1302 	/* XXX - not needed on all archs */
1303 	pmap_activate(curproc);
1304 
1305 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1306 	    HIBERNATE_CHUNK_SIZE);
1307 
1308 	/* Calculate the chunk regions */
1309 	for (i = 0; i < hiber_info->nranges; i++) {
1310 		range_base = hiber_info->ranges[i].base;
1311 		range_end = hiber_info->ranges[i].end;
1312 
1313 		inaddr = range_base;
1314 
1315 		while (inaddr < range_end) {
1316 			chunks[hiber_info->chunk_ctr].base = inaddr;
1317 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1318 				chunks[hiber_info->chunk_ctr].end = inaddr +
1319 				    HIBERNATE_CHUNK_SIZE;
1320 			else
1321 				chunks[hiber_info->chunk_ctr].end = range_end;
1322 
1323 			inaddr += HIBERNATE_CHUNK_SIZE;
1324 			hiber_info->chunk_ctr++;
1325 		}
1326 	}
1327 
1328 	/* Compress and write the chunks in the chunktable */
1329 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1330 		range_base = chunks[i].base;
1331 		range_end = chunks[i].end;
1332 
1333 		chunks[i].offset = blkctr;
1334 
1335 		/* Reset zlib for deflate */
1336 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1337 			return (1);
1338 
1339 		inaddr = range_base;
1340 
1341 		/*
1342 		 * For each range, loop through its phys mem region
1343 		 * and write out the chunks (the last chunk might be
1344 		 * smaller than the chunk size).
1345 		 */
1346 		while (inaddr < range_end) {
1347 			out_remaining = PAGE_SIZE;
1348 			while (out_remaining > 0 && inaddr < range_end) {
1349 
1350 				/*
1351 				 * Adjust for regions that are not evenly
1352 				 * divisible by PAGE_SIZE or overflowed
1353 				 * pages from the previous iteration.
1354 				 */
1355 				temp_inaddr = (inaddr & PAGE_MASK) +
1356 				    hibernate_copy_page;
1357 
1358 				/* Deflate from temp_inaddr to IO page */
1359 				if (inaddr != range_end) {
1360 					pmap_kenter_pa(hibernate_temp_page,
1361 					    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1362 
1363 					/* XXX - not needed on all archs */
1364 					pmap_activate(curproc);
1365 
1366 					bcopy((caddr_t)hibernate_temp_page,
1367 					    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1368 					inaddr += hibernate_deflate(hiber_info,
1369 					    temp_inaddr, &out_remaining);
1370 				}
1371 
1372 				if (out_remaining == 0) {
1373 					/* Filled up the page */
1374 					nblocks = PAGE_SIZE / hiber_info->secsize;
1375 
1376 					if (hiber_info->io_func(hiber_info->device,
1377 					    blkctr, (vaddr_t)hibernate_io_page,
1378 					    PAGE_SIZE, HIB_W, hiber_info->io_page))
1379 						return (1);
1380 
1381 					blkctr += nblocks;
1382 				}
1383 			}
1384 		}
1385 
1386 		if (inaddr != range_end)
1387 			return (1);
1388 
1389 		/*
1390 		 * End of range. Round up to next secsize bytes
1391 		 * after finishing compress
1392 		 */
1393 		if (out_remaining == 0)
1394 			out_remaining = PAGE_SIZE;
1395 
1396 		/* Finish compress */
1397 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1398 		hibernate_state->hib_stream.avail_in = 0;
1399 		hibernate_state->hib_stream.next_out =
1400 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1401 		hibernate_state->hib_stream.avail_out = out_remaining;
1402 
1403 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1404 		    Z_STREAM_END)
1405 			return (1);
1406 
1407 		out_remaining = hibernate_state->hib_stream.avail_out;
1408 
1409 		used = PAGE_SIZE - out_remaining;
1410 		nblocks = used / hiber_info->secsize;
1411 
1412 		/* Round up to next block if needed */
1413 		if (used % hiber_info->secsize != 0)
1414 			nblocks++;
1415 
1416 		/* Write final block(s) for this chunk */
1417 		if (hiber_info->io_func(hiber_info->device, blkctr,
1418 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1419 		    HIB_W, hiber_info->io_page))
1420 			return (1);
1421 
1422 		blkctr += nblocks;
1423 
1424 		offset = blkctr;
1425 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1426 		    hiber_info->secsize;
1427 	}
1428 
1429 	return (0);
1430 }
1431 
1432 /*
1433  * Reset the zlib stream state and allocate a new hiballoc area for either
1434  * inflate or deflate. This function is called once for each hibernate chunk.
1435  * Calling hiballoc_init multiple times is acceptable since the memory it is
1436  * provided is unmanaged memory (stolen). We use the memory provided to us
1437  * by the piglet allocated via the supplied hiber_info.
1438  */
1439 int
1440 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1441 {
1442 	vaddr_t hibernate_zlib_start;
1443 	size_t hibernate_zlib_size;
1444 	char *pva = (char *)hiber_info->piglet_va;
1445 	struct hibernate_zlib_state *hibernate_state;
1446 
1447 	hibernate_state = (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1448 
1449 	if (!deflate)
1450 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1451 
1452 	hibernate_zlib_start = (vaddr_t)(pva + (28 * PAGE_SIZE));
1453 	hibernate_zlib_size = 80 * PAGE_SIZE;
1454 
1455 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1456 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1457 
1458 	/* Set up stream structure */
1459 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1460 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1461 
1462 	/* Initialize the hiballoc arena for zlib allocs/frees */
1463 	hiballoc_init(&hibernate_state->hiballoc_arena,
1464 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1465 
1466 	if (deflate) {
1467 		return deflateInit(&hibernate_state->hib_stream,
1468 		    Z_BEST_SPEED);
1469 	} else
1470 		return inflateInit(&hibernate_state->hib_stream);
1471 }
1472 
1473 /*
1474  * Reads the hibernated memory image from disk, whose location and
1475  * size are recorded in hiber_info. Begin by reading the persisted
1476  * chunk table, which records the original chunk placement location
1477  * and compressed size for each. Next, allocate a pig region of
1478  * sufficient size to hold the compressed image. Next, read the
1479  * chunks into the pig area (calling hibernate_read_chunks to do this),
1480  * and finally, if all of the above succeeds, clear the hibernate signature.
1481  * The function will then return to hibernate_resume, which will proceed
1482  * to unpack the pig image to the correct place in memory.
1483  */
1484 int
1485 hibernate_read_image(union hibernate_info *hiber_info)
1486 {
1487 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1488 	paddr_t image_start, image_end, pig_start, pig_end;
1489 	struct hibernate_disk_chunk *chunks;
1490 	daddr_t blkctr;
1491 	vaddr_t chunktable = (vaddr_t)NULL;
1492 	paddr_t piglet_chunktable = hiber_info->piglet_pa +
1493 	    HIBERNATE_CHUNK_SIZE;
1494 	int i;
1495 
1496 	pmap_activate(curproc);
1497 
1498 	/* Calculate total chunk table size in disk blocks */
1499 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1500 
1501 	blkctr = hiber_info->sig_offset - chunktable_size -
1502 			hiber_info->swap_offset;
1503 
1504 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1505 	    &kp_none, &kd_nowait);
1506 
1507 	if (!chunktable)
1508 		return (1);
1509 
1510 	/* Read the chunktable from disk into the piglet chunktable */
1511 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1512 	    i += PAGE_SIZE, blkctr += PAGE_SIZE/hiber_info->secsize) {
1513 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i, VM_PROT_ALL);
1514 		pmap_update(pmap_kernel());
1515 		hibernate_block_io(hiber_info, blkctr, PAGE_SIZE,
1516 		    chunktable + i, 0);
1517 	}
1518 
1519 	blkctr = hiber_info->image_offset;
1520 	compressed_size = 0;
1521 
1522 	chunks = (struct hibernate_disk_chunk *)chunktable;
1523 
1524 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1525 		compressed_size += chunks[i].compressed_size;
1526 
1527 	disk_size = compressed_size;
1528 
1529 	/* Allocate the pig area */
1530 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1531 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1532 		return (1);
1533 
1534 	pig_end = pig_start + pig_sz;
1535 
1536 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1537 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1538 	image_start = pig_start;
1539 
1540 	image_start = image_end - disk_size;
1541 
1542 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size,
1543 	    chunks);
1544 
1545 	pmap_kremove(chunktable, PAGE_SIZE);
1546 	pmap_update(pmap_kernel());
1547 
1548 	/* Prepare the resume time pmap/page table */
1549 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1550 
1551 	/* Read complete, clear the signature and return */
1552 	return hibernate_clear_signature();
1553 }
1554 
1555 /*
1556  * Read the hibernated memory chunks from disk (chunk information at this
1557  * point is stored in the piglet) into the pig area specified by
1558  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1559  * only chunk with overlap possibilities.
1560  */
1561 int
1562 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1563     paddr_t pig_end, size_t image_compr_size,
1564     struct hibernate_disk_chunk *chunks)
1565 {
1566 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1567 	paddr_t copy_start, copy_end, piglet_cur;
1568 	paddr_t piglet_base = hib_info->piglet_pa;
1569 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1570 	daddr_t blkctr;
1571 	size_t processed, compressed_size, read_size;
1572 	int overlap, found, nchunks, nochunks = 0, nfchunks = 0, npchunks = 0;
1573 	short *ochunks, *pchunks, *fchunks, i, j;
1574 	vaddr_t tempva = (vaddr_t)NULL, hibernate_fchunk_area = (vaddr_t)NULL;
1575 
1576 	global_pig_start = pig_start;
1577 
1578 	/* XXX - don't need this on all archs */
1579 	pmap_activate(curproc);
1580 
1581 	/*
1582 	 * These mappings go into the resuming kernel's page table, and are
1583 	 * used only during image read. They disappear from existence
1584 	 * when the suspended kernel is unpacked on top of us.
1585 	 */
1586 	tempva = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
1587 	if (!tempva)
1588 		return (1);
1589 	hibernate_fchunk_area = (vaddr_t)km_alloc(24*PAGE_SIZE, &kv_any,
1590 	    &kp_none, &kd_nowait);
1591 	if (!hibernate_fchunk_area)
1592 		return (1);
1593 
1594 	/* Final output chunk ordering VA */
1595 	fchunks = (short *)hibernate_fchunk_area;
1596 
1597 	/* Piglet chunk ordering VA */
1598 	pchunks = (short *)(hibernate_fchunk_area + (8*PAGE_SIZE));
1599 
1600 	/* Final chunk ordering VA */
1601 	ochunks = (short *)(hibernate_fchunk_area + (16*PAGE_SIZE));
1602 
1603 	/* Map the chunk ordering region */
1604 	for (i = 0; i < 24; i++) {
1605 		pmap_kenter_pa(hibernate_fchunk_area + (i*PAGE_SIZE),
1606 			piglet_base + ((4+i)*PAGE_SIZE), VM_PROT_ALL);
1607 		pmap_update(pmap_kernel());
1608 	}
1609 
1610 	nchunks = hib_info->chunk_ctr;
1611 
1612 	/* Initially start all chunks as unplaced */
1613 	for (i = 0; i < nchunks; i++)
1614 		chunks[i].flags = 0;
1615 
1616 	/*
1617 	 * Search the list for chunks that are outside the pig area. These
1618 	 * can be placed first in the final output list.
1619 	 */
1620 	for (i = 0; i < nchunks; i++) {
1621 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1622 			ochunks[nochunks] = i;
1623 			fchunks[nfchunks] = i;
1624 			nochunks++;
1625 			nfchunks++;
1626 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1627 		}
1628 	}
1629 
1630 	/*
1631 	 * Walk the ordering, place the chunks in ascending memory order.
1632 	 * Conflicts might arise, these are handled next.
1633 	 */
1634 	do {
1635 		img_index = -1;
1636 		found = 0;
1637 		j = -1;
1638 		for (i = 0; i < nchunks; i++)
1639 			if (chunks[i].base < img_index &&
1640 			    chunks[i].flags == 0 ) {
1641 				j = i;
1642 				img_index = chunks[i].base;
1643 			}
1644 
1645 		if (j != -1) {
1646 			found = 1;
1647 			ochunks[nochunks] = j;
1648 			nochunks++;
1649 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1650 		}
1651 	} while (found);
1652 
1653 	img_index = pig_start;
1654 
1655 	/*
1656 	 * Identify chunk output conflicts (chunks whose pig load area
1657 	 * corresponds to their original memory placement location)
1658 	 */
1659 	for (i = 0; i < nochunks ; i++) {
1660 		overlap = 0;
1661 		r1s = img_index;
1662 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1663 		r2s = chunks[ochunks[i]].base;
1664 		r2e = chunks[ochunks[i]].end;
1665 
1666 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1667 		if (overlap)
1668 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1669 		img_index += chunks[ochunks[i]].compressed_size;
1670 	}
1671 
1672 	/*
1673 	 * Prepare the final output chunk list. Calculate an output
1674 	 * inflate strategy for overlapping chunks if needed.
1675 	 */
1676 	img_index = pig_start;
1677 	for (i = 0; i < nochunks ; i++) {
1678 		/*
1679 		 * If a conflict is detected, consume enough compressed
1680 		 * output chunks to fill the piglet
1681 		 */
1682 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1683 			copy_start = piglet_base;
1684 			copy_end = piglet_end;
1685 			piglet_cur = piglet_base;
1686 			npchunks = 0;
1687 			j = i;
1688 
1689 			while (copy_start < copy_end && j < nochunks) {
1690 				piglet_cur += chunks[ochunks[j]].compressed_size;
1691 				pchunks[npchunks] = ochunks[j];
1692 				npchunks++;
1693 				copy_start += chunks[ochunks[j]].compressed_size;
1694 				img_index += chunks[ochunks[j]].compressed_size;
1695 				i++;
1696 				j++;
1697 			}
1698 
1699 			piglet_cur = piglet_base;
1700 			for (j = 0; j < npchunks; j++) {
1701 				piglet_cur += chunks[pchunks[j]].compressed_size;
1702 				fchunks[nfchunks] = pchunks[j];
1703 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1704 				nfchunks++;
1705 			}
1706 		} else {
1707 			/*
1708 			 * No conflict, chunk can be added without copying
1709 			 */
1710 			if ((chunks[ochunks[i]].flags &
1711 			    HIBERNATE_CHUNK_USED) == 0) {
1712 				fchunks[nfchunks] = ochunks[i];
1713 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1714 				nfchunks++;
1715 			}
1716 			img_index += chunks[ochunks[i]].compressed_size;
1717 		}
1718 	}
1719 
1720 	img_index = pig_start;
1721 	for (i = 0; i < nfchunks; i++) {
1722 		piglet_cur = piglet_base;
1723 		img_index += chunks[fchunks[i]].compressed_size;
1724 	}
1725 
1726 	img_cur = pig_start;
1727 
1728 	for (i = 0; i < nfchunks; i++) {
1729 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1730 		processed = 0;
1731 		compressed_size = chunks[fchunks[i]].compressed_size;
1732 
1733 		while (processed < compressed_size) {
1734 			pmap_kenter_pa(tempva, img_cur, VM_PROT_ALL);
1735 			pmap_kenter_pa(tempva + PAGE_SIZE, img_cur+PAGE_SIZE,
1736 			    VM_PROT_ALL);
1737 			pmap_update(pmap_kernel());
1738 
1739 			if (compressed_size - processed >= PAGE_SIZE)
1740 				read_size = PAGE_SIZE;
1741 			else
1742 				read_size = compressed_size - processed;
1743 
1744 			hibernate_block_io(hib_info, blkctr, read_size,
1745 			    tempva + (img_cur & PAGE_MASK), 0);
1746 
1747 			blkctr += (read_size / hib_info->secsize);
1748 
1749 			hibernate_flush();
1750 			pmap_kremove(tempva, PAGE_SIZE);
1751 			pmap_kremove(tempva + PAGE_SIZE, PAGE_SIZE);
1752 			processed += read_size;
1753 			img_cur += read_size;
1754 		}
1755 	}
1756 
1757 	pmap_kremove(hibernate_fchunk_area, PAGE_SIZE);
1758 	pmap_kremove((vaddr_t)pchunks, PAGE_SIZE);
1759 	pmap_kremove((vaddr_t)fchunks, PAGE_SIZE);
1760 	pmap_update(pmap_kernel());
1761 
1762 	return (0);
1763 }
1764 
1765 /*
1766  * Hibernating a machine comprises the following operations:
1767  *  1. Calculating this machine's hibernate_info information
1768  *  2. Allocating a piglet and saving the piglet's physaddr
1769  *  3. Calculating the memory chunks
1770  *  4. Writing the compressed chunks to disk
1771  *  5. Writing the chunk table
1772  *  6. Writing the signature block (hibernate_info)
1773  *
1774  * On most architectures, the function calling hibernate_suspend would
1775  * then power off the machine using some MD-specific implementation.
1776  */
1777 int
1778 hibernate_suspend(void)
1779 {
1780 	union hibernate_info hib_info;
1781 	size_t swap_size;
1782 
1783 	/*
1784 	 * Calculate memory ranges, swap offsets, etc.
1785 	 * This also allocates a piglet whose physaddr is stored in
1786 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1787 	 */
1788 	if (get_hibernate_info(&hib_info, 1))
1789 		return (1);
1790 
1791 	swap_size = hib_info.image_size + hib_info.secsize +
1792 		HIBERNATE_CHUNK_TABLE_SIZE;
1793 
1794 	if (uvm_swap_check_range(hib_info.device, swap_size)) {
1795 		printf("insufficient swap space for hibernate\n");
1796 		return (1);
1797 	}
1798 
1799 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1800 		VM_PROT_ALL);
1801 	pmap_activate(curproc);
1802 
1803 	/* Stash the piglet VA so we can free it in the resuming kernel */
1804 	global_piglet_va = hib_info.piglet_va;
1805 
1806 	if (hibernate_write_chunks(&hib_info))
1807 		return (1);
1808 
1809 	if (hibernate_write_chunktable(&hib_info))
1810 		return (1);
1811 
1812 	if (hibernate_write_signature(&hib_info))
1813 		return (1);
1814 
1815 	delay(500000);
1816 	return (0);
1817 }
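
/*
 * Illustrative sketch (compiled out): the rough shape of an MD suspend
 * path driving hibernate_suspend().  md_powerdown() stands in for
 * whatever machine-dependent mechanism actually powers the box off and
 * is purely hypothetical.
 */
#if 0
void
md_hibernate_example(void)
{
	if (hibernate_suspend()) {
		/* Suspend failed; release what hibernate_suspend set up. */
		hibernate_free();
		return;
	}
	md_powerdown();			/* hypothetical MD power-off */
}
#endif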
1818 
1819 /*
1820  * Free items allocated by hibernate_suspend()
1821  */
1822 void
1823 hibernate_free(void)
1824 {
1825 	if (global_piglet_va)
1826 		uvm_pmr_free_piglet(global_piglet_va,
1827 		    3*HIBERNATE_CHUNK_SIZE);
1828 
1829 	if (hibernate_copy_page)
1830 		pmap_kremove(hibernate_copy_page, PAGE_SIZE);
1831 	if (hibernate_temp_page)
1832 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1833 
1834 	pmap_update(pmap_kernel());
1835 
1836 	if (hibernate_copy_page)
1837 		km_free((void *)hibernate_copy_page, PAGE_SIZE,
1838 		    &kv_any, &kp_none);
1839 	if (hibernate_temp_page)
1840 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1841 		    &kv_any, &kp_none);
1842 
1843 	global_piglet_va = 0;
1844 	hibernate_copy_page = 0;
1845 	hibernate_temp_page = 0;
1846 }
1847