1 /*	$OpenBSD: subr_hibernate.c,v 1.110 2014/12/17 19:42:15 tedu Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <sys/atomic.h>
32 
33 #include <uvm/uvm.h>
34 #include <uvm/uvm_swap.h>
35 
36 #include <machine/hibernate.h>
37 
38 /*
39  * Hibernate piglet layout information
40  *
41  * The piglet is a scratch area of memory allocated by the suspending kernel.
42  * Its phys and virt addrs are recorded in the signature block. The piglet is
43  * used to guarantee an unused area of memory that can be used by the resuming
44  * kernel for various things. The piglet is excluded during unpack operations.
45  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
46  *
47  * Offset from piglet_base	Purpose
48  * ----------------------------------------------------------------------------
49  * 0				Private page for suspend I/O write functions
50  * 1*PAGE_SIZE			I/O page used during hibernate suspend
51  * 2*PAGE_SIZE			I/O page used during hibernate suspend
52  * 3*PAGE_SIZE			copy page used during hibernate suspend
53  * 4*PAGE_SIZE			final chunk ordering list (24 pages)
54  * 28*PAGE_SIZE			RLE utility page
55  * 29*PAGE_SIZE			start of hiballoc area
56  * 109*PAGE_SIZE		end of hiballoc area (80 pages)
57  * ...				unused
58  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
59  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
60  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
61  */
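
/*
 * Illustrative sketch (not compiled code) of how later code in this file
 * derives its working addresses from the layout above:
 *
 *	hibernate_copy_page = piglet_va + 3 * PAGE_SIZE;
 *	fchunks             = (short *)(piglet_va + 4 * PAGE_SIZE);
 *	hibernate_rle_page  = piglet_va + 28 * PAGE_SIZE;
 *	zlib/hiballoc area  = piglet_va + 29 * PAGE_SIZE;  (80 pages)
 *	chunk table         = piglet_va + HIBERNATE_CHUNK_SIZE;
 *	bounce area         = piglet_va + 2 * HIBERNATE_CHUNK_SIZE;
 */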
62 
63 /* Temporary vaddr ranges used during hibernate */
64 vaddr_t hibernate_temp_page;
65 vaddr_t hibernate_copy_page;
66 vaddr_t hibernate_rle_page;
67 
68 /* Hibernate info as read from disk during resume */
69 union hibernate_info disk_hib;
70 
71 /*
72  * Global copy of the pig start address. This needs to be a global as we
73  * switch stacks after computing it - it can't be stored on the stack.
74  */
75 paddr_t global_pig_start;
76 
77 /*
78  * Global copies of the piglet start addresses (PA/VA). We store these
79  * as globals to avoid having to carry them around as parameters, as the
80  * piglet is allocated early and freed late - its lifecycle extends beyond
81  * that of the hibernate info union which is calculated on suspend/resume.
82  */
83 vaddr_t global_piglet_va;
84 paddr_t global_piglet_pa;
85 
86 /* #define HIB_DEBUG */
87 #ifdef HIB_DEBUG
88 int	hib_debug = 99;
89 #define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
90 #define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
91 #else
92 #define DPRINTF(x...)
93 #define DNPRINTF(n,x...)
94 #endif
95 
96 #ifndef NO_PROPOLICE
97 extern long __guard_local;
98 #endif /* ! NO_PROPOLICE */
99 
100 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
101 int hibernate_calc_rle(paddr_t, paddr_t);
102 int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
103 	size_t *);
104 
105 #define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)
106 
107 /*
108  * Hib alloc enforced alignment.
109  */
110 #define HIB_ALIGN		8 /* bytes alignment */
111 
112 /*
113  * sizeof builtin operation, but with alignment constraint.
114  */
115 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
116 
117 struct hiballoc_entry {
118 	size_t			hibe_use;
119 	size_t			hibe_space;
120 	RB_ENTRY(hiballoc_entry) hibe_entry;
121 };
122 
123 /*
124  * Sort hibernate memory ranges by ascending PA
125  */
126 void
127 hibernate_sort_ranges(union hibernate_info *hib_info)
128 {
129 	int i, j;
130 	struct hibernate_memory_range *ranges;
131 	paddr_t base, end;
132 
133 	ranges = hib_info->ranges;
134 
135 	for (i = 1; i < hib_info->nranges; i++) {
136 		j = i;
137 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
138 			base = ranges[j].base;
139 			end = ranges[j].end;
140 			ranges[j].base = ranges[j - 1].base;
141 			ranges[j].end = ranges[j - 1].end;
142 			ranges[j - 1].base = base;
143 			ranges[j - 1].end = end;
144 			j--;
145 		}
146 	}
147 }
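
/*
 * Example (illustrative): ranges { [0x100000, 0x1fffff], [0x0, 0xfffff] }
 * are reordered to { [0x0, 0xfffff], [0x100000, 0x1fffff] }, so that later
 * chunk calculation and signature comparison walk physical memory in
 * ascending order.
 */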
148 
149 /*
150  * Compare hiballoc entries based on the address they manage.
151  *
 152  * Since the managed address is at a fixed offset from its struct
 153  * hiballoc_entry, we just compare the hiballoc_entry pointers.
154  */
155 static __inline int
156 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
157 {
158 	return l < r ? -1 : (l > r);
159 }
160 
161 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
162 
163 /*
164  * Given a hiballoc entry, return the address it manages.
165  */
166 static __inline void *
167 hib_entry_to_addr(struct hiballoc_entry *entry)
168 {
169 	caddr_t addr;
170 
171 	addr = (caddr_t)entry;
172 	addr += HIB_SIZEOF(struct hiballoc_entry);
173 	return addr;
174 }
175 
176 /*
 177  * Given an address, find the hiballoc entry that manages it.
178  */
179 static __inline struct hiballoc_entry*
180 hib_addr_to_entry(void *addr_param)
181 {
182 	caddr_t addr;
183 
184 	addr = (caddr_t)addr_param;
185 	addr -= HIB_SIZEOF(struct hiballoc_entry);
186 	return (struct hiballoc_entry*)addr;
187 }
188 
189 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
190 
191 /*
192  * Allocate memory from the arena.
193  *
194  * Returns NULL if no memory is available.
195  */
196 void *
197 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
198 {
199 	struct hiballoc_entry *entry, *new_entry;
200 	size_t find_sz;
201 
202 	/*
203 	 * Enforce alignment of HIB_ALIGN bytes.
204 	 *
205 	 * Note that, because the entry is put in front of the allocation,
206 	 * 0-byte allocations are guaranteed a unique address.
207 	 */
208 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
209 
210 	/*
211 	 * Find an entry with hibe_space >= find_sz.
212 	 *
213 	 * If the root node is not large enough, we switch to tree traversal.
214 	 * Because all entries are made at the bottom of the free space,
215 	 * traversal from the end has a slightly better chance of yielding
216 	 * a sufficiently large space.
217 	 */
218 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
219 	entry = RB_ROOT(&arena->hib_addrs);
220 	if (entry != NULL && entry->hibe_space < find_sz) {
221 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
222 			if (entry->hibe_space >= find_sz)
223 				break;
224 		}
225 	}
226 
227 	/*
228 	 * Insufficient or too fragmented memory.
229 	 */
230 	if (entry == NULL)
231 		return NULL;
232 
233 	/*
234 	 * Create new entry in allocated space.
235 	 */
236 	new_entry = (struct hiballoc_entry*)(
237 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
238 	new_entry->hibe_space = entry->hibe_space - find_sz;
239 	new_entry->hibe_use = alloc_sz;
240 
241 	/*
242 	 * Insert entry.
243 	 */
244 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
245 		panic("hib_alloc: insert failure");
246 	entry->hibe_space = 0;
247 
248 	/* Return address managed by entry. */
249 	return hib_entry_to_addr(new_entry);
250 }
251 
252 /*
253  * Free a pointer previously allocated from this arena.
254  *
255  * If addr is NULL, this will be silently accepted.
256  */
257 void
258 hib_free(struct hiballoc_arena *arena, void *addr)
259 {
260 	struct hiballoc_entry *entry, *prev;
261 
262 	if (addr == NULL)
263 		return;
264 
265 	/*
266 	 * Derive entry from addr and check it is really in this arena.
267 	 */
268 	entry = hib_addr_to_entry(addr);
269 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
270 		panic("hib_free: freed item %p not in hib arena", addr);
271 
272 	/*
273 	 * Give the space in entry to its predecessor.
274 	 *
275 	 * If entry has no predecessor, change its used space into free space
276 	 * instead.
277 	 */
278 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
279 	if (prev != NULL &&
280 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
281 	    prev->hibe_use + prev->hibe_space) == entry) {
282 		/* Merge entry. */
283 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
284 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
285 		    entry->hibe_use + entry->hibe_space;
286 	} else {
287 		/* Flip used memory to free space. */
288 		entry->hibe_space += entry->hibe_use;
289 		entry->hibe_use = 0;
290 	}
291 }
292 
293 /*
294  * Initialize hiballoc.
295  *
 296  * The allocator will manage the memory at ptr, which is len bytes long.
297  */
298 int
299 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
300 {
301 	struct hiballoc_entry *entry;
302 	caddr_t ptr;
303 	size_t len;
304 
305 	RB_INIT(&arena->hib_addrs);
306 
307 	/*
308 	 * Hib allocator enforces HIB_ALIGN alignment.
309 	 * Fixup ptr and len.
310 	 */
311 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
312 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
313 	len &= ~((size_t)HIB_ALIGN - 1);
314 
315 	/*
316 	 * Insufficient memory to be able to allocate and also do bookkeeping.
317 	 */
318 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
319 		return ENOMEM;
320 
321 	/*
322 	 * Create entry describing space.
323 	 */
324 	entry = (struct hiballoc_entry*)ptr;
325 	entry->hibe_use = 0;
326 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
327 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
328 
329 	return 0;
330 }
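
/*
 * Illustrative usage sketch (assumed example, with 'area' and 'len' standing
 * in for the piglet's hiballoc pages); the zlib glue later in this file
 * drives the arena in essentially this way:
 *
 *	struct hiballoc_arena arena;
 *	void *p;
 *
 *	if (hiballoc_init(&arena, area, len) == 0) {
 *		p = hib_alloc(&arena, 128);	(NULL when the arena is full)
 *		if (p != NULL)
 *			hib_free(&arena, p);
 *	}
 */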
331 
332 /*
333  * Zero all free memory.
334  */
335 void
336 uvm_pmr_zero_everything(void)
337 {
338 	struct uvm_pmemrange	*pmr;
339 	struct vm_page		*pg;
340 	int			 i;
341 
342 	uvm_lock_fpageq();
343 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
344 		/* Zero single pages. */
345 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
346 		    != NULL) {
347 			uvm_pmr_remove(pmr, pg);
348 			uvm_pagezero(pg);
349 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
350 			uvmexp.zeropages++;
351 			uvm_pmr_insert(pmr, pg, 0);
352 		}
353 
354 		/* Zero multi page ranges. */
355 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
356 		    != NULL) {
357 			pg--; /* Size tree always has second page. */
358 			uvm_pmr_remove(pmr, pg);
359 			for (i = 0; i < pg->fpgsz; i++) {
360 				uvm_pagezero(&pg[i]);
361 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
362 				uvmexp.zeropages++;
363 			}
364 			uvm_pmr_insert(pmr, pg, 0);
365 		}
366 	}
367 	uvm_unlock_fpageq();
368 }
369 
370 /*
371  * Mark all memory as dirty.
372  *
373  * Used to inform the system that the clean memory isn't clean for some
374  * reason, for example because we just came back from hibernate.
375  */
376 void
377 uvm_pmr_dirty_everything(void)
378 {
379 	struct uvm_pmemrange	*pmr;
380 	struct vm_page		*pg;
381 	int			 i;
382 
383 	uvm_lock_fpageq();
384 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
385 		/* Dirty single pages. */
386 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
387 		    != NULL) {
388 			uvm_pmr_remove(pmr, pg);
389 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
390 			uvm_pmr_insert(pmr, pg, 0);
391 		}
392 
393 		/* Dirty multi page ranges. */
394 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
395 		    != NULL) {
396 			pg--; /* Size tree always has second page. */
397 			uvm_pmr_remove(pmr, pg);
398 			for (i = 0; i < pg->fpgsz; i++)
399 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
400 			uvm_pmr_insert(pmr, pg, 0);
401 		}
402 	}
403 
404 	uvmexp.zeropages = 0;
405 	uvm_unlock_fpageq();
406 }
407 
408 /*
409  * Allocate an area that can hold sz bytes and doesn't overlap with
410  * the piglet at piglet_pa.
411  */
412 int
413 uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
414 {
415 	struct uvm_constraint_range pig_constraint;
416 	struct kmem_pa_mode kp_pig = {
417 		.kp_constraint = &pig_constraint,
418 		.kp_maxseg = 1
419 	};
420 	vaddr_t va;
421 
422 	sz = round_page(sz);
423 
424 	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
425 	pig_constraint.ucr_high = -1;
426 
427 	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
428 	if (va == 0) {
429 		pig_constraint.ucr_low = 0;
430 		pig_constraint.ucr_high = piglet_pa - 1;
431 
432 		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
433 		if (va == 0)
434 			return ENOMEM;
435 	}
436 
437 	pmap_extract(pmap_kernel(), va, pa);
438 	return 0;
439 }
440 
441 /*
442  * Allocate a piglet area.
443  *
444  * This needs to be in DMA-safe memory.
 445  * Piglets are aligned to the requested 'align' boundary.
446  *
447  * sz and align in bytes.
448  *
449  * The call will sleep for the pagedaemon to attempt to free memory.
 450  * The pagedaemon may decide it's not possible to free enough memory, causing
451  * the allocation to fail.
452  */
453 int
454 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
455 {
456 	struct kmem_pa_mode kp_piglet = {
457 		.kp_constraint = &dma_constraint,
458 		.kp_align = align,
459 		.kp_maxseg = 1
460 	};
461 
462 	/* Ensure align is a power of 2 */
463 	KASSERT((align & (align - 1)) == 0);
464 
465 	/*
466 	 * Fixup arguments: align must be at least PAGE_SIZE,
467 	 * sz will be converted to pagecount, since that is what
468 	 * pmemrange uses internally.
469 	 */
470 	if (align < PAGE_SIZE)
471 		align = PAGE_SIZE;
472 	sz = round_page(sz);
473 
474 	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
475 	if (*va == 0)
476 		return ENOMEM;
477 
478 	pmap_extract(pmap_kernel(), *va, pa);
479 	return 0;
480 }
481 
482 /*
483  * Free a piglet area.
484  */
485 void
486 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
487 {
488 	/*
489 	 * Fix parameters.
490 	 */
491 	sz = round_page(sz);
492 
493 	/*
494 	 * Free the physical and virtual memory.
495 	 */
496 	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
497 }
498 
499 /*
500  * Physmem RLE compression support.
501  *
502  * Given a physical page address, return the number of pages starting at the
503  * address that are free.  Clamps to the number of pages in
504  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
505  */
506 int
507 uvm_page_rle(paddr_t addr)
508 {
509 	struct vm_page		*pg, *pg_end;
510 	struct vm_physseg	*vmp;
511 	int			 pseg_idx, off_idx;
512 
513 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
514 	if (pseg_idx == -1)
515 		return 0;
516 
517 	vmp = &vm_physmem[pseg_idx];
518 	pg = &vmp->pgs[off_idx];
519 	if (!(pg->pg_flags & PQ_FREE))
520 		return 0;
521 
522 	/*
523 	 * Search for the first non-free page after pg.
524 	 * Note that the page may not be the first page in a free pmemrange,
525 	 * therefore pg->fpgsz cannot be used.
526 	 */
527 	for (pg_end = pg; pg_end <= vmp->lastpg &&
528 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
529 		;
530 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
531 }
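
/*
 * Example (illustrative): if the three pages starting at addr are free and
 * the fourth is not, uvm_page_rle(addr) returns 3; the suspend-side writer
 * records that run instead of compressing the page contents, and the
 * resume-side unpacker advances its destination by 3 * PAGE_SIZE.
 */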
532 
533 /*
534  * Fills out the hibernate_info union pointed to by hib
535  * with information about this machine (swap signature block
 536  * offsets, number of memory ranges, kernel in use, etc.).
537  */
538 int
539 get_hibernate_info(union hibernate_info *hib, int suspend)
540 {
541 	struct disklabel dl;
542 	char err_string[128], *dl_ret;
543 
544 #ifndef NO_PROPOLICE
545 	/* Save propolice guard */
546 	hib->guard = __guard_local;
547 #endif /* ! NO_PROPOLICE */
548 
549 	/* Determine I/O function to use */
550 	hib->io_func = get_hibernate_io_function(swdevt[0].sw_dev);
551 	if (hib->io_func == NULL)
552 		return (1);
553 
554 	/* Calculate hibernate device */
555 	hib->dev = swdevt[0].sw_dev;
556 
557 	/* Read disklabel (used to calculate signature and image offsets) */
558 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));
559 
560 	if (dl_ret) {
561 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
562 		return (1);
563 	}
564 
565 	/* Make sure we have a swap partition. */
566 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
567 	    DL_GETPSIZE(&dl.d_partitions[1]) == 0)
568 		return (1);
569 
570 	/* Make sure the signature can fit in one block */
571 	if (sizeof(union hibernate_info) > DEV_BSIZE)
572 		return (1);
573 
574 	/* Magic number */
575 	hib->magic = HIBERNATE_MAGIC;
576 
577 	/* Calculate signature block location */
578 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[1]) -
579 	    sizeof(union hibernate_info)/DEV_BSIZE;
580 
581 	/* Stash kernel version information */
582 	memset(&hib->kernel_version, 0, 128);
583 	bcopy(version, &hib->kernel_version,
584 	    min(strlen(version), sizeof(hib->kernel_version)-1));
585 
586 	if (suspend) {
587 		/* Grab the previously-allocated piglet addresses */
588 		hib->piglet_va = global_piglet_va;
589 		hib->piglet_pa = global_piglet_pa;
590 		hib->io_page = (void *)hib->piglet_va;
591 
592 		/*
593 		 * Initialization of the hibernate IO function for drivers
594 		 * that need to do prep work (such as allocating memory or
595 		 * setting up data structures that cannot safely be done
596 		 * during suspend without causing side effects). There is
597 		 * a matching HIB_DONE call performed after the write is
598 		 * completed.
599 		 */
600 		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[1]),
601 		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[1]),
602 		    HIB_INIT, hib->io_page))
603 			goto fail;
604 
605 	} else {
606 		/*
 607 		 * Resuming kernels use a regular private page for the driver.
608 		 * No need to free this I/O page as it will vanish as part of
609 		 * the resume.
610 		 */
611 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
612 		if (!hib->io_page)
613 			goto fail;
614 	}
615 
616 	if (get_hibernate_info_md(hib))
617 		goto fail;
618 
619 	return (0);
620 
621 fail:
622 	return (1);
623 }
624 
625 /*
626  * Allocate nitems*size bytes from the hiballoc area presently in use
627  */
628 void *
629 hibernate_zlib_alloc(void *unused, int nitems, int size)
630 {
631 	struct hibernate_zlib_state *hibernate_state;
632 
633 	hibernate_state =
634 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
635 
636 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
637 }
638 
639 /*
640  * Free the memory pointed to by addr in the hiballoc area presently in
641  * use
642  */
643 void
644 hibernate_zlib_free(void *unused, void *addr)
645 {
646 	struct hibernate_zlib_state *hibernate_state;
647 
648 	hibernate_state =
649 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
650 
651 	hib_free(&hibernate_state->hiballoc_arena, addr);
652 }
653 
654 /*
655  * Inflate next page of data from the image stream.
 656  * The rle parameter is modified on exit to contain the number of pages to
 657  * skip in the output stream (or 0 if a page of data was inflated instead).
658  *
659  * Returns 0 if the stream contains additional data, or 1 if the stream is
660  * finished.
661  */
662 int
663 hibernate_inflate_page(int *rle)
664 {
665 	struct hibernate_zlib_state *hibernate_state;
666 	int i;
667 
668 	hibernate_state =
669 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
670 
671 	/* Set up the stream for RLE code inflate */
672 	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
673 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
674 
675 	/* Inflate RLE code */
676 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
677 	if (i != Z_OK && i != Z_STREAM_END) {
678 		/*
679 		 * XXX - this will likely reboot/hang most machines
680 		 *       since the console output buffer will be unmapped,
681 		 *       but there's not much else we can do here.
682 		 */
683 		panic("rle inflate stream error");
684 	}
685 
686 	if (hibernate_state->hib_stream.avail_out != 0) {
687 		/*
688 		 * XXX - this will likely reboot/hang most machines
689 		 *       since the console output buffer will be unmapped,
690 		 *       but there's not much else we can do here.
691 		 */
692 		panic("rle short inflate error");
693 	}
694 
695 	if (*rle < 0 || *rle > 1024) {
696 		/*
697 		 * XXX - this will likely reboot/hang most machines
698 		 *       since the console output buffer will be unmapped,
699 		 *       but there's not much else we can do here.
700 		 */
701 		panic("invalid rle count");
702 	}
703 
704 	if (i == Z_STREAM_END)
705 		return (1);
706 
707 	if (*rle != 0)
708 		return (0);
709 
710 	/* Set up the stream for page inflate */
711 	hibernate_state->hib_stream.next_out =
712 		(unsigned char *)HIBERNATE_INFLATE_PAGE;
713 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
714 
715 	/* Process next block of data */
716 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
717 	if (i != Z_OK && i != Z_STREAM_END) {
718 		/*
719 		 * XXX - this will likely reboot/hang most machines
720 		 *       since the console output buffer will be unmapped,
721 		 *       but there's not much else we can do here.
722 		 */
723 		panic("inflate error");
724 	}
725 
726 	/* We should always have extracted a full page ... */
727 	if (hibernate_state->hib_stream.avail_out != 0) {
728 		/*
729 		 * XXX - this will likely reboot/hang most machines
730 		 *       since the console output buffer will be unmapped,
731 		 *       but there's not much else we can do here.
732 		 */
733 		panic("incomplete page");
734 	}
735 
736 	return (i == Z_STREAM_END);
737 }
738 
739 /*
 740  * Inflate size bytes from src into dest, skipping any destination pages
 741  * that are special (see hibernate_inflate_skip).
742  *
743  * This function executes while using the resume-time stack
744  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
745  * will likely hang or reset the machine since the console output buffer
746  * will be unmapped.
747  */
748 void
749 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
750     paddr_t src, size_t size)
751 {
752 	int end_stream = 0, rle;
753 	struct hibernate_zlib_state *hibernate_state;
754 
755 	hibernate_state =
756 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
757 
758 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
759 	hibernate_state->hib_stream.avail_in = size;
760 
761 	do {
762 		/*
763 		 * Is this a special page? If yes, redirect the
764 		 * inflate output to a scratch page (eg, discard it)
765 		 */
766 		if (hibernate_inflate_skip(hib, dest)) {
767 			hibernate_enter_resume_mapping(
768 			    HIBERNATE_INFLATE_PAGE,
769 			    HIBERNATE_INFLATE_PAGE, 0);
770 		} else {
771 			hibernate_enter_resume_mapping(
772 			    HIBERNATE_INFLATE_PAGE, dest, 0);
773 		}
774 
775 		hibernate_flush();
776 		end_stream = hibernate_inflate_page(&rle);
777 
778 		if (rle == 0)
779 			dest += PAGE_SIZE;
780 		else
781 			dest += (rle * PAGE_SIZE);
782 	} while (!end_stream);
783 }
784 
785 /*
 786  * Deflate from src into the I/O page, up to 'remaining' bytes.
 787  *
 788  * Returns the number of input bytes consumed, and updates the 'remaining'
 789  * parameter to reflect how much output space is left (this information
 790  * is needed to know how much to write to disk).
791  */
792 size_t
793 hibernate_deflate(union hibernate_info *hib, paddr_t src,
794     size_t *remaining)
795 {
796 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
797 	struct hibernate_zlib_state *hibernate_state;
798 
799 	hibernate_state =
800 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
801 
802 	/* Set up the stream for deflate */
803 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
804 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
805 	hibernate_state->hib_stream.next_out =
806 		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
807 	hibernate_state->hib_stream.avail_out = *remaining;
808 
809 	/* Process next block of data */
810 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
811 		panic("hibernate zlib deflate error");
812 
813 	/* Update pointers and return number of bytes consumed */
814 	*remaining = hibernate_state->hib_stream.avail_out;
815 	return (PAGE_SIZE - (src & PAGE_MASK)) -
816 	    hibernate_state->hib_stream.avail_in;
817 }
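
/*
 * Worked example (illustrative): with src page-aligned and *remaining equal
 * to PAGE_SIZE on entry, avail_in starts at PAGE_SIZE; if deflate() consumes
 * the whole input page while emitting 100 bytes, *remaining becomes
 * PAGE_SIZE - 100 and the function returns PAGE_SIZE (all input consumed).
 */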
818 
819 /*
 820  * Write the hibernation information specified in hib
821  * to the location in swap previously calculated (last block of
822  * swap), called the "signature block".
823  */
824 int
825 hibernate_write_signature(union hibernate_info *hib)
826 {
827 	/* Write hibernate info to disk */
828 	return (hib->io_func(hib->dev, hib->sig_offset,
829 	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
830 	    hib->io_page));
831 }
832 
833 /*
834  * Write the memory chunk table to the area in swap immediately
835  * preceding the signature block. The chunk table is stored
836  * in the piglet when this function is called.  Returns errno.
837  */
838 int
839 hibernate_write_chunktable(union hibernate_info *hib)
840 {
841 	vaddr_t hibernate_chunk_table_start;
842 	size_t hibernate_chunk_table_size;
843 	int i, err;
844 
845 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
846 
847 	hibernate_chunk_table_start = hib->piglet_va +
848 	    HIBERNATE_CHUNK_SIZE;
849 
850 	/* Write chunk table */
851 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
852 		if ((err = hib->io_func(hib->dev,
853 		    hib->chunktable_offset + (i/DEV_BSIZE),
854 		    (vaddr_t)(hibernate_chunk_table_start + i),
855 		    MAXPHYS, HIB_W, hib->io_page))) {
856 			DPRINTF("chunktable write error: %d\n", err);
857 			return (err);
858 		}
859 	}
860 
861 	return (0);
862 }
863 
864 /*
865  * Write an empty hiber_info to the swap signature block, which is
866  * guaranteed to not match any valid hib.
867  */
868 int
869 hibernate_clear_signature(void)
870 {
871 	union hibernate_info blank_hiber_info;
872 	union hibernate_info hib;
873 
874 	/* Zero out a blank hiber_info */
875 	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));
876 
877 	/* Get the signature block location */
878 	if (get_hibernate_info(&hib, 0))
879 		return (1);
880 
881 	/* Write (zeroed) hibernate info to disk */
882 	DPRINTF("clearing hibernate signature block location: %lld\n",
883 		hib.sig_offset);
884 	if (hibernate_block_io(&hib,
885 	    hib.sig_offset,
886 	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
887 		printf("Warning: could not clear hibernate signature\n");
888 
889 	return (0);
890 }
891 
892 /*
 893  * Compare two hibernate_infos to determine if they are the same (e.g.,
 894  * whether we should be performing a hibernate resume on this machine).
895  * Not all fields are checked - just enough to verify that the machine
896  * has the same memory configuration and kernel as the one that
897  * wrote the signature previously.
898  */
899 int
900 hibernate_compare_signature(union hibernate_info *mine,
901     union hibernate_info *disk)
902 {
903 	u_int i;
904 
905 	if (mine->nranges != disk->nranges) {
906 		DPRINTF("hibernate memory range count mismatch\n");
907 		return (1);
908 	}
909 
910 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0) {
911 		DPRINTF("hibernate kernel version mismatch\n");
912 		return (1);
913 	}
914 
915 	for (i = 0; i < mine->nranges; i++) {
916 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
917 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
918 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
919 				i,
920 				(void *)mine->ranges[i].base,
921 				(void *)mine->ranges[i].end,
922 				(void *)disk->ranges[i].base,
923 				(void *)disk->ranges[i].end);
924 			return (1);
925 		}
926 	}
927 
928 	return (0);
929 }
930 
931 /*
932  * Transfers xfer_size bytes between the hibernate device specified in
 933  * hib at offset blkctr and the vaddr specified at dest.
934  *
935  * Separate offsets and pages are used to handle misaligned reads (reads
936  * that span a page boundary).
937  *
938  * blkctr specifies a relative offset (relative to the start of swap),
939  * not an absolute disk offset
940  *
941  */
942 int
943 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
944     size_t xfer_size, vaddr_t dest, int iswrite)
945 {
946 	struct buf *bp;
947 	struct bdevsw *bdsw;
948 	int error;
949 
950 	bp = geteblk(xfer_size);
951 	bdsw = &bdevsw[major(hib->dev)];
952 
953 	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
954 	if (error) {
955 		printf("hibernate_block_io open failed\n");
956 		return (1);
957 	}
958 
959 	if (iswrite)
960 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
961 
962 	bp->b_bcount = xfer_size;
963 	bp->b_blkno = blkctr;
964 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
965 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
966 	bp->b_dev = hib->dev;
967 	(*bdsw->d_strategy)(bp);
968 
969 	error = biowait(bp);
970 	if (error) {
971 		printf("hib block_io biowait error %d blk %lld size %zu\n",
972 			error, (long long)blkctr, xfer_size);
973 		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
974 		    curproc);
975 		if (error)
976 			printf("hibernate_block_io error close failed\n");
977 		return (1);
978 	}
979 
980 	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
981 	if (error) {
982 		printf("hibernate_block_io close failed\n");
983 		return (1);
984 	}
985 
986 	if (!iswrite)
987 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
988 
989 	bp->b_flags |= B_INVAL;
990 	brelse(bp);
991 
992 	return (0);
993 }
994 
995 /*
996  * Reads the signature block from swap, checks against the current machine's
 997  * information. If the information matches, performs a resume by reading the
 998  * saved image into the pig area and unpacking it.
999  */
1000 void
1001 hibernate_resume(void)
1002 {
1003 	union hibernate_info hib;
1004 	int s;
1005 
1006 	/* Get current running machine's hibernate info */
1007 	memset(&hib, 0, sizeof(hib));
1008 	if (get_hibernate_info(&hib, 0)) {
1009 		DPRINTF("couldn't retrieve machine's hibernate info\n");
1010 		return;
1011 	}
1012 
1013 	/* Read hibernate info from disk */
1014 	s = splbio();
1015 
1016 	DPRINTF("reading hibernate signature block location: %lld\n",
1017 		hib.sig_offset);
1018 
1019 	if (hibernate_block_io(&hib,
1020 	    hib.sig_offset,
1021 	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
 1022 		DPRINTF("error in hibernate read\n");
1023 		splx(s);
1024 		return;
1025 	}
1026 
1027 	/* Check magic number */
1028 	if (disk_hib.magic != HIBERNATE_MAGIC) {
1029 		DPRINTF("wrong magic number in hibernate signature: %x\n",
1030 			disk_hib.magic);
1031 		splx(s);
1032 		return;
1033 	}
1034 
1035 	/*
1036 	 * We (possibly) found a hibernate signature. Clear signature first,
1037 	 * to prevent accidental resume or endless resume cycles later.
1038 	 */
1039 	if (hibernate_clear_signature()) {
1040 		DPRINTF("error clearing hibernate signature block\n");
1041 		splx(s);
1042 		return;
1043 	}
1044 
1045 	/*
1046 	 * If on-disk and in-memory hibernate signatures match,
1047 	 * this means we should do a resume from hibernate.
1048 	 */
1049 	if (hibernate_compare_signature(&hib, &disk_hib)) {
1050 		DPRINTF("mismatched hibernate signature block\n");
1051 		splx(s);
1052 		return;
1053 	}
1054 
1055 #ifdef MULTIPROCESSOR
1056 	/* XXX - if we fail later, we may need to rehatch APs on some archs */
1057 	DPRINTF("hibernate: quiescing APs\n");
1058 	hibernate_quiesce_cpus();
1059 #endif /* MULTIPROCESSOR */
1060 
1061 	/* Read the image from disk into the image (pig) area */
1062 	if (hibernate_read_image(&disk_hib))
1063 		goto fail;
1064 
1065 	DPRINTF("hibernate: quiescing devices\n");
1066 	if (config_suspend_all(DVACT_QUIESCE) != 0)
1067 		goto fail;
1068 
1069 	(void) splhigh();
1070 	hibernate_disable_intr_machdep();
1071 	cold = 1;
1072 
1073 	DPRINTF("hibernate: suspending devices\n");
1074 	if (config_suspend_all(DVACT_SUSPEND) != 0) {
1075 		cold = 0;
1076 		hibernate_enable_intr_machdep();
1077 		goto fail;
1078 	}
1079 
1080 	printf("Unpacking image...\n");
1081 
1082 	/* Switch stacks */
1083 	DPRINTF("hibernate: switching stacks\n");
1084 	hibernate_switch_stack_machdep();
1085 
1086 #ifndef NO_PROPOLICE
1087 	/* Start using suspended kernel's propolice guard */
1088 	__guard_local = disk_hib.guard;
1089 #endif /* ! NO_PROPOLICE */
1090 
1091 	/* Unpack and resume */
1092 	hibernate_unpack_image(&disk_hib);
1093 
1094 fail:
1095 	splx(s);
1096 	printf("\nUnable to resume hibernated image\n");
1097 }
1098 
1099 /*
1100  * Unpack image from pig area to original location by looping through the
1101  * list of output chunks in the order they should be restored (fchunks).
1102  *
1103  * Note that due to the stack smash protector and the fact that we have
1104  * switched stacks, it is not permitted to return from this function.
1105  */
1106 void
1107 hibernate_unpack_image(union hibernate_info *hib)
1108 {
1109 	struct hibernate_disk_chunk *chunks;
1110 	union hibernate_info local_hib;
1111 	paddr_t image_cur = global_pig_start;
1112 	short i, *fchunks;
1113 	char *pva;
1114 
1115 	/* Piglet will be identity mapped (VA == PA) */
1116 	pva = (char *)hib->piglet_pa;
1117 
1118 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1119 
1120 	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);
1121 
 1122 	/* Can't use the hib that's passed in after this point */
1123 	bcopy(hib, &local_hib, sizeof(union hibernate_info));
1124 
1125 	/* VA == PA */
1126 	local_hib.piglet_va = local_hib.piglet_pa;
1127 
1128 	/*
1129 	 * Point of no return. Once we pass this point, only kernel code can
1130 	 * be accessed. No global variables or other kernel data structures
1131 	 * are guaranteed to be coherent after unpack starts.
1132 	 *
1133 	 * The image is now in high memory (pig area), we unpack from the pig
1134 	 * to the correct location in memory. We'll eventually end up copying
1135 	 * on top of ourself, but we are assured the kernel code here is the
1136 	 * same between the hibernated and resuming kernel, and we are running
1137 	 * on our own stack, so the overwrite is ok.
1138 	 */
1139 	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
1140 	hibernate_activate_resume_pt_machdep();
1141 
1142 	for (i = 0; i < local_hib.chunk_ctr; i++) {
1143 		/* Reset zlib for inflate */
1144 		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
1145 			panic("hibernate failed to reset zlib for inflate");
1146 
1147 		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
1148 		    image_cur);
1149 
1150 		image_cur += chunks[fchunks[i]].compressed_size;
1151 
1152 	}
1153 
1154 	/*
1155 	 * Resume the loaded kernel by jumping to the MD resume vector.
1156 	 * We won't be returning from this call.
1157 	 */
1158 	hibernate_resume_machdep();
1159 }
1160 
1161 /*
1162  * Bounce a compressed image chunk to the piglet, entering mappings for the
1163  * copied pages as needed
1164  */
1165 void
1166 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1167 {
1168 	size_t ct, ofs;
1169 	paddr_t src = img_cur;
1170 	vaddr_t dest = piglet;
1171 
1172 	/* Copy first partial page */
1173 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1174 	ofs = (src & PAGE_MASK);
1175 
1176 	if (ct < PAGE_SIZE) {
1177 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1178 			(src - ofs), 0);
1179 		hibernate_flush();
1180 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1181 		src += ct;
1182 		dest += ct;
1183 	}
1184 
1185 	/* Copy remaining pages */
1186 	while (src < size + img_cur) {
1187 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1188 		hibernate_flush();
1189 		ct = PAGE_SIZE;
1190 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1191 		hibernate_flush();
1192 		src += ct;
1193 		dest += ct;
1194 	}
1195 }
1196 
1197 /*
1198  * Process a chunk by bouncing it to the piglet, followed by unpacking
1199  */
1200 void
1201 hibernate_process_chunk(union hibernate_info *hib,
1202     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1203 {
1204 	char *pva = (char *)hib->piglet_va;
1205 
1206 	hibernate_copy_chunk_to_piglet(img_cur,
1207 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1208 	hibernate_inflate_region(hib, chunk->base,
1209 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1210 	    chunk->compressed_size);
1211 }
1212 
1213 /*
1214  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1215  * inaddr and range_end.
1216  */
1217 int
1218 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1219 {
1220 	int rle;
1221 
1222 	rle = uvm_page_rle(inaddr);
1223 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1224 
1225 	/* Clamp RLE to range end */
1226 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1227 		rle = (range_end - inaddr) / PAGE_SIZE;
1228 
1229 	return (rle);
1230 }
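
/*
 * Example (illustrative): if uvm_page_rle() reports a run of 10 free pages
 * at inaddr but range_end is only 4 pages away, the run is clamped to
 * (range_end - inaddr) / PAGE_SIZE == 4, so an RLE never extends past the
 * end of the chunk being written.
 */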
1231 
1232 /*
1233  * Write the RLE byte for page at 'inaddr' to the output stream.
1234  * Returns the number of pages to be skipped at 'inaddr'.
1235  */
1236 int
1237 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1238 	paddr_t range_end, daddr_t *blkctr,
1239 	size_t *out_remaining)
1240 {
1241 	int rle, err, *rleloc;
1242 	struct hibernate_zlib_state *hibernate_state;
1243 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1244 
1245 	hibernate_state =
1246 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1247 
1248 	rle = hibernate_calc_rle(inaddr, range_end);
1249 
1250 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1251 	*rleloc = rle;
1252 
1253 	/* Deflate the RLE byte into the stream */
1254 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1255 
1256 	/* Did we fill the output page? If so, flush to disk */
1257 	if (*out_remaining == 0) {
1258 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1259 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1260 			hib->io_page))) {
1261 				DPRINTF("hib write error %d\n", err);
1262 				return (err);
1263 		}
1264 
1265 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1266 		*out_remaining = PAGE_SIZE;
1267 
1268 		/* If we didn't deflate the entire RLE byte, finish it now */
1269 		if (hibernate_state->hib_stream.avail_in != 0)
1270 			hibernate_deflate(hib,
1271 				(vaddr_t)hibernate_state->hib_stream.next_in,
1272 				out_remaining);
1273 	}
1274 
1275 	return (rle);
1276 }
1277 
1278 /*
1279  * Write a compressed version of this machine's memory to disk, at the
1280  * precalculated swap offset:
1281  *
1282  * end of swap - signature block size - chunk table size - memory size
1283  *
1284  * The function begins by looping through each phys mem range, cutting each
1285  * one into MD sized chunks. These chunks are then compressed individually
1286  * and written out to disk, in phys mem order. Some chunks might compress
1287  * more than others, and for this reason, each chunk's size is recorded
1288  * in the chunk table, which is written to disk after the image has
1289  * properly been compressed and written (in hibernate_write_chunktable).
1290  *
1291  * When this function is called, the machine is nearly suspended - most
1292  * devices are quiesced/suspended, interrupts are off, and cold has
1293  * been set. This means that there can be no side effects once the
1294  * write has started, and the write function itself can also have no
1295  * side effects. This also means no printfs are permitted (since printf
1296  * has side effects.)
1297  *
1298  * Return values :
1299  *
1300  * 0      - success
1301  * EIO    - I/O error occurred writing the chunks
1302  * EINVAL - Failed to write a complete range
1303  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1304  */
1305 int
1306 hibernate_write_chunks(union hibernate_info *hib)
1307 {
1308 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1309 	size_t nblocks, out_remaining, used;
1310 	struct hibernate_disk_chunk *chunks;
1311 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1312 	daddr_t blkctr = 0;
1313 	int i, rle, err;
1314 	struct hibernate_zlib_state *hibernate_state;
1315 
1316 	hibernate_state =
1317 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1318 
1319 	hib->chunk_ctr = 0;
1320 
1321 	/*
1322 	 * Map the utility VAs to the piglet. See the piglet map at the
1323 	 * top of this file for piglet layout information.
1324 	 */
1325 	hibernate_copy_page = hib->piglet_va + 3 * PAGE_SIZE;
1326 	hibernate_rle_page = hib->piglet_va + 28 * PAGE_SIZE;
1327 
1328 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1329 	    HIBERNATE_CHUNK_SIZE);
1330 
1331 	/* Calculate the chunk regions */
1332 	for (i = 0; i < hib->nranges; i++) {
1333 		range_base = hib->ranges[i].base;
1334 		range_end = hib->ranges[i].end;
1335 
1336 		inaddr = range_base;
1337 
1338 		while (inaddr < range_end) {
1339 			chunks[hib->chunk_ctr].base = inaddr;
1340 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1341 				chunks[hib->chunk_ctr].end = inaddr +
1342 				    HIBERNATE_CHUNK_SIZE;
1343 			else
1344 				chunks[hib->chunk_ctr].end = range_end;
1345 
1346 			inaddr += HIBERNATE_CHUNK_SIZE;
1347 			hib->chunk_ctr ++;
1348 		}
1349 	}
1350 
1351 	uvm_pmr_dirty_everything();
1352 	uvm_pmr_zero_everything();
1353 
1354 	/* Compress and write the chunks in the chunktable */
1355 	for (i = 0; i < hib->chunk_ctr; i++) {
1356 		range_base = chunks[i].base;
1357 		range_end = chunks[i].end;
1358 
1359 		chunks[i].offset = blkctr + hib->image_offset;
1360 
1361 		/* Reset zlib for deflate */
1362 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1363 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1364 			return (ENOMEM);
1365 		}
1366 
1367 		inaddr = range_base;
1368 
1369 		/*
1370 		 * For each range, loop through its phys mem region
1371 		 * and write out the chunks (the last chunk might be
1372 		 * smaller than the chunk size).
1373 		 */
1374 		while (inaddr < range_end) {
1375 			out_remaining = PAGE_SIZE;
1376 			while (out_remaining > 0 && inaddr < range_end) {
1377 				/*
1378 				 * Adjust for regions that are not evenly
1379 				 * divisible by PAGE_SIZE or overflowed
1380 				 * pages from the previous iteration.
1381 				 */
1382 				temp_inaddr = (inaddr & PAGE_MASK) +
1383 				    hibernate_copy_page;
1384 
1385 				/* Deflate from temp_inaddr to IO page */
1386 				if (inaddr != range_end) {
1387 					if (inaddr % PAGE_SIZE == 0) {
1388 						rle = hibernate_write_rle(hib,
1389 							inaddr,
1390 							range_end,
1391 							&blkctr,
1392 							&out_remaining);
1393 					}
1394 
1395 					if (rle == 0) {
1396 						pmap_kenter_pa(hibernate_temp_page,
1397 							inaddr & PMAP_PA_MASK,
1398 							PROT_READ);
1399 
1400 						pmap_activate(curproc);
1401 
1402 						bcopy((caddr_t)hibernate_temp_page,
1403 							(caddr_t)hibernate_copy_page,
1404 							PAGE_SIZE);
1405 						inaddr += hibernate_deflate(hib,
1406 							temp_inaddr,
1407 							&out_remaining);
1408 					} else {
1409 						inaddr += rle * PAGE_SIZE;
1410 						if (inaddr > range_end)
1411 							inaddr = range_end;
1412 					}
1413 
1414 				}
1415 
1416 				if (out_remaining == 0) {
1417 					/* Filled up the page */
1418 					nblocks = PAGE_SIZE / DEV_BSIZE;
1419 
1420 					if ((err = hib->io_func(hib->dev,
1421 					    blkctr + hib->image_offset,
1422 					    (vaddr_t)hibernate_io_page,
1423 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1424 						DPRINTF("hib write error %d\n",
1425 						    err);
1426 						return (err);
1427 					}
1428 
1429 					blkctr += nblocks;
1430 				}
1431 			}
1432 		}
1433 
1434 		if (inaddr != range_end) {
1435 			DPRINTF("deflate range ended prematurely\n");
1436 			return (EINVAL);
1437 		}
1438 
1439 		/*
 1440 		 * End of range. Round up to the next secsize boundary
 1441 		 * after finishing the compression.
1442 		 */
1443 		if (out_remaining == 0)
1444 			out_remaining = PAGE_SIZE;
1445 
1446 		/* Finish compress */
1447 		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
1448 		hibernate_state->hib_stream.avail_in = 0;
1449 		hibernate_state->hib_stream.next_out =
1450 		    (unsigned char *)hibernate_io_page +
1451 			(PAGE_SIZE - out_remaining);
1452 
1453 		/* We have an extra output page available for finalize */
1454 		hibernate_state->hib_stream.avail_out =
1455 			out_remaining + PAGE_SIZE;
1456 
1457 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1458 		    Z_STREAM_END) {
1459 			DPRINTF("deflate error in output stream: %d\n", err);
1460 			return (err);
1461 		}
1462 
1463 		out_remaining = hibernate_state->hib_stream.avail_out;
1464 
1465 		used = 2 * PAGE_SIZE - out_remaining;
1466 		nblocks = used / DEV_BSIZE;
1467 
1468 		/* Round up to next block if needed */
1469 		if (used % DEV_BSIZE != 0)
1470 			nblocks ++;
1471 
1472 		/* Write final block(s) for this chunk */
1473 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1474 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1475 		    HIB_W, hib->io_page))) {
1476 			DPRINTF("hib final write error %d\n", err);
1477 			return (err);
1478 		}
1479 
1480 		blkctr += nblocks;
1481 
1482 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1483 		    chunks[i].offset) * DEV_BSIZE;
1484 	}
1485 
1486 	hib->chunktable_offset = hib->image_offset + blkctr;
1487 	return (0);
1488 }
1489 
1490 /*
1491  * Reset the zlib stream state and allocate a new hiballoc area for either
1492  * inflate or deflate. This function is called once for each hibernate chunk.
1493  * Calling hiballoc_init multiple times is acceptable since the memory it is
1494  * provided is unmanaged memory (stolen). We use the memory provided to us
1495  * by the piglet allocated via the supplied hib.
1496  */
1497 int
1498 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1499 {
1500 	vaddr_t hibernate_zlib_start;
1501 	size_t hibernate_zlib_size;
1502 	char *pva = (char *)hib->piglet_va;
1503 	struct hibernate_zlib_state *hibernate_state;
1504 
1505 	hibernate_state =
1506 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1507 
1508 	if (!deflate)
1509 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1510 
1511 	/*
1512 	 * See piglet layout information at the start of this file for
1513 	 * information on the zlib page assignments.
1514 	 */
1515 	hibernate_zlib_start = (vaddr_t)(pva + (29 * PAGE_SIZE));
1516 	hibernate_zlib_size = 80 * PAGE_SIZE;
1517 
1518 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1519 	memset(hibernate_state, 0, PAGE_SIZE);
1520 
1521 	/* Set up stream structure */
1522 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1523 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1524 
1525 	/* Initialize the hiballoc arena for zlib allocs/frees */
1526 	hiballoc_init(&hibernate_state->hiballoc_arena,
1527 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1528 
1529 	if (deflate) {
1530 		return deflateInit(&hibernate_state->hib_stream,
1531 		    Z_BEST_SPEED);
1532 	} else
1533 		return inflateInit(&hibernate_state->hib_stream);
1534 }
1535 
1536 /*
1537  * Reads the hibernated memory image from disk, whose location and
1538  * size are recorded in hib. Begin by reading the persisted
1539  * chunk table, which records the original chunk placement location
 1540  * and compressed size for each. Next, allocate a pig region of
 1541  * sufficient size to hold the compressed image. Then read the
 1542  * chunks into the pig area (calling hibernate_read_chunks to do this)
 1543  * and, if all of the above succeeds, prepare the resume-time page table.
1544  * The function will then return to hibernate_resume, which will proceed
1545  * to unpack the pig image to the correct place in memory.
1546  */
1547 int
1548 hibernate_read_image(union hibernate_info *hib)
1549 {
1550 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1551 	paddr_t image_start, image_end, pig_start, pig_end;
1552 	struct hibernate_disk_chunk *chunks;
1553 	daddr_t blkctr;
1554 	vaddr_t chunktable = (vaddr_t)NULL;
1555 	paddr_t piglet_chunktable = hib->piglet_pa +
1556 	    HIBERNATE_CHUNK_SIZE;
1557 	int i, status;
1558 
1559 	status = 0;
1560 	pmap_activate(curproc);
1561 
1562 	/* Calculate total chunk table size in disk blocks */
1563 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1564 
1565 	blkctr = hib->chunktable_offset;
1566 
1567 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1568 	    &kp_none, &kd_nowait);
1569 
1570 	if (!chunktable)
1571 		return (1);
1572 
1573 	/* Map chunktable pages */
1574 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1575 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1576 		    PROT_READ | PROT_WRITE);
1577 	pmap_update(pmap_kernel());
1578 
1579 	/* Read the chunktable from disk into the piglet chunktable */
1580 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1581 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1582 		hibernate_block_io(hib, blkctr, MAXPHYS,
1583 		    chunktable + i, 0);
1584 
1585 	blkctr = hib->image_offset;
1586 	compressed_size = 0;
1587 
1588 	chunks = (struct hibernate_disk_chunk *)chunktable;
1589 
1590 	for (i = 0; i < hib->chunk_ctr; i++)
1591 		compressed_size += chunks[i].compressed_size;
1592 
1593 	disk_size = compressed_size;
1594 
1595 	printf("unhibernating @ block %lld length %lu bytes\n",
1596 	    hib->sig_offset - chunktable_size,
1597 	    compressed_size);
1598 
1599 	/* Allocate the pig area */
1600 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1601 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1602 		status = 1;
1603 		goto unmap;
1604 	}
1605 
1606 	pig_end = pig_start + pig_sz;
1607 
1608 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1609 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1610 	image_start = image_end - disk_size;
1611 
1612 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1613 	    chunks);
1614 
1615 	/* Prepare the resume time pmap/page table */
1616 	hibernate_populate_resume_pt(hib, image_start, image_end);
1617 
1618 unmap:
1619 	/* Unmap chunktable pages */
1620 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1621 	pmap_update(pmap_kernel());
1622 
1623 	return (status);
1624 }
1625 
1626 /*
1627  * Read the hibernated memory chunks from disk (chunk information at this
1628  * point is stored in the piglet) into the pig area specified by
1629  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1630  * only chunk with overlap possibilities.
1631  */
1632 int
1633 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1634     paddr_t pig_end, size_t image_compr_size,
1635     struct hibernate_disk_chunk *chunks)
1636 {
1637 	paddr_t img_cur, piglet_base;
1638 	daddr_t blkctr;
1639 	size_t processed, compressed_size, read_size;
1640 	int nchunks, nfchunks, num_io_pages;
1641 	vaddr_t tempva, hibernate_fchunk_area;
1642 	short *fchunks, i, j;
1643 
1644 	tempva = (vaddr_t)NULL;
1645 	hibernate_fchunk_area = (vaddr_t)NULL;
1646 	nfchunks = 0;
1647 	piglet_base = hib->piglet_pa;
1648 	global_pig_start = pig_start;
1649 
1650 	/*
1651 	 * These mappings go into the resuming kernel's page table, and are
 1652 	 * used only during image read. They disappear from existence
1653 	 * when the suspended kernel is unpacked on top of us.
1654 	 */
1655 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1656 		&kd_nowait);
1657 	if (!tempva)
1658 		return (1);
1659 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1660 	    &kp_none, &kd_nowait);
1661 	if (!hibernate_fchunk_area)
1662 		return (1);
1663 
1664 	/* Final output chunk ordering VA */
1665 	fchunks = (short *)hibernate_fchunk_area;
1666 
1667 	/* Map the chunk ordering region */
1668 	for(i = 0; i < 24 ; i++)
1669 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1670 			piglet_base + ((4 + i) * PAGE_SIZE),
1671 			PROT_READ | PROT_WRITE);
1672 	pmap_update(pmap_kernel());
1673 
1674 	nchunks = hib->chunk_ctr;
1675 
1676 	/* Initially start all chunks as unplaced */
1677 	for (i = 0; i < nchunks; i++)
1678 		chunks[i].flags = 0;
1679 
1680 	/*
1681 	 * Search the list for chunks that are outside the pig area. These
1682 	 * can be placed first in the final output list.
1683 	 */
1684 	for (i = 0; i < nchunks; i++) {
1685 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1686 			fchunks[nfchunks] = i;
1687 			nfchunks++;
1688 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1689 		}
1690 	}
1691 
1692 	/*
1693 	 * Walk the ordering, place the chunks in ascending memory order.
1694 	 */
1695 	for (i = 0; i < nchunks; i++) {
1696 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1697 			fchunks[nfchunks] = i;
1698 			nfchunks++;
1699 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1700 		}
1701 	}
1702 
1703 	img_cur = pig_start;
1704 
1705 	for (i = 0; i < nfchunks; i++) {
1706 		blkctr = chunks[fchunks[i]].offset;
1707 		processed = 0;
1708 		compressed_size = chunks[fchunks[i]].compressed_size;
1709 
1710 		while (processed < compressed_size) {
1711 			if (compressed_size - processed >= MAXPHYS)
1712 				read_size = MAXPHYS;
1713 			else
1714 				read_size = compressed_size - processed;
1715 
1716 			/*
1717 			 * We're reading read_size bytes, offset from the
1718 			 * start of a page by img_cur % PAGE_SIZE, so the
1719 			 * end will be read_size + (img_cur % PAGE_SIZE)
1720 			 * from the start of the first page.  Round that
1721 			 * up to the next page size.
1722 			 */
1723 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1724 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1725 
1726 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1727 
1728 			/* Map pages for this read */
1729 			for (j = 0; j < num_io_pages; j ++)
1730 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1731 				    img_cur + j * PAGE_SIZE,
1732 				    PROT_READ | PROT_WRITE);
1733 
1734 			pmap_update(pmap_kernel());
1735 
1736 			hibernate_block_io(hib, blkctr, read_size,
1737 			    tempva + (img_cur & PAGE_MASK), 0);
1738 
1739 			blkctr += (read_size / DEV_BSIZE);
1740 
1741 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1742 			pmap_update(pmap_kernel());
1743 
1744 			processed += read_size;
1745 			img_cur += read_size;
1746 		}
1747 	}
1748 
1749 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1750 	pmap_update(pmap_kernel());
1751 
1752 	return (0);
1753 }
1754 
1755 /*
1756  * Hibernating a machine comprises the following operations:
1757  *  1. Calculating this machine's hibernate_info information
1758  *  2. Allocating a piglet and saving the piglet's physaddr
1759  *  3. Calculating the memory chunks
1760  *  4. Writing the compressed chunks to disk
1761  *  5. Writing the chunk table
1762  *  6. Writing the signature block (hibernate_info)
1763  *
1764  * On most architectures, the function calling hibernate_suspend would
1765  * then power off the machine using some MD-specific implementation.
1766  */
1767 int
1768 hibernate_suspend(void)
1769 {
1770 	union hibernate_info hib;
1771 	u_long start, end;
1772 
1773 	/*
 1774 	 * This also fills in the previously allocated piglet addresses
 1775 	 * (physaddr in hib.piglet_pa, vaddr in hib.piglet_va).
1776 	 * hib->piglet_pa and vaddr stored in hib->piglet_va
1777 	 */
1778 	if (get_hibernate_info(&hib, 1)) {
1779 		DPRINTF("failed to obtain hibernate info\n");
1780 		return (1);
1781 	}
1782 
1783 	/* Find a page-addressed region in swap [start,end] */
1784 	if (uvm_hibswap(hib.dev, &start, &end)) {
1785 		printf("hibernate: cannot find any swap\n");
1786 		return (1);
1787 	}
1788 
1789 	if (end - start < 1000) {
1790 		printf("hibernate: insufficient swap (%lu is too small)\n",
1791 			end - start);
1792 		return (1);
1793 	}
1794 
1795 	/* Calculate block offsets in swap */
1796 	hib.image_offset = ctod(start);
1797 
1798 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
1799 	    hib.image_offset, ctod(end) - ctod(start));
1800 
1801 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1802 		PROT_READ | PROT_WRITE);
1803 	pmap_activate(curproc);
1804 
1805 	DPRINTF("hibernate: writing chunks\n");
1806 	if (hibernate_write_chunks(&hib)) {
1807 		DPRINTF("hibernate_write_chunks failed\n");
1808 		goto fail;
1809 	}
1810 
1811 	DPRINTF("hibernate: writing chunktable\n");
1812 	if (hibernate_write_chunktable(&hib)) {
1813 		DPRINTF("hibernate_write_chunktable failed\n");
1814 		goto fail;
1815 	}
1816 
1817 	DPRINTF("hibernate: writing signature\n");
1818 	if (hibernate_write_signature(&hib)) {
1819 		DPRINTF("hibernate_write_signature failed\n");
1820 		goto fail;
1821 	}
1822 
1823 	/* Allow the disk to settle */
1824 	delay(500000);
1825 
1826 	/*
1827 	 * Give the device-specific I/O function a notification that we're
1828 	 * done, and that it can clean up or shutdown as needed.
1829 	 */
1830 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
1831 
1832 	return (0);
1833 fail:
1834 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
1835 	pmap_update(pmap_kernel());
1836 	return (1);
1837 }
1838 
1839 int
1840 hibernate_alloc(void)
1841 {
1842 	KASSERT(global_piglet_va == 0);
1843 	KASSERT(hibernate_temp_page == 0);
1844 
1845 	/* Allocate a piglet, store its addresses in the supplied globals */
1846 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
1847 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
1848 		return (ENOMEM);
1849 
1850 	/*
1851 	 * Allocate VA for the temp page.
1852 	 *
1853 	 * This will become part of the suspended kernel and will
1854 	 * be freed in hibernate_free, upon resume (or hibernate
 1855 	 * failure).
1856 	 */
1857 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1858 	    &kp_none, &kd_nowait);
1859 	if (!hibernate_temp_page) {
1860 		DPRINTF("out of memory allocating hibernate_temp_page\n");
1861 		return (ENOMEM);
1862 	}
1863 
1864 	return (0);
1865 }
1866 
1867 /*
1868  * Free items allocated by hibernate_alloc()
1869  */
1870 void
1871 hibernate_free(void)
1872 {
1873 	if (global_piglet_va)
1874 		uvm_pmr_free_piglet(global_piglet_va,
1875 		    4 * HIBERNATE_CHUNK_SIZE);
1876 
1877 	if (hibernate_temp_page) {
1878 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1879 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1880 		    &kv_any, &kp_none);
1881 	}
1882 
1883 	global_piglet_va = 0;
1884 	hibernate_temp_page = 0;
1885 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
1886 	pmap_update(pmap_kernel());
1887 }
1888