xref: /openbsd-src/sys/kern/subr_hibernate.c (revision 9a033ad32995d2ed18ce56c1c106eed45ee7fb76)
1 /*	$OpenBSD: subr_hibernate.c,v 1.104 2014/10/16 04:19:33 mlarkin Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <uvm/uvm.h>
32 #include <uvm/uvm_swap.h>
33 #include <machine/hibernate.h>
34 
35 /*
36  * Hibernate piglet layout information
37  *
38  * The piglet is a scratch area of memory allocated by the suspending kernel.
39  * Its phys and virt addrs are recorded in the signature block. The piglet is
40  * used to guarantee an unused area of memory that can be used by the resuming
41  * kernel for various things. The piglet is excluded during unpack operations.
42  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
43  *
44  * Offset from piglet_base	Purpose
45  * ----------------------------------------------------------------------------
46  * 0				Private page for suspend I/O write functions
47  * 1*PAGE_SIZE			I/O page used during hibernate suspend
48  * 2*PAGE_SIZE			I/O page used during hibernate suspend
49  * 3*PAGE_SIZE			copy page used during hibernate suspend
50  * 4*PAGE_SIZE			final chunk ordering list (24 pages)
51  * 28*PAGE_SIZE			RLE utility page
52  * 29*PAGE_SIZE			start of hiballoc area
53  * 109*PAGE_SIZE		end of hiballoc area (80 pages)
54  * ...				unused
55  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
56  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
57  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
58  */
59 
/*
 * Temporary vaddr ranges used during hibernate.
 * NOTE(review): only the declarations are visible here; the code that
 * assigns and consumes these lives elsewhere in the file.
 */
vaddr_t hibernate_temp_page;
vaddr_t hibernate_copy_page;
vaddr_t hibernate_rle_page;

/* Hibernate info as read from disk during resume */
union hibernate_info disk_hib;
/* Physical start of the image (pig) area; read by hibernate_unpack_image() */
paddr_t global_pig_start;
/* Piglet VA/PA; copied into the hibernate_info when suspending */
vaddr_t global_piglet_va;
paddr_t global_piglet_pa;

/*
 * Debug output. With HIB_DEBUG defined, DPRINTF prints whenever hib_debug
 * is non-zero and DNPRINTF(n, ...) prints when hib_debug is greater than n.
 * Without HIB_DEBUG both macros expand to nothing.
 */
/* #define HIB_DEBUG */
#ifdef HIB_DEBUG
int	hib_debug = 99;
#define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
#define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
#else
#define DPRINTF(x...)
#define DNPRINTF(n,x...)
#endif

/*
 * Stack protector guard value. It is saved into the hibernate_info by
 * get_hibernate_info() and reloaded from the on-disk copy during resume.
 */
#ifndef NO_PROPOLICE
extern long __guard_local;
#endif /* ! NO_PROPOLICE */

/* Forward declarations for helpers defined later in this file */
void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
int hibernate_calc_rle(paddr_t, paddr_t);
int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
	size_t *);

/* Number of pages in one hibernate chunk */
#define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)

/*
 * Hib alloc enforced alignment.
 */
#define HIB_ALIGN		8 /* bytes alignment */

/*
 * sizeof builtin operation, but with alignment constraint.
 */
#define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
101 
/*
 * Bookkeeping header placed directly in front of every hiballoc allocation.
 * Entries are kept in a red-black tree keyed by their own address.
 */
struct hiballoc_entry {
	/* Bytes handed out to the caller (set to the rounded size by hib_alloc) */
	size_t			hibe_use;
	/* Free bytes that follow the used area and can host new entries */
	size_t			hibe_space;
	/* Tree linkage for the hiballoc_addr tree */
	RB_ENTRY(hiballoc_entry) hibe_entry;
};
107 
108 /*
109  * Sort hibernate memory ranges by ascending PA
110  */
111 void
112 hibernate_sort_ranges(union hibernate_info *hib_info)
113 {
114 	int i, j;
115 	struct hibernate_memory_range *ranges;
116 	paddr_t base, end;
117 
118 	ranges = hib_info->ranges;
119 
120 	for (i = 1; i < hib_info->nranges; i++) {
121 		j = i;
122 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
123 			base = ranges[j].base;
124 			end = ranges[j].end;
125 			ranges[j].base = ranges[j - 1].base;
126 			ranges[j].end = ranges[j - 1].end;
127 			ranges[j - 1].base = base;
128 			ranges[j - 1].end = end;
129 			j--;
130 		}
131 	}
132 }
133 
/*
 * Ordering function for the hiballoc tree.
 *
 * An entry sits at a fixed offset in front of the memory it manages, so
 * ordering the entry pointers themselves orders the managed addresses.
 *
 * Returns -1 if l sorts before r, 1 if it sorts after, 0 if equal.
 */
static __inline int
hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
{
	if (l < r)
		return -1;
	if (l > r)
		return 1;
	return 0;
}
145 
/* Declare the red-black tree operations for the address-ordered entry tree. */
RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
147 
148 /*
149  * Given a hiballoc entry, return the address it manages.
150  */
151 static __inline void *
152 hib_entry_to_addr(struct hiballoc_entry *entry)
153 {
154 	caddr_t addr;
155 
156 	addr = (caddr_t)entry;
157 	addr += HIB_SIZEOF(struct hiballoc_entry);
158 	return addr;
159 }
160 
161 /*
162  * Given an address, find the hiballoc that corresponds.
163  */
164 static __inline struct hiballoc_entry*
165 hib_addr_to_entry(void *addr_param)
166 {
167 	caddr_t addr;
168 
169 	addr = (caddr_t)addr_param;
170 	addr -= HIB_SIZEOF(struct hiballoc_entry);
171 	return (struct hiballoc_entry*)addr;
172 }
173 
/* Emit the red-black tree implementation for the address-ordered entry tree. */
RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
175 
176 /*
177  * Allocate memory from the arena.
178  *
179  * Returns NULL if no memory is available.
180  */
181 void *
182 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
183 {
184 	struct hiballoc_entry *entry, *new_entry;
185 	size_t find_sz;
186 
187 	/*
188 	 * Enforce alignment of HIB_ALIGN bytes.
189 	 *
190 	 * Note that, because the entry is put in front of the allocation,
191 	 * 0-byte allocations are guaranteed a unique address.
192 	 */
193 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
194 
195 	/*
196 	 * Find an entry with hibe_space >= find_sz.
197 	 *
198 	 * If the root node is not large enough, we switch to tree traversal.
199 	 * Because all entries are made at the bottom of the free space,
200 	 * traversal from the end has a slightly better chance of yielding
201 	 * a sufficiently large space.
202 	 */
203 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
204 	entry = RB_ROOT(&arena->hib_addrs);
205 	if (entry != NULL && entry->hibe_space < find_sz) {
206 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
207 			if (entry->hibe_space >= find_sz)
208 				break;
209 		}
210 	}
211 
212 	/*
213 	 * Insufficient or too fragmented memory.
214 	 */
215 	if (entry == NULL)
216 		return NULL;
217 
218 	/*
219 	 * Create new entry in allocated space.
220 	 */
221 	new_entry = (struct hiballoc_entry*)(
222 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
223 	new_entry->hibe_space = entry->hibe_space - find_sz;
224 	new_entry->hibe_use = alloc_sz;
225 
226 	/*
227 	 * Insert entry.
228 	 */
229 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
230 		panic("hib_alloc: insert failure");
231 	entry->hibe_space = 0;
232 
233 	/* Return address managed by entry. */
234 	return hib_entry_to_addr(new_entry);
235 }
236 
237 /*
238  * Free a pointer previously allocated from this arena.
239  *
240  * If addr is NULL, this will be silently accepted.
241  */
242 void
243 hib_free(struct hiballoc_arena *arena, void *addr)
244 {
245 	struct hiballoc_entry *entry, *prev;
246 
247 	if (addr == NULL)
248 		return;
249 
250 	/*
251 	 * Derive entry from addr and check it is really in this arena.
252 	 */
253 	entry = hib_addr_to_entry(addr);
254 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
255 		panic("hib_free: freed item %p not in hib arena", addr);
256 
257 	/*
258 	 * Give the space in entry to its predecessor.
259 	 *
260 	 * If entry has no predecessor, change its used space into free space
261 	 * instead.
262 	 */
263 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
264 	if (prev != NULL &&
265 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
266 	    prev->hibe_use + prev->hibe_space) == entry) {
267 		/* Merge entry. */
268 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
269 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
270 		    entry->hibe_use + entry->hibe_space;
271 	} else {
272 		/* Flip used memory to free space. */
273 		entry->hibe_space += entry->hibe_use;
274 		entry->hibe_use = 0;
275 	}
276 }
277 
278 /*
279  * Initialize hiballoc.
280  *
281  * The allocator will manage memmory at ptr, which is len bytes.
282  */
283 int
284 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
285 {
286 	struct hiballoc_entry *entry;
287 	caddr_t ptr;
288 	size_t len;
289 
290 	RB_INIT(&arena->hib_addrs);
291 
292 	/*
293 	 * Hib allocator enforces HIB_ALIGN alignment.
294 	 * Fixup ptr and len.
295 	 */
296 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
297 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
298 	len &= ~((size_t)HIB_ALIGN - 1);
299 
300 	/*
301 	 * Insufficient memory to be able to allocate and also do bookkeeping.
302 	 */
303 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
304 		return ENOMEM;
305 
306 	/*
307 	 * Create entry describing space.
308 	 */
309 	entry = (struct hiballoc_entry*)ptr;
310 	entry->hibe_use = 0;
311 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
312 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
313 
314 	return 0;
315 }
316 
317 /*
318  * Zero all free memory.
319  */
320 void
321 uvm_pmr_zero_everything(void)
322 {
323 	struct uvm_pmemrange	*pmr;
324 	struct vm_page		*pg;
325 	int			 i;
326 
327 	uvm_lock_fpageq();
328 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
329 		/* Zero single pages. */
330 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
331 		    != NULL) {
332 			uvm_pmr_remove(pmr, pg);
333 			uvm_pagezero(pg);
334 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
335 			uvmexp.zeropages++;
336 			uvm_pmr_insert(pmr, pg, 0);
337 		}
338 
339 		/* Zero multi page ranges. */
340 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
341 		    != NULL) {
342 			pg--; /* Size tree always has second page. */
343 			uvm_pmr_remove(pmr, pg);
344 			for (i = 0; i < pg->fpgsz; i++) {
345 				uvm_pagezero(&pg[i]);
346 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
347 				uvmexp.zeropages++;
348 			}
349 			uvm_pmr_insert(pmr, pg, 0);
350 		}
351 	}
352 	uvm_unlock_fpageq();
353 }
354 
355 /*
356  * Mark all memory as dirty.
357  *
358  * Used to inform the system that the clean memory isn't clean for some
359  * reason, for example because we just came back from hibernate.
360  */
361 void
362 uvm_pmr_dirty_everything(void)
363 {
364 	struct uvm_pmemrange	*pmr;
365 	struct vm_page		*pg;
366 	int			 i;
367 
368 	uvm_lock_fpageq();
369 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
370 		/* Dirty single pages. */
371 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
372 		    != NULL) {
373 			uvm_pmr_remove(pmr, pg);
374 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
375 			uvm_pmr_insert(pmr, pg, 0);
376 		}
377 
378 		/* Dirty multi page ranges. */
379 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
380 		    != NULL) {
381 			pg--; /* Size tree always has second page. */
382 			uvm_pmr_remove(pmr, pg);
383 			for (i = 0; i < pg->fpgsz; i++)
384 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
385 			uvm_pmr_insert(pmr, pg, 0);
386 		}
387 	}
388 
389 	uvmexp.zeropages = 0;
390 	uvm_unlock_fpageq();
391 }
392 
393 /*
394  * Allocate an area that can hold sz bytes and doesn't overlap with
395  * the piglet at piglet_pa.
396  */
397 int
398 uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
399 {
400 	struct uvm_constraint_range pig_constraint;
401 	struct kmem_pa_mode kp_pig = {
402 		.kp_constraint = &pig_constraint,
403 		.kp_maxseg = 1
404 	};
405 	vaddr_t va;
406 
407 	sz = round_page(sz);
408 
409 	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
410 	pig_constraint.ucr_high = -1;
411 
412 	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
413 	if (va == 0) {
414 		pig_constraint.ucr_low = 0;
415 		pig_constraint.ucr_high = piglet_pa - 1;
416 
417 		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
418 		if (va == 0)
419 			return ENOMEM;
420 	}
421 
422 	pmap_extract(pmap_kernel(), va, pa);
423 	return 0;
424 }
425 
426 /*
427  * Allocate a piglet area.
428  *
429  * This needs to be in DMA-safe memory.
430  * Piglets are aligned.
431  *
432  * sz and align in bytes.
433  *
434  * The call will sleep for the pagedaemon to attempt to free memory.
435  * The pagedaemon may decide its not possible to free enough memory, causing
436  * the allocation to fail.
437  */
438 int
439 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
440 {
441 	struct kmem_pa_mode kp_piglet = {
442 		.kp_constraint = &dma_constraint,
443 		.kp_align = align,
444 		.kp_maxseg = 1
445 	};
446 
447 	/* Ensure align is a power of 2 */
448 	KASSERT((align & (align - 1)) == 0);
449 
450 	/*
451 	 * Fixup arguments: align must be at least PAGE_SIZE,
452 	 * sz will be converted to pagecount, since that is what
453 	 * pmemrange uses internally.
454 	 */
455 	if (align < PAGE_SIZE)
456 		align = PAGE_SIZE;
457 	sz = round_page(sz);
458 
459 	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
460 	if (*va == 0)
461 		return ENOMEM;
462 
463 	pmap_extract(pmap_kernel(), *va, pa);
464 	return 0;
465 }
466 
467 /*
468  * Free a piglet area.
469  */
470 void
471 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
472 {
473 	/*
474 	 * Fix parameters.
475 	 */
476 	sz = round_page(sz);
477 
478 	/*
479 	 * Free the physical and virtual memory.
480 	 */
481 	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
482 }
483 
484 /*
485  * Physmem RLE compression support.
486  *
487  * Given a physical page address, return the number of pages starting at the
488  * address that are free.  Clamps to the number of pages in
489  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
490  */
491 int
492 uvm_page_rle(paddr_t addr)
493 {
494 	struct vm_page		*pg, *pg_end;
495 	struct vm_physseg	*vmp;
496 	int			 pseg_idx, off_idx;
497 
498 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
499 	if (pseg_idx == -1)
500 		return 0;
501 
502 	vmp = &vm_physmem[pseg_idx];
503 	pg = &vmp->pgs[off_idx];
504 	if (!(pg->pg_flags & PQ_FREE))
505 		return 0;
506 
507 	/*
508 	 * Search for the first non-free page after pg.
509 	 * Note that the page may not be the first page in a free pmemrange,
510 	 * therefore pg->fpgsz cannot be used.
511 	 */
512 	for (pg_end = pg; pg_end <= vmp->lastpg &&
513 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
514 		;
515 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
516 }
517 
518 /*
519  * Fills out the hibernate_info union pointed to by hib
520  * with information about this machine (swap signature block
521  * offsets, number of memory ranges, kernel in use, etc)
522  */
523 int
524 get_hibernate_info(union hibernate_info *hib, int suspend)
525 {
526 	struct disklabel dl;
527 	char err_string[128], *dl_ret;
528 
529 #ifndef NO_PROPOLICE
530 	/* Save propolice guard */
531 	hib->guard = __guard_local;
532 #endif /* ! NO_PROPOLICE */
533 
534 	/* Determine I/O function to use */
535 	hib->io_func = get_hibernate_io_function(swdevt[0].sw_dev);
536 	if (hib->io_func == NULL)
537 		return (1);
538 
539 	/* Calculate hibernate device */
540 	hib->dev = swdevt[0].sw_dev;
541 
542 	/* Read disklabel (used to calculate signature and image offsets) */
543 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));
544 
545 	if (dl_ret) {
546 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
547 		return (1);
548 	}
549 
550 	/* Make sure we have a swap partition. */
551 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
552 	    DL_GETPSIZE(&dl.d_partitions[1]) == 0)
553 		return (1);
554 
555 	/* Make sure the signature can fit in one block */
556 	if (sizeof(union hibernate_info) > DEV_BSIZE)
557 		return (1);
558 
559 	/* Magic number */
560 	hib->magic = HIBERNATE_MAGIC;
561 
562 	/* Calculate signature block location */
563 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[1]) -
564 	    sizeof(union hibernate_info)/DEV_BSIZE;
565 
566 	/* Stash kernel version information */
567 	memset(&hib->kernel_version, 0, 128);
568 	bcopy(version, &hib->kernel_version,
569 	    min(strlen(version), sizeof(hib->kernel_version)-1));
570 
571 	if (suspend) {
572 		hib->piglet_va = global_piglet_va;
573 		hib->piglet_pa = global_piglet_pa;
574 		hib->io_page = (void *)hib->piglet_va;
575 
576 		/*
577 		 * Initialization of the hibernate IO function for drivers
578 		 * that need to do prep work (such as allocating memory or
579 		 * setting up data structures that cannot safely be done
580 		 * during suspend without causing side effects). There is
581 		 * a matching HIB_DONE call performed after the write is
582 		 * completed.
583 		 */
584 		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[1]),
585 		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[1]),
586 		    HIB_INIT, hib->io_page))
587 			goto fail;
588 
589 	} else {
590 		/*
591 		 * Resuming kernels use a regular private page for the driver
592 		 * No need to free this I/O page as it will vanish as part of
593 		 * the resume.
594 		 */
595 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
596 		if (!hib->io_page)
597 			goto fail;
598 	}
599 
600 
601 	if (get_hibernate_info_md(hib))
602 		goto fail;
603 
604 	return (0);
605 
606 fail:
607 	return (1);
608 }
609 
610 /*
611  * Allocate nitems*size bytes from the hiballoc area presently in use
612  */
613 void *
614 hibernate_zlib_alloc(void *unused, int nitems, int size)
615 {
616 	struct hibernate_zlib_state *hibernate_state;
617 
618 	hibernate_state =
619 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
620 
621 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
622 }
623 
624 /*
625  * Free the memory pointed to by addr in the hiballoc area presently in
626  * use
627  */
628 void
629 hibernate_zlib_free(void *unused, void *addr)
630 {
631 	struct hibernate_zlib_state *hibernate_state;
632 
633 	hibernate_state =
634 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
635 
636 	hib_free(&hibernate_state->hiballoc_arena, addr);
637 }
638 
639 /*
640  * Inflate next page of data from the image stream.
641  * The rle parameter is modified on exit to contain the number of pages to
642  * skip in the output stream (or 0 if this page was inflated into).
643  *
644  * Returns 0 if the stream contains additional data, or 1 if the stream is
645  * finished.
646  */
647 int
648 hibernate_inflate_page(int *rle)
649 {
650 	struct hibernate_zlib_state *hibernate_state;
651 	int i;
652 
653 	hibernate_state =
654 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
655 
656 	/* Set up the stream for RLE code inflate */
657 	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
658 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
659 
660 	/* Inflate RLE code */
661 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
662 	if (i != Z_OK && i != Z_STREAM_END) {
663 		/*
664 		 * XXX - this will likely reboot/hang most machines
665 		 *       since the console output buffer will be unmapped,
666 		 *       but there's not much else we can do here.
667 		 */
668 		panic("rle inflate stream error");
669 	}
670 
671 	if (hibernate_state->hib_stream.avail_out != 0) {
672 		/*
673 		 * XXX - this will likely reboot/hang most machines
674 		 *       since the console output buffer will be unmapped,
675 		 *       but there's not much else we can do here.
676 		 */
677 		panic("rle short inflate error");
678 	}
679 
680 	if (*rle < 0 || *rle > 1024) {
681 		/*
682 		 * XXX - this will likely reboot/hang most machines
683 		 *       since the console output buffer will be unmapped,
684 		 *       but there's not much else we can do here.
685 		 */
686 		panic("invalid rle count");
687 	}
688 
689 	if (i == Z_STREAM_END)
690 		return (1);
691 
692 	if (*rle != 0)
693 		return (0);
694 
695 	/* Set up the stream for page inflate */
696 	hibernate_state->hib_stream.next_out =
697 		(unsigned char *)HIBERNATE_INFLATE_PAGE;
698 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
699 
700 	/* Process next block of data */
701 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
702 	if (i != Z_OK && i != Z_STREAM_END) {
703 		/*
704 		 * XXX - this will likely reboot/hang most machines
705 		 *       since the console output buffer will be unmapped,
706 		 *       but there's not much else we can do here.
707 		 */
708 		panic("inflate error");
709 	}
710 
711 	/* We should always have extracted a full page ... */
712 	if (hibernate_state->hib_stream.avail_out != 0) {
713 		/*
714 		 * XXX - this will likely reboot/hang most machines
715 		 *       since the console output buffer will be unmapped,
716 		 *       but there's not much else we can do here.
717 		 */
718 		panic("incomplete page");
719 	}
720 
721 	return (i == Z_STREAM_END);
722 }
723 
724 /*
725  * Inflate size bytes from src into dest, skipping any pages in
726  * [src..dest] that are special (see hibernate_inflate_skip)
727  *
728  * This function executes while using the resume-time stack
729  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
730  * will likely hang or reset the machine since the console output buffer
731  * will be unmapped.
732  */
733 void
734 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
735     paddr_t src, size_t size)
736 {
737 	int end_stream = 0, rle;
738 	struct hibernate_zlib_state *hibernate_state;
739 
740 	hibernate_state =
741 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
742 
743 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
744 	hibernate_state->hib_stream.avail_in = size;
745 
746 	do {
747 		/*
748 		 * Is this a special page? If yes, redirect the
749 		 * inflate output to a scratch page (eg, discard it)
750 		 */
751 		if (hibernate_inflate_skip(hib, dest)) {
752 			hibernate_enter_resume_mapping(
753 			    HIBERNATE_INFLATE_PAGE,
754 			    HIBERNATE_INFLATE_PAGE, 0);
755 		} else {
756 			hibernate_enter_resume_mapping(
757 			    HIBERNATE_INFLATE_PAGE, dest, 0);
758 		}
759 
760 		hibernate_flush();
761 		end_stream = hibernate_inflate_page(&rle);
762 
763 		if (rle == 0)
764 			dest += PAGE_SIZE;
765 		else
766 			dest += (rle * PAGE_SIZE);
767 	} while (!end_stream);
768 }
769 
770 /*
771  * deflate from src into the I/O page, up to 'remaining' bytes
772  *
773  * Returns number of input bytes consumed, and may reset
774  * the 'remaining' parameter if not all the output space was consumed
775  * (this information is needed to know how much to write to disk
776  */
777 size_t
778 hibernate_deflate(union hibernate_info *hib, paddr_t src,
779     size_t *remaining)
780 {
781 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
782 	struct hibernate_zlib_state *hibernate_state;
783 
784 	hibernate_state =
785 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
786 
787 	/* Set up the stream for deflate */
788 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
789 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
790 	hibernate_state->hib_stream.next_out =
791 		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
792 	hibernate_state->hib_stream.avail_out = *remaining;
793 
794 	/* Process next block of data */
795 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
796 		panic("hibernate zlib deflate error");
797 
798 	/* Update pointers and return number of bytes consumed */
799 	*remaining = hibernate_state->hib_stream.avail_out;
800 	return (PAGE_SIZE - (src & PAGE_MASK)) -
801 	    hibernate_state->hib_stream.avail_in;
802 }
803 
804 /*
805  * Write the hibernation information specified in hiber_info
806  * to the location in swap previously calculated (last block of
807  * swap), called the "signature block".
808  */
809 int
810 hibernate_write_signature(union hibernate_info *hib)
811 {
812 	/* Write hibernate info to disk */
813 	return (hib->io_func(hib->dev, hib->sig_offset,
814 	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
815 	    hib->io_page));
816 }
817 
818 /*
819  * Write the memory chunk table to the area in swap immediately
820  * preceding the signature block. The chunk table is stored
821  * in the piglet when this function is called.  Returns errno.
822  */
823 int
824 hibernate_write_chunktable(union hibernate_info *hib)
825 {
826 	vaddr_t hibernate_chunk_table_start;
827 	size_t hibernate_chunk_table_size;
828 	int i, err;
829 
830 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
831 
832 	hibernate_chunk_table_start = hib->piglet_va +
833 	    HIBERNATE_CHUNK_SIZE;
834 
835 	/* Write chunk table */
836 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
837 		if ((err = hib->io_func(hib->dev,
838 		    hib->chunktable_offset + (i/DEV_BSIZE),
839 		    (vaddr_t)(hibernate_chunk_table_start + i),
840 		    MAXPHYS, HIB_W, hib->io_page))) {
841 			DPRINTF("chunktable write error: %d\n", err);
842 			return (err);
843 		}
844 	}
845 
846 	return (0);
847 }
848 
849 /*
850  * Write an empty hiber_info to the swap signature block, which is
851  * guaranteed to not match any valid hib.
852  */
853 int
854 hibernate_clear_signature(void)
855 {
856 	union hibernate_info blank_hiber_info;
857 	union hibernate_info hib;
858 
859 	/* Zero out a blank hiber_info */
860 	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));
861 
862 	/* Get the signature block location */
863 	if (get_hibernate_info(&hib, 0))
864 		return (1);
865 
866 	/* Write (zeroed) hibernate info to disk */
867 	DPRINTF("clearing hibernate signature block location: %lld\n",
868 		hib.sig_offset);
869 	if (hibernate_block_io(&hib,
870 	    hib.sig_offset,
871 	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
872 		printf("Warning: could not clear hibernate signature\n");
873 
874 	return (0);
875 }
876 
877 /*
878  * Compare two hibernate_infos to determine if they are the same (eg,
879  * we should be performing a hibernate resume on this machine.
880  * Not all fields are checked - just enough to verify that the machine
881  * has the same memory configuration and kernel as the one that
882  * wrote the signature previously.
883  */
884 int
885 hibernate_compare_signature(union hibernate_info *mine,
886     union hibernate_info *disk)
887 {
888 	u_int i;
889 
890 	if (mine->nranges != disk->nranges) {
891 		DPRINTF("hibernate memory range count mismatch\n");
892 		return (1);
893 	}
894 
895 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0) {
896 		DPRINTF("hibernate kernel version mismatch\n");
897 		return (1);
898 	}
899 
900 	for (i = 0; i < mine->nranges; i++) {
901 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
902 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
903 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
904 				i,
905 				(void *)mine->ranges[i].base,
906 				(void *)mine->ranges[i].end,
907 				(void *)disk->ranges[i].base,
908 				(void *)disk->ranges[i].end);
909 			return (1);
910 		}
911 	}
912 
913 	return (0);
914 }
915 
916 /*
917  * Transfers xfer_size bytes between the hibernate device specified in
918  * hib_info at offset blkctr and the vaddr specified at dest.
919  *
920  * Separate offsets and pages are used to handle misaligned reads (reads
921  * that span a page boundary).
922  *
923  * blkctr specifies a relative offset (relative to the start of swap),
924  * not an absolute disk offset
925  *
926  */
927 int
928 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
929     size_t xfer_size, vaddr_t dest, int iswrite)
930 {
931 	struct buf *bp;
932 	struct bdevsw *bdsw;
933 	int error;
934 
935 	bp = geteblk(xfer_size);
936 	bdsw = &bdevsw[major(hib->dev)];
937 
938 	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
939 	if (error) {
940 		printf("hibernate_block_io open failed\n");
941 		return (1);
942 	}
943 
944 	if (iswrite)
945 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
946 
947 	bp->b_bcount = xfer_size;
948 	bp->b_blkno = blkctr;
949 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
950 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
951 	bp->b_dev = hib->dev;
952 	(*bdsw->d_strategy)(bp);
953 
954 	error = biowait(bp);
955 	if (error) {
956 		printf("hib block_io biowait error %d blk %lld size %zu\n",
957 			error, (long long)blkctr, xfer_size);
958 		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
959 		    curproc);
960 		if (error)
961 			printf("hibernate_block_io error close failed\n");
962 		return (1);
963 	}
964 
965 	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
966 	if (error) {
967 		printf("hibernate_block_io close failed\n");
968 		return (1);
969 	}
970 
971 	if (!iswrite)
972 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
973 
974 	bp->b_flags |= B_INVAL;
975 	brelse(bp);
976 
977 	return (0);
978 }
979 
980 /*
981  * Reads the signature block from swap, checks against the current machine's
982  * information. If the information matches, perform a resume by reading the
983  * saved image into the pig area, and unpacking.
984  */
985 void
986 hibernate_resume(void)
987 {
988 	union hibernate_info hib;
989 	int s;
990 
991 	/* Get current running machine's hibernate info */
992 	memset(&hib, 0, sizeof(hib));
993 	if (get_hibernate_info(&hib, 0)) {
994 		DPRINTF("couldn't retrieve machine's hibernate info\n");
995 		return;
996 	}
997 
998 	/* Read hibernate info from disk */
999 	s = splbio();
1000 
1001 	DPRINTF("reading hibernate signature block location: %lld\n",
1002 		hib.sig_offset);
1003 
1004 	if (hibernate_block_io(&hib,
1005 	    hib.sig_offset,
1006 	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
1007 		DPRINTF("error in hibernate read");
1008 		splx(s);
1009 		return;
1010 	}
1011 
1012 	/* Check magic number */
1013 	if (disk_hib.magic != HIBERNATE_MAGIC) {
1014 		DPRINTF("wrong magic number in hibernate signature: %x\n",
1015 			disk_hib.magic);
1016 		splx(s);
1017 		return;
1018 	}
1019 
1020 	/*
1021 	 * We (possibly) found a hibernate signature. Clear signature first,
1022 	 * to prevent accidental resume or endless resume cycles later.
1023 	 */
1024 	if (hibernate_clear_signature()) {
1025 		DPRINTF("error clearing hibernate signature block\n");
1026 		splx(s);
1027 		return;
1028 	}
1029 
1030 	/*
1031 	 * If on-disk and in-memory hibernate signatures match,
1032 	 * this means we should do a resume from hibernate.
1033 	 */
1034 	if (hibernate_compare_signature(&hib, &disk_hib)) {
1035 		DPRINTF("mismatched hibernate signature block\n");
1036 		splx(s);
1037 		return;
1038 	}
1039 
1040 #ifdef MULTIPROCESSOR
1041 	/* XXX - if we fail later, we may need to rehatch APs on some archs */
1042 	DPRINTF("hibernate: quiescing APs\n");
1043 	hibernate_quiesce_cpus();
1044 #endif /* MULTIPROCESSOR */
1045 
1046 	/* Read the image from disk into the image (pig) area */
1047 	if (hibernate_read_image(&disk_hib))
1048 		goto fail;
1049 
1050 	DPRINTF("hibernate: quiescing devices\n");
1051 	if (config_suspend_all(DVACT_QUIESCE) != 0)
1052 		goto fail;
1053 
1054 	(void) splhigh();
1055 	hibernate_disable_intr_machdep();
1056 	cold = 1;
1057 
1058 	DPRINTF("hibernate: suspending devices\n");
1059 	if (config_suspend_all(DVACT_SUSPEND) != 0) {
1060 		cold = 0;
1061 		hibernate_enable_intr_machdep();
1062 		goto fail;
1063 	}
1064 
1065 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1066 	    VM_PROT_ALL);
1067 	pmap_activate(curproc);
1068 
1069 	printf("Unpacking image...\n");
1070 
1071 	/* Switch stacks */
1072 	DPRINTF("hibernate: switching stacks\n");
1073 	hibernate_switch_stack_machdep();
1074 
1075 #ifndef NO_PROPOLICE
1076 	/* Start using suspended kernel's propolice guard */
1077 	__guard_local = disk_hib.guard;
1078 #endif /* ! NO_PROPOLICE */
1079 
1080 	/* Unpack and resume */
1081 	hibernate_unpack_image(&disk_hib);
1082 
1083 fail:
1084 	splx(s);
1085 	printf("\nUnable to resume hibernated image\n");
1086 }
1087 
1088 /*
1089  * Unpack image from pig area to original location by looping through the
1090  * list of output chunks in the order they should be restored (fchunks).
1091  *
1092  * Note that due to the stack smash protector and the fact that we have
1093  * switched stacks, it is not permitted to return from this function.
1094  */
1095 void
1096 hibernate_unpack_image(union hibernate_info *hib)
1097 {
1098 	struct hibernate_disk_chunk *chunks;
1099 	union hibernate_info local_hib;
1100 	paddr_t image_cur = global_pig_start;
1101 	short i, *fchunks;
1102 	char *pva;
1103 	struct hibernate_zlib_state *hibernate_state;
1104 
1105 	hibernate_state =
1106 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1107 
1108 	/* Piglet will be identity mapped (VA == PA) */
1109 	pva = (char *)hib->piglet_pa;
1110 
1111 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1112 
1113 	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);
1114 
1115 	/* Can't use hiber_info that's passed in after this point */
1116 	bcopy(hib, &local_hib, sizeof(union hibernate_info));
1117 
1118 	/* VA == PA */
1119 	local_hib.piglet_va = local_hib.piglet_pa;
1120 
1121 	/*
1122 	 * Point of no return. Once we pass this point, only kernel code can
1123 	 * be accessed. No global variables or other kernel data structures
1124 	 * are guaranteed to be coherent after unpack starts.
1125 	 *
1126 	 * The image is now in high memory (pig area), we unpack from the pig
1127 	 * to the correct location in memory. We'll eventually end up copying
1128 	 * on top of ourself, but we are assured the kernel code here is the
1129 	 * same between the hibernated and resuming kernel, and we are running
1130 	 * on our own stack, so the overwrite is ok.
1131 	 */
1132 	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
1133 	hibernate_activate_resume_pt_machdep();
1134 
1135 	for (i = 0; i < local_hib.chunk_ctr; i++) {
1136 		/* Reset zlib for inflate */
1137 		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
1138 			panic("hibernate failed to reset zlib for inflate");
1139 
1140 		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
1141 		    image_cur);
1142 
1143 		image_cur += chunks[fchunks[i]].compressed_size;
1144 
1145 	}
1146 
1147 	/*
1148 	 * Resume the loaded kernel by jumping to the MD resume vector.
1149 	 * We won't be returning from this call.
1150 	 */
1151 	hibernate_resume_machdep();
1152 }
1153 
1154 /*
1155  * Bounce a compressed image chunk to the piglet, entering mappings for the
1156  * copied pages as needed
1157  */
1158 void
1159 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1160 {
1161 	size_t ct, ofs;
1162 	paddr_t src = img_cur;
1163 	vaddr_t dest = piglet;
1164 
1165 	/* Copy first partial page */
1166 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1167 	ofs = (src & PAGE_MASK);
1168 
1169 	if (ct < PAGE_SIZE) {
1170 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1171 			(src - ofs), 0);
1172 		hibernate_flush();
1173 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1174 		src += ct;
1175 		dest += ct;
1176 	}
1177 
1178 	/* Copy remaining pages */
1179 	while (src < size + img_cur) {
1180 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1181 		hibernate_flush();
1182 		ct = PAGE_SIZE;
1183 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1184 		hibernate_flush();
1185 		src += ct;
1186 		dest += ct;
1187 	}
1188 }
1189 
1190 /*
1191  * Process a chunk by bouncing it to the piglet, followed by unpacking
1192  */
1193 void
1194 hibernate_process_chunk(union hibernate_info *hib,
1195     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1196 {
1197 	char *pva = (char *)hib->piglet_va;
1198 
1199 	hibernate_copy_chunk_to_piglet(img_cur,
1200 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1201 	hibernate_inflate_region(hib, chunk->base,
1202 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1203 	    chunk->compressed_size);
1204 }
1205 
1206 /*
1207  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1208  * inaddr and range_end.
1209  */
1210 int
1211 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1212 {
1213 	int rle;
1214 
1215 	rle = uvm_page_rle(inaddr);
1216 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1217 
1218 	/* Clamp RLE to range end */
1219 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1220 		rle = (range_end - inaddr) / PAGE_SIZE;
1221 
1222 	return (rle);
1223 }
1224 
1225 /*
1226  * Write the RLE byte for page at 'inaddr' to the output stream.
1227  * Returns the number of pages to be skipped at 'inaddr'.
1228  */
1229 int
1230 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1231 	paddr_t range_end, daddr_t *blkctr,
1232 	size_t *out_remaining)
1233 {
1234 	int rle, err, *rleloc;
1235 	struct hibernate_zlib_state *hibernate_state;
1236 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1237 
1238 	hibernate_state =
1239 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1240 
1241 	rle = hibernate_calc_rle(inaddr, range_end);
1242 
1243 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1244 	*rleloc = rle;
1245 
1246 	/* Deflate the RLE byte into the stream */
1247 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1248 
1249 	/* Did we fill the output page? If so, flush to disk */
1250 	if (*out_remaining == 0) {
1251 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1252 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1253 			hib->io_page))) {
1254 				DPRINTF("hib write error %d\n", err);
1255 				return (err);
1256 		}
1257 
1258 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1259 		*out_remaining = PAGE_SIZE;
1260 
1261 		/* If we didn't deflate the entire RLE byte, finish it now */
1262 		if (hibernate_state->hib_stream.avail_in != 0)
1263 			hibernate_deflate(hib,
1264 				(vaddr_t)hibernate_state->hib_stream.next_in,
1265 				out_remaining);
1266 	}
1267 
1268 	return (rle);
1269 }
1270 
1271 /*
1272  * Write a compressed version of this machine's memory to disk, at the
1273  * precalculated swap offset:
1274  *
1275  * end of swap - signature block size - chunk table size - memory size
1276  *
1277  * The function begins by looping through each phys mem range, cutting each
1278  * one into MD sized chunks. These chunks are then compressed individually
1279  * and written out to disk, in phys mem order. Some chunks might compress
1280  * more than others, and for this reason, each chunk's size is recorded
1281  * in the chunk table, which is written to disk after the image has
1282  * properly been compressed and written (in hibernate_write_chunktable).
1283  *
1284  * When this function is called, the machine is nearly suspended - most
1285  * devices are quiesced/suspended, interrupts are off, and cold has
1286  * been set. This means that there can be no side effects once the
1287  * write has started, and the write function itself can also have no
1288  * side effects. This also means no printfs are permitted (since printf
1289  * has side effects.)
1290  *
1291  * Return values :
1292  *
1293  * 0      - success
1294  * EIO    - I/O error occurred writing the chunks
1295  * EINVAL - Failed to write a complete range
1296  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1297  */
1298 int
1299 hibernate_write_chunks(union hibernate_info *hib)
1300 {
1301 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1302 	size_t nblocks, out_remaining, used;
1303 	struct hibernate_disk_chunk *chunks;
1304 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1305 	daddr_t blkctr = 0;
1306 	int i, rle, err;
1307 	struct hibernate_zlib_state *hibernate_state;
1308 
1309 	hibernate_state =
1310 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1311 
1312 	hib->chunk_ctr = 0;
1313 
1314 	/*
1315 	 * Map the utility VAs to the piglet. See the piglet map at the
1316 	 * top of this file for piglet layout information.
1317 	 */
1318 	hibernate_copy_page = global_piglet_va + 3 * PAGE_SIZE;
1319 	hibernate_rle_page = global_piglet_va + 28 * PAGE_SIZE;
1320 
1321 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1322 	    HIBERNATE_CHUNK_SIZE);
1323 
1324 	/* Calculate the chunk regions */
1325 	for (i = 0; i < hib->nranges; i++) {
1326 		range_base = hib->ranges[i].base;
1327 		range_end = hib->ranges[i].end;
1328 
1329 		inaddr = range_base;
1330 
1331 		while (inaddr < range_end) {
1332 			chunks[hib->chunk_ctr].base = inaddr;
1333 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1334 				chunks[hib->chunk_ctr].end = inaddr +
1335 				    HIBERNATE_CHUNK_SIZE;
1336 			else
1337 				chunks[hib->chunk_ctr].end = range_end;
1338 
1339 			inaddr += HIBERNATE_CHUNK_SIZE;
1340 			hib->chunk_ctr ++;
1341 		}
1342 	}
1343 
1344 	uvm_pmr_dirty_everything();
1345 	uvm_pmr_zero_everything();
1346 
1347 	/* Compress and write the chunks in the chunktable */
1348 	for (i = 0; i < hib->chunk_ctr; i++) {
1349 		range_base = chunks[i].base;
1350 		range_end = chunks[i].end;
1351 
1352 		chunks[i].offset = blkctr + hib->image_offset;
1353 
1354 		/* Reset zlib for deflate */
1355 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1356 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1357 			return (ENOMEM);
1358 		}
1359 
1360 		inaddr = range_base;
1361 
1362 		/*
1363 		 * For each range, loop through its phys mem region
1364 		 * and write out the chunks (the last chunk might be
1365 		 * smaller than the chunk size).
1366 		 */
1367 		while (inaddr < range_end) {
1368 			out_remaining = PAGE_SIZE;
1369 			while (out_remaining > 0 && inaddr < range_end) {
1370 				/*
1371 				 * Adjust for regions that are not evenly
1372 				 * divisible by PAGE_SIZE or overflowed
1373 				 * pages from the previous iteration.
1374 				 */
1375 				temp_inaddr = (inaddr & PAGE_MASK) +
1376 				    hibernate_copy_page;
1377 
1378 				/* Deflate from temp_inaddr to IO page */
1379 				if (inaddr != range_end) {
1380 					if (inaddr % PAGE_SIZE == 0) {
1381 						rle = hibernate_write_rle(hib,
1382 							inaddr,
1383 							range_end,
1384 							&blkctr,
1385 							&out_remaining);
1386 					}
1387 
1388 					if (rle == 0) {
1389 						pmap_kenter_pa(hibernate_temp_page,
1390 							inaddr & PMAP_PA_MASK,
1391 							VM_PROT_ALL);
1392 
1393 						pmap_activate(curproc);
1394 
1395 						bcopy((caddr_t)hibernate_temp_page,
1396 							(caddr_t)hibernate_copy_page,
1397 							PAGE_SIZE);
1398 						inaddr += hibernate_deflate(hib,
1399 							temp_inaddr,
1400 							&out_remaining);
1401 					} else {
1402 						inaddr += rle * PAGE_SIZE;
1403 						if (inaddr > range_end)
1404 							inaddr = range_end;
1405 					}
1406 
1407 				}
1408 
1409 				if (out_remaining == 0) {
1410 					/* Filled up the page */
1411 					nblocks = PAGE_SIZE / DEV_BSIZE;
1412 
1413 					if ((err = hib->io_func(hib->dev,
1414 					    blkctr + hib->image_offset,
1415 					    (vaddr_t)hibernate_io_page,
1416 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1417 						DPRINTF("hib write error %d\n",
1418 						    err);
1419 						return (err);
1420 					}
1421 
1422 					blkctr += nblocks;
1423 				}
1424 			}
1425 		}
1426 
1427 		if (inaddr != range_end) {
1428 			DPRINTF("deflate range ended prematurely\n");
1429 			return (EINVAL);
1430 		}
1431 
1432 		/*
1433 		 * End of range. Round up to next secsize bytes
1434 		 * after finishing compress
1435 		 */
1436 		if (out_remaining == 0)
1437 			out_remaining = PAGE_SIZE;
1438 
1439 		/* Finish compress */
1440 		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
1441 		hibernate_state->hib_stream.avail_in = 0;
1442 		hibernate_state->hib_stream.next_out =
1443 		    (unsigned char *)hibernate_io_page +
1444 			(PAGE_SIZE - out_remaining);
1445 
1446 		/* We have an extra output page available for finalize */
1447 		hibernate_state->hib_stream.avail_out =
1448 			out_remaining + PAGE_SIZE;
1449 
1450 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1451 		    Z_STREAM_END) {
1452 			DPRINTF("deflate error in output stream: %d\n", err);
1453 			return (err);
1454 		}
1455 
1456 		out_remaining = hibernate_state->hib_stream.avail_out;
1457 
1458 		used = 2 * PAGE_SIZE - out_remaining;
1459 		nblocks = used / DEV_BSIZE;
1460 
1461 		/* Round up to next block if needed */
1462 		if (used % DEV_BSIZE != 0)
1463 			nblocks ++;
1464 
1465 		/* Write final block(s) for this chunk */
1466 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1467 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1468 		    HIB_W, hib->io_page))) {
1469 			DPRINTF("hib final write error %d\n", err);
1470 			return (err);
1471 		}
1472 
1473 		blkctr += nblocks;
1474 
1475 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1476 		    chunks[i].offset) * DEV_BSIZE;
1477 	}
1478 
1479 	hib->chunktable_offset = hib->image_offset + blkctr;
1480 	return (0);
1481 }
1482 
1483 /*
1484  * Reset the zlib stream state and allocate a new hiballoc area for either
1485  * inflate or deflate. This function is called once for each hibernate chunk.
1486  * Calling hiballoc_init multiple times is acceptable since the memory it is
1487  * provided is unmanaged memory (stolen). We use the memory provided to us
1488  * by the piglet allocated via the supplied hib.
1489  */
1490 int
1491 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1492 {
1493 	vaddr_t hibernate_zlib_start;
1494 	size_t hibernate_zlib_size;
1495 	char *pva = (char *)hib->piglet_va;
1496 	struct hibernate_zlib_state *hibernate_state;
1497 
1498 	hibernate_state =
1499 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1500 
1501 	if (!deflate)
1502 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1503 
1504 	/*
1505 	 * See piglet layout information at the start of this file for
1506 	 * information on the zlib page assignments.
1507 	 */
1508 	hibernate_zlib_start = (vaddr_t)(pva + (29 * PAGE_SIZE));
1509 	hibernate_zlib_size = 80 * PAGE_SIZE;
1510 
1511 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1512 	memset(hibernate_state, 0, PAGE_SIZE);
1513 
1514 	/* Set up stream structure */
1515 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1516 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1517 
1518 	/* Initialize the hiballoc arena for zlib allocs/frees */
1519 	hiballoc_init(&hibernate_state->hiballoc_arena,
1520 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1521 
1522 	if (deflate) {
1523 		return deflateInit(&hibernate_state->hib_stream,
1524 		    Z_BEST_SPEED);
1525 	} else
1526 		return inflateInit(&hibernate_state->hib_stream);
1527 }
1528 
1529 /*
1530  * Reads the hibernated memory image from disk, whose location and
1531  * size are recorded in hib. Begin by reading the persisted
1532  * chunk table, which records the original chunk placement location
1533  * and compressed size for each. Next, allocate a pig region of
1534  * sufficient size to hold the compressed image. Next, read the
1535  * chunks into the pig area (calling hibernate_read_chunks to do this),
1536  * and finally, if all of the above succeeds, clear the hibernate signature.
1537  * The function will then return to hibernate_resume, which will proceed
1538  * to unpack the pig image to the correct place in memory.
1539  */
1540 int
1541 hibernate_read_image(union hibernate_info *hib)
1542 {
1543 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1544 	paddr_t image_start, image_end, pig_start, pig_end;
1545 	struct hibernate_disk_chunk *chunks;
1546 	daddr_t blkctr;
1547 	vaddr_t chunktable = (vaddr_t)NULL;
1548 	paddr_t piglet_chunktable = hib->piglet_pa +
1549 	    HIBERNATE_CHUNK_SIZE;
1550 	int i, status;
1551 
1552 	status = 0;
1553 	pmap_activate(curproc);
1554 
1555 	/* Calculate total chunk table size in disk blocks */
1556 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1557 
1558 	blkctr = hib->chunktable_offset;
1559 
1560 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1561 	    &kp_none, &kd_nowait);
1562 
1563 	if (!chunktable)
1564 		return (1);
1565 
1566 	/* Map chunktable pages */
1567 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1568 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1569 		    VM_PROT_ALL);
1570 	pmap_update(pmap_kernel());
1571 
1572 	/* Read the chunktable from disk into the piglet chunktable */
1573 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1574 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1575 		hibernate_block_io(hib, blkctr, MAXPHYS,
1576 		    chunktable + i, 0);
1577 
1578 	blkctr = hib->image_offset;
1579 	compressed_size = 0;
1580 
1581 	chunks = (struct hibernate_disk_chunk *)chunktable;
1582 
1583 	for (i = 0; i < hib->chunk_ctr; i++)
1584 		compressed_size += chunks[i].compressed_size;
1585 
1586 	disk_size = compressed_size;
1587 
1588 	printf("unhibernating @ block %lld length %lu bytes\n",
1589 	    hib->sig_offset - chunktable_size,
1590 	    compressed_size);
1591 
1592 	/* Allocate the pig area */
1593 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1594 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1595 		status = 1;
1596 		goto unmap;
1597 	}
1598 
1599 	pig_end = pig_start + pig_sz;
1600 
1601 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1602 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1603 	image_start = image_end - disk_size;
1604 
1605 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1606 	    chunks);
1607 
1608 	/* Prepare the resume time pmap/page table */
1609 	hibernate_populate_resume_pt(hib, image_start, image_end);
1610 
1611 unmap:
1612 	/* Unmap chunktable pages */
1613 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1614 	pmap_update(pmap_kernel());
1615 
1616 	return (status);
1617 }
1618 
1619 /*
1620  * Read the hibernated memory chunks from disk (chunk information at this
1621  * point is stored in the piglet) into the pig area specified by
1622  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1623  * only chunk with overlap possibilities.
1624  */
1625 int
1626 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1627     paddr_t pig_end, size_t image_compr_size,
1628     struct hibernate_disk_chunk *chunks)
1629 {
1630 	paddr_t img_cur, piglet_base;
1631 	daddr_t blkctr;
1632 	size_t processed, compressed_size, read_size;
1633 	int nchunks, nfchunks, num_io_pages;
1634 	vaddr_t tempva, hibernate_fchunk_area;
1635 	short *fchunks, i, j;
1636 
1637 	tempva = (vaddr_t)NULL;
1638 	hibernate_fchunk_area = (vaddr_t)NULL;
1639 	nfchunks = 0;
1640 	piglet_base = hib->piglet_pa;
1641 	global_pig_start = pig_start;
1642 
1643 	pmap_activate(curproc);
1644 
1645 	/*
1646 	 * These mappings go into the resuming kernel's page table, and are
1647 	 * used only during image read. They dissappear from existence
1648 	 * when the suspended kernel is unpacked on top of us.
1649 	 */
1650 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1651 		&kd_nowait);
1652 	if (!tempva)
1653 		return (1);
1654 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1655 	    &kp_none, &kd_nowait);
1656 	if (!hibernate_fchunk_area)
1657 		return (1);
1658 
1659 	/* Final output chunk ordering VA */
1660 	fchunks = (short *)hibernate_fchunk_area;
1661 
1662 	/* Map the chunk ordering region */
1663 	for(i = 0; i < 24 ; i++)
1664 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1665 			piglet_base + ((4 + i) * PAGE_SIZE), VM_PROT_ALL);
1666 	pmap_update(pmap_kernel());
1667 
1668 	nchunks = hib->chunk_ctr;
1669 
1670 	/* Initially start all chunks as unplaced */
1671 	for (i = 0; i < nchunks; i++)
1672 		chunks[i].flags = 0;
1673 
1674 	/*
1675 	 * Search the list for chunks that are outside the pig area. These
1676 	 * can be placed first in the final output list.
1677 	 */
1678 	for (i = 0; i < nchunks; i++) {
1679 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1680 			fchunks[nfchunks] = i;
1681 			nfchunks++;
1682 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1683 		}
1684 	}
1685 
1686 	/*
1687 	 * Walk the ordering, place the chunks in ascending memory order.
1688 	 */
1689 	for (i = 0; i < nchunks; i++) {
1690 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1691 			fchunks[nfchunks] = i;
1692 			nfchunks++;
1693 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1694 		}
1695 	}
1696 
1697 	img_cur = pig_start;
1698 
1699 	for (i = 0; i < nfchunks; i++) {
1700 		blkctr = chunks[fchunks[i]].offset;
1701 		processed = 0;
1702 		compressed_size = chunks[fchunks[i]].compressed_size;
1703 
1704 		while (processed < compressed_size) {
1705 			if (compressed_size - processed >= MAXPHYS)
1706 				read_size = MAXPHYS;
1707 			else
1708 				read_size = compressed_size - processed;
1709 
1710 			/*
1711 			 * We're reading read_size bytes, offset from the
1712 			 * start of a page by img_cur % PAGE_SIZE, so the
1713 			 * end will be read_size + (img_cur % PAGE_SIZE)
1714 			 * from the start of the first page.  Round that
1715 			 * up to the next page size.
1716 			 */
1717 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1718 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1719 
1720 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1721 
1722 			/* Map pages for this read */
1723 			for (j = 0; j < num_io_pages; j ++)
1724 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1725 					img_cur + j * PAGE_SIZE, VM_PROT_ALL);
1726 
1727 			pmap_update(pmap_kernel());
1728 
1729 			hibernate_block_io(hib, blkctr, read_size,
1730 			    tempva + (img_cur & PAGE_MASK), 0);
1731 
1732 			blkctr += (read_size / DEV_BSIZE);
1733 
1734 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1735 			pmap_update(pmap_kernel());
1736 
1737 			processed += read_size;
1738 			img_cur += read_size;
1739 		}
1740 	}
1741 
1742 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1743 	pmap_update(pmap_kernel());
1744 
1745 	return (0);
1746 }
1747 
1748 /*
1749  * Hibernating a machine comprises the following operations:
1750  *  1. Calculating this machine's hibernate_info information
1751  *  2. Allocating a piglet and saving the piglet's physaddr
1752  *  3. Calculating the memory chunks
1753  *  4. Writing the compressed chunks to disk
1754  *  5. Writing the chunk table
1755  *  6. Writing the signature block (hibernate_info)
1756  *
1757  * On most architectures, the function calling hibernate_suspend would
1758  * then power off the machine using some MD-specific implementation.
1759  */
1760 int
1761 hibernate_suspend(void)
1762 {
1763 	union hibernate_info hib;
1764 	u_long start, end;
1765 
1766 	/*
1767 	 * Calculate memory ranges, swap offsets, etc.
1768 	 * This also allocates a piglet whose physaddr is stored in
1769 	 * hib->piglet_pa and vaddr stored in hib->piglet_va
1770 	 */
1771 	if (get_hibernate_info(&hib, 1)) {
1772 		DPRINTF("failed to obtain hibernate info\n");
1773 		return (1);
1774 	}
1775 
1776 	/* Find a page-addressed region in swap [start,end] */
1777 	if (uvm_hibswap(hib.dev, &start, &end)) {
1778 		printf("hibernate: cannot find any swap\n");
1779 		return (1);
1780 	}
1781 
1782 	if (end - start < 1000) {
1783 		printf("hibernate: insufficient swap (%lu is too small)\n",
1784 			end - start);
1785 		return (1);
1786 	}
1787 
1788 	/* Calculate block offsets in swap */
1789 	hib.image_offset = ctod(start);
1790 
1791 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
1792 	    hib.image_offset, ctod(end) - ctod(start));
1793 
1794 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1795 		VM_PROT_ALL);
1796 	pmap_activate(curproc);
1797 
1798 	DPRINTF("hibernate: writing chunks\n");
1799 	if (hibernate_write_chunks(&hib)) {
1800 		DPRINTF("hibernate_write_chunks failed\n");
1801 		return (1);
1802 	}
1803 
1804 	DPRINTF("hibernate: writing chunktable\n");
1805 	if (hibernate_write_chunktable(&hib)) {
1806 		DPRINTF("hibernate_write_chunktable failed\n");
1807 		return (1);
1808 	}
1809 
1810 	DPRINTF("hibernate: writing signature\n");
1811 	if (hibernate_write_signature(&hib)) {
1812 		DPRINTF("hibernate_write_signature failed\n");
1813 		return (1);
1814 	}
1815 
1816 	/* Allow the disk to settle */
1817 	delay(500000);
1818 
1819 	/*
1820 	 * Give the device-specific I/O function a notification that we're
1821 	 * done, and that it can clean up or shutdown as needed.
1822 	 */
1823 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
1824 
1825 	return (0);
1826 }
1827 
1828 int
1829 hibernate_alloc(void)
1830 {
1831 	KASSERT(global_piglet_va == 0);
1832 	KASSERT(hibernate_temp_page == 0);
1833 
1834 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
1835 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
1836 		return (ENOMEM);
1837 
1838 	/*
1839 	 * Allocate VA for the temp page.
1840 	 *
1841 	 * This will become part of the suspended kernel and will
1842 	 * be freed in hibernate_free, upon resume.
1843 	 */
1844 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1845 	    &kp_none, &kd_nowait);
1846 	if (!hibernate_temp_page) {
1847 		DPRINTF("out of memory allocating hibernate_temp_page\n");
1848 		return (ENOMEM);
1849 	}
1850 
1851 	return (0);
1852 }
1853 
1854 /*
1855  * Free items allocated by hibernate_alloc()
1856  */
1857 void
1858 hibernate_free(void)
1859 {
1860 	if (global_piglet_va)
1861 		uvm_pmr_free_piglet(global_piglet_va,
1862 		    4 * HIBERNATE_CHUNK_SIZE);
1863 
1864 	if (hibernate_temp_page)
1865 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1866 
1867 	pmap_update(pmap_kernel());
1868 
1869 	if (hibernate_temp_page)
1870 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1871 		    &kv_any, &kp_none);
1872 
1873 	global_piglet_va = 0;
1874 	hibernate_temp_page = 0;
1875 }
1876