xref: /openbsd-src/sys/kern/subr_hibernate.c (revision 87dd1dd0f81affe9ece279010204ab6bd409397e)
1 /*	$OpenBSD: subr_hibernate.c,v 1.114 2015/02/07 01:19:40 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/systm.h>
25 #include <sys/disklabel.h>
26 #include <sys/disk.h>
27 #include <sys/conf.h>
28 #include <sys/buf.h>
29 #include <sys/fcntl.h>
30 #include <sys/stat.h>
31 #include <sys/atomic.h>
32 
33 #include <uvm/uvm.h>
34 #include <uvm/uvm_swap.h>
35 
36 #include <machine/hibernate.h>
37 
38 /*
39  * Hibernate piglet layout information
40  *
41  * The piglet is a scratch area of memory allocated by the suspending kernel.
42  * Its phys and virt addrs are recorded in the signature block. The piglet is
43  * used to guarantee an unused area of memory that can be used by the resuming
44  * kernel for various things. The piglet is excluded during unpack operations.
45  * The piglet size is presently 4*HIBERNATE_CHUNK_SIZE (typically 4*4MB).
46  *
47  * Offset from piglet_base	Purpose
48  * ----------------------------------------------------------------------------
49  * 0				Private page for suspend I/O write functions
50  * 1*PAGE_SIZE			I/O page used during hibernate suspend
51  * 2*PAGE_SIZE			I/O page used during hibernate suspend
52  * 3*PAGE_SIZE			copy page used during hibernate suspend
53  * 4*PAGE_SIZE			final chunk ordering list (24 pages)
54  * 28*PAGE_SIZE			RLE utility page
55  * 29*PAGE_SIZE			start of hiballoc area
56  * 109*PAGE_SIZE		end of hiballoc area (80 pages)
57  * ...				unused
58  * HIBERNATE_CHUNK_SIZE		start of hibernate chunk table
59  * 2*HIBERNATE_CHUNK_SIZE	bounce area for chunks being unpacked
60  * 4*HIBERNATE_CHUNK_SIZE	end of piglet
61  */
62 
63 /* Temporary vaddr ranges used during hibernate */
64 vaddr_t hibernate_temp_page;
65 vaddr_t hibernate_copy_page;
66 vaddr_t hibernate_rle_page;
67 
68 /* Hibernate info as read from disk during resume */
69 union hibernate_info disk_hib;
70 
71 /*
72  * Global copy of the pig start address. This needs to be a global as we
73  * switch stacks after computing it - it can't be stored on the stack.
74  */
75 paddr_t global_pig_start;
76 
77 /*
78  * Global copies of the piglet start addresses (PA/VA). We store these
79  * as globals to avoid having to carry them around as parameters, as the
80  * piglet is allocated early and freed late - its lifecycle extends beyond
81  * that of the hibernate info union which is calculated on suspend/resume.
82  */
83 vaddr_t global_piglet_va;
84 paddr_t global_piglet_pa;
85 
86 /* #define HIB_DEBUG */
87 #ifdef HIB_DEBUG
88 int	hib_debug = 99;
89 #define DPRINTF(x...)     do { if (hib_debug) printf(x); } while (0)
90 #define DNPRINTF(n,x...)  do { if (hib_debug > (n)) printf(x); } while (0)
91 #else
92 #define DPRINTF(x...)
93 #define DNPRINTF(n,x...)
94 #endif
95 
96 #ifndef NO_PROPOLICE
97 extern long __guard_local;
98 #endif /* ! NO_PROPOLICE */
99 
100 void hibernate_copy_chunk_to_piglet(paddr_t, vaddr_t, size_t);
101 int hibernate_calc_rle(paddr_t, paddr_t);
102 int hibernate_write_rle(union hibernate_info *, paddr_t, paddr_t, daddr_t *,
103 	size_t *);
104 
105 #define MAX_RLE (HIBERNATE_CHUNK_SIZE / PAGE_SIZE)
106 
107 /*
108  * Hib alloc enforced alignment.
109  */
110 #define HIB_ALIGN		8 /* bytes alignment */
111 
112 /*
113  * sizeof builtin operation, but with alignment constraint.
114  */
115 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
116 
117 struct hiballoc_entry {	/* bookkeeping header placed directly in front of the memory it manages */
118 	size_t			hibe_use;	/* bytes in use immediately after this header */
119 	size_t			hibe_space;	/* free bytes that follow the used bytes */
120 	RB_ENTRY(hiballoc_entry) hibe_entry;	/* linkage in the arena tree, ordered by entry address */
121 };
122 
123 /*
124  * Sort hibernate memory ranges by ascending PA
125  */
126 void
127 hibernate_sort_ranges(union hibernate_info *hib_info)
128 {
129 	int i, j;
130 	struct hibernate_memory_range *ranges;
131 	paddr_t base, end;
132 
133 	ranges = hib_info->ranges;
134 
135 	for (i = 1; i < hib_info->nranges; i++) {
136 		j = i;
137 		while (j > 0 && ranges[j - 1].base > ranges[j].base) {
138 			base = ranges[j].base;
139 			end = ranges[j].end;
140 			ranges[j].base = ranges[j - 1].base;
141 			ranges[j].end = ranges[j - 1].end;
142 			ranges[j - 1].base = base;
143 			ranges[j - 1].end = end;
144 			j--;
145 		}
146 	}
147 }
148 
149 /*
150  * Compare hiballoc entries based on the address they manage.
151  *
152  * Since the address is fixed, relative to struct hiballoc_entry,
153  * we just compare the hiballoc_entry pointers.
154  */
155 static __inline int
156 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
157 {
158 	return l < r ? -1 : (l > r);
159 }
160 
161 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
162 
163 /*
164  * Given a hiballoc entry, return the address it manages.
165  */
166 static __inline void *
167 hib_entry_to_addr(struct hiballoc_entry *entry)
168 {
169 	caddr_t addr;
170 
171 	addr = (caddr_t)entry;
172 	addr += HIB_SIZEOF(struct hiballoc_entry);
173 	return addr;
174 }
175 
176 /*
177  * Given an address, find the hiballoc that corresponds.
178  */
179 static __inline struct hiballoc_entry*
180 hib_addr_to_entry(void *addr_param)
181 {
182 	caddr_t addr;
183 
184 	addr = (caddr_t)addr_param;
185 	addr -= HIB_SIZEOF(struct hiballoc_entry);
186 	return (struct hiballoc_entry*)addr;
187 }
188 
189 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
190 
191 /*
192  * Allocate memory from the arena.
193  *
194  * Returns NULL if no memory is available.
195  */
196 void *
197 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
198 {
199 	struct hiballoc_entry *entry, *new_entry;
200 	size_t find_sz;
201 
202 	/*
203 	 * Enforce alignment of HIB_ALIGN bytes.
204 	 *
205 	 * Note that, because the entry is put in front of the allocation,
206 	 * 0-byte allocations are guaranteed a unique address.
207 	 */
208 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
209 
210 	/*
211 	 * Find an entry with hibe_space >= find_sz.
212 	 *
213 	 * If the root node is not large enough, we switch to tree traversal.
214 	 * Because all entries are made at the bottom of the free space,
215 	 * traversal from the end has a slightly better chance of yielding
216 	 * a sufficiently large space.
217 	 */
218 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
219 	entry = RB_ROOT(&arena->hib_addrs);
220 	if (entry != NULL && entry->hibe_space < find_sz) {
221 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
222 			if (entry->hibe_space >= find_sz)
223 				break;
224 		}
225 	}
226 
227 	/*
228 	 * Insufficient or too fragmented memory.
229 	 */
230 	if (entry == NULL)
231 		return NULL;
232 
233 	/*
234 	 * Create new entry in allocated space.
235 	 */
236 	new_entry = (struct hiballoc_entry*)(
237 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
238 	new_entry->hibe_space = entry->hibe_space - find_sz;
239 	new_entry->hibe_use = alloc_sz;
240 
241 	/*
242 	 * Insert entry.
243 	 */
244 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
245 		panic("hib_alloc: insert failure");
246 	entry->hibe_space = 0;
247 
248 	/* Return address managed by entry. */
249 	return hib_entry_to_addr(new_entry);
250 }
251 
/*
 * Placeholder for handing an entropy buffer to the caller.
 *
 * Not implemented yet: the function returns without writing to either
 * *bufp or *bufplen, so callers keep whatever values they passed in.
 */
void
hib_getentropy(char **bufp, size_t *bufplen)
{
	/* fill in */
	(void)bufp;
	(void)bufplen;
}
257 
258 /*
259  * Free a pointer previously allocated from this arena.
260  *
261  * If addr is NULL, this will be silently accepted.
262  */
263 void
264 hib_free(struct hiballoc_arena *arena, void *addr)
265 {
266 	struct hiballoc_entry *entry, *prev;
267 
268 	if (addr == NULL)
269 		return;
270 
271 	/*
272 	 * Derive entry from addr and check it is really in this arena.
273 	 */
274 	entry = hib_addr_to_entry(addr);
275 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
276 		panic("hib_free: freed item %p not in hib arena", addr);
277 
278 	/*
279 	 * Give the space in entry to its predecessor.
280 	 *
281 	 * If entry has no predecessor, change its used space into free space
282 	 * instead.
283 	 */
284 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
285 	if (prev != NULL &&
286 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
287 	    prev->hibe_use + prev->hibe_space) == entry) {
288 		/* Merge entry. */
289 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
290 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
291 		    entry->hibe_use + entry->hibe_space;
292 	} else {
293 		/* Flip used memory to free space. */
294 		entry->hibe_space += entry->hibe_use;
295 		entry->hibe_use = 0;
296 	}
297 }
298 
299 /*
300  * Initialize hiballoc.
301  *
302  * The allocator will manage memmory at ptr, which is len bytes.
303  */
304 int
305 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
306 {
307 	struct hiballoc_entry *entry;
308 	caddr_t ptr;
309 	size_t len;
310 
311 	RB_INIT(&arena->hib_addrs);
312 
313 	/*
314 	 * Hib allocator enforces HIB_ALIGN alignment.
315 	 * Fixup ptr and len.
316 	 */
317 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
318 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
319 	len &= ~((size_t)HIB_ALIGN - 1);
320 
321 	/*
322 	 * Insufficient memory to be able to allocate and also do bookkeeping.
323 	 */
324 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
325 		return ENOMEM;
326 
327 	/*
328 	 * Create entry describing space.
329 	 */
330 	entry = (struct hiballoc_entry*)ptr;
331 	entry->hibe_use = 0;
332 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
333 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
334 
335 	return 0;
336 }
337 
338 /*
339  * Zero all free memory.
340  */
341 void
342 uvm_pmr_zero_everything(void)
343 {
344 	struct uvm_pmemrange	*pmr;
345 	struct vm_page		*pg;
346 	int			 i;
347 
348 	uvm_lock_fpageq();
349 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
350 		/* Zero single pages. */
351 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
352 		    != NULL) {
353 			uvm_pmr_remove(pmr, pg);
354 			uvm_pagezero(pg);
355 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
356 			uvmexp.zeropages++;
357 			uvm_pmr_insert(pmr, pg, 0);
358 		}
359 
360 		/* Zero multi page ranges. */
361 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
362 		    != NULL) {
363 			pg--; /* Size tree always has second page. */
364 			uvm_pmr_remove(pmr, pg);
365 			for (i = 0; i < pg->fpgsz; i++) {
366 				uvm_pagezero(&pg[i]);
367 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
368 				uvmexp.zeropages++;
369 			}
370 			uvm_pmr_insert(pmr, pg, 0);
371 		}
372 	}
373 	uvm_unlock_fpageq();
374 }
375 
376 /*
377  * Mark all memory as dirty.
378  *
379  * Used to inform the system that the clean memory isn't clean for some
380  * reason, for example because we just came back from hibernate.
381  */
382 void
383 uvm_pmr_dirty_everything(void)
384 {
385 	struct uvm_pmemrange	*pmr;
386 	struct vm_page		*pg;
387 	int			 i;
388 
389 	uvm_lock_fpageq();
390 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
391 		/* Dirty single pages. */
392 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
393 		    != NULL) {
394 			uvm_pmr_remove(pmr, pg);
395 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
396 			uvm_pmr_insert(pmr, pg, 0);
397 		}
398 
399 		/* Dirty multi page ranges. */
400 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
401 		    != NULL) {
402 			pg--; /* Size tree always has second page. */
403 			uvm_pmr_remove(pmr, pg);
404 			for (i = 0; i < pg->fpgsz; i++)
405 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
406 			uvm_pmr_insert(pmr, pg, 0);
407 		}
408 	}
409 
410 	uvmexp.zeropages = 0;
411 	uvm_unlock_fpageq();
412 }
413 
414 /*
415  * Allocate an area that can hold sz bytes and doesn't overlap with
416  * the piglet at piglet_pa.
417  */
418 int
419 uvm_pmr_alloc_pig(paddr_t *pa, psize_t sz, paddr_t piglet_pa)
420 {
421 	struct uvm_constraint_range pig_constraint;
422 	struct kmem_pa_mode kp_pig = {
423 		.kp_constraint = &pig_constraint,
424 		.kp_maxseg = 1
425 	};
426 	vaddr_t va;
427 
428 	sz = round_page(sz);
429 
430 	pig_constraint.ucr_low = piglet_pa + 4 * HIBERNATE_CHUNK_SIZE;
431 	pig_constraint.ucr_high = -1;
432 
433 	va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
434 	if (va == 0) {
435 		pig_constraint.ucr_low = 0;
436 		pig_constraint.ucr_high = piglet_pa - 1;
437 
438 		va = (vaddr_t)km_alloc(sz, &kv_any, &kp_pig, &kd_nowait);
439 		if (va == 0)
440 			return ENOMEM;
441 	}
442 
443 	pmap_extract(pmap_kernel(), va, pa);
444 	return 0;
445 }
446 
447 /*
448  * Allocate a piglet area.
449  *
450  * This needs to be in DMA-safe memory.
451  * Piglets are aligned.
452  *
453  * sz and align in bytes.
454  *
455  * The call will sleep for the pagedaemon to attempt to free memory.
456  * The pagedaemon may decide its not possible to free enough memory, causing
457  * the allocation to fail.
458  */
459 int
460 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
461 {
462 	struct kmem_pa_mode kp_piglet = {
463 		.kp_constraint = &dma_constraint,
464 		.kp_align = align,
465 		.kp_maxseg = 1
466 	};
467 
468 	/* Ensure align is a power of 2 */
469 	KASSERT((align & (align - 1)) == 0);
470 
471 	/*
472 	 * Fixup arguments: align must be at least PAGE_SIZE,
473 	 * sz will be converted to pagecount, since that is what
474 	 * pmemrange uses internally.
475 	 */
476 	if (align < PAGE_SIZE)
477 		kp_piglet.kp_align = PAGE_SIZE;
478 
479 	sz = round_page(sz);
480 
481 	*va = (vaddr_t)km_alloc(sz, &kv_any, &kp_piglet, &kd_nowait);
482 	if (*va == 0)
483 		return ENOMEM;
484 
485 	pmap_extract(pmap_kernel(), *va, pa);
486 	return 0;
487 }
488 
489 /*
490  * Free a piglet area.
491  */
492 void
493 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
494 {
495 	/*
496 	 * Fix parameters.
497 	 */
498 	sz = round_page(sz);
499 
500 	/*
501 	 * Free the physical and virtual memory.
502 	 */
503 	km_free((void *)va, sz, &kv_any, &kp_dma_contig);
504 }
505 
506 /*
507  * Physmem RLE compression support.
508  *
509  * Given a physical page address, return the number of pages starting at the
510  * address that are free.  Clamps to the number of pages in
511  * HIBERNATE_CHUNK_SIZE. Returns 0 if the page at addr is not free.
512  */
513 int
514 uvm_page_rle(paddr_t addr)
515 {
516 	struct vm_page		*pg, *pg_end;
517 	struct vm_physseg	*vmp;
518 	int			 pseg_idx, off_idx;
519 
520 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
521 	if (pseg_idx == -1)
522 		return 0;
523 
524 	vmp = &vm_physmem[pseg_idx];
525 	pg = &vmp->pgs[off_idx];
526 	if (!(pg->pg_flags & PQ_FREE))
527 		return 0;
528 
529 	/*
530 	 * Search for the first non-free page after pg.
531 	 * Note that the page may not be the first page in a free pmemrange,
532 	 * therefore pg->fpgsz cannot be used.
533 	 */
534 	for (pg_end = pg; pg_end <= vmp->lastpg &&
535 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
536 		;
537 	return min((pg_end - pg), HIBERNATE_CHUNK_SIZE/PAGE_SIZE);
538 }
539 
540 /*
541  * Fills out the hibernate_info union pointed to by hib
542  * with information about this machine (swap signature block
543  * offsets, number of memory ranges, kernel in use, etc)
544  */
545 int
546 get_hibernate_info(union hibernate_info *hib, int suspend)
547 {
548 	struct disklabel dl;
549 	char err_string[128], *dl_ret;
550 
551 #ifndef NO_PROPOLICE
552 	/* Save propolice guard */
553 	hib->guard = __guard_local;
554 #endif /* ! NO_PROPOLICE */
555 
556 	/* Determine I/O function to use */
557 	hib->io_func = get_hibernate_io_function(swdevt[0].sw_dev);
558 	if (hib->io_func == NULL)
559 		return (1);
560 
561 	/* Calculate hibernate device */
562 	hib->dev = swdevt[0].sw_dev;
563 
564 	/* Read disklabel (used to calculate signature and image offsets) */
565 	dl_ret = disk_readlabel(&dl, hib->dev, err_string, sizeof(err_string));
566 
567 	if (dl_ret) {
568 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
569 		return (1);
570 	}
571 
572 	/* Make sure we have a swap partition. */
573 	if (dl.d_partitions[1].p_fstype != FS_SWAP ||
574 	    DL_GETPSIZE(&dl.d_partitions[1]) == 0)
575 		return (1);
576 
577 	/* Make sure the signature can fit in one block */
578 	if (sizeof(union hibernate_info) > DEV_BSIZE)
579 		return (1);
580 
581 	/* Magic number */
582 	hib->magic = HIBERNATE_MAGIC;
583 
584 	/* Calculate signature block location */
585 	hib->sig_offset = DL_GETPSIZE(&dl.d_partitions[1]) -
586 	    sizeof(union hibernate_info)/DEV_BSIZE;
587 
588 	/* Stash kernel version information */
589 	memset(&hib->kernel_version, 0, 128);
590 	bcopy(version, &hib->kernel_version,
591 	    min(strlen(version), sizeof(hib->kernel_version)-1));
592 
593 	if (suspend) {
594 		/* Grab the previously-allocated piglet addresses */
595 		hib->piglet_va = global_piglet_va;
596 		hib->piglet_pa = global_piglet_pa;
597 		hib->io_page = (void *)hib->piglet_va;
598 
599 		/*
600 		 * Initialization of the hibernate IO function for drivers
601 		 * that need to do prep work (such as allocating memory or
602 		 * setting up data structures that cannot safely be done
603 		 * during suspend without causing side effects). There is
604 		 * a matching HIB_DONE call performed after the write is
605 		 * completed.
606 		 */
607 		if (hib->io_func(hib->dev, DL_GETPOFFSET(&dl.d_partitions[1]),
608 		    (vaddr_t)NULL, DL_GETPSIZE(&dl.d_partitions[1]),
609 		    HIB_INIT, hib->io_page))
610 			goto fail;
611 
612 	} else {
613 		/*
614 		 * Resuming kernels use a regular private page for the driver
615 		 * No need to free this I/O page as it will vanish as part of
616 		 * the resume.
617 		 */
618 		hib->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
619 		if (!hib->io_page)
620 			goto fail;
621 	}
622 
623 	if (get_hibernate_info_md(hib))
624 		goto fail;
625 
626 	return (0);
627 
628 fail:
629 	return (1);
630 }
631 
632 /*
633  * Allocate nitems*size bytes from the hiballoc area presently in use
634  */
635 void *
636 hibernate_zlib_alloc(void *unused, int nitems, int size)
637 {
638 	struct hibernate_zlib_state *hibernate_state;
639 
640 	hibernate_state =
641 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
642 
643 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
644 }
645 
646 /*
647  * Free the memory pointed to by addr in the hiballoc area presently in
648  * use
649  */
650 void
651 hibernate_zlib_free(void *unused, void *addr)
652 {
653 	struct hibernate_zlib_state *hibernate_state;
654 
655 	hibernate_state =
656 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
657 
658 	hib_free(&hibernate_state->hiballoc_arena, addr);
659 }
660 
661 /*
662  * Inflate next page of data from the image stream.
663  * The rle parameter is modified on exit to contain the number of pages to
664  * skip in the output stream (or 0 if this page was inflated into).
665  *
666  * Returns 0 if the stream contains additional data, or 1 if the stream is
667  * finished.
668  */
669 int
670 hibernate_inflate_page(int *rle)
671 {
672 	struct hibernate_zlib_state *hibernate_state;
673 	int i;
674 
675 	hibernate_state =
676 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
677 
678 	/* Set up the stream for RLE code inflate */
679 	hibernate_state->hib_stream.next_out = (unsigned char *)rle;
680 	hibernate_state->hib_stream.avail_out = sizeof(*rle);
681 
682 	/* Inflate RLE code */
683 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
684 	if (i != Z_OK && i != Z_STREAM_END) {
685 		/*
686 		 * XXX - this will likely reboot/hang most machines
687 		 *       since the console output buffer will be unmapped,
688 		 *       but there's not much else we can do here.
689 		 */
690 		panic("rle inflate stream error");
691 	}
692 
693 	if (hibernate_state->hib_stream.avail_out != 0) {
694 		/*
695 		 * XXX - this will likely reboot/hang most machines
696 		 *       since the console output buffer will be unmapped,
697 		 *       but there's not much else we can do here.
698 		 */
699 		panic("rle short inflate error");
700 	}
701 
702 	if (*rle < 0 || *rle > 1024) {
703 		/*
704 		 * XXX - this will likely reboot/hang most machines
705 		 *       since the console output buffer will be unmapped,
706 		 *       but there's not much else we can do here.
707 		 */
708 		panic("invalid rle count");
709 	}
710 
711 	if (i == Z_STREAM_END)
712 		return (1);
713 
714 	if (*rle != 0)
715 		return (0);
716 
717 	/* Set up the stream for page inflate */
718 	hibernate_state->hib_stream.next_out =
719 		(unsigned char *)HIBERNATE_INFLATE_PAGE;
720 	hibernate_state->hib_stream.avail_out = PAGE_SIZE;
721 
722 	/* Process next block of data */
723 	i = inflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH);
724 	if (i != Z_OK && i != Z_STREAM_END) {
725 		/*
726 		 * XXX - this will likely reboot/hang most machines
727 		 *       since the console output buffer will be unmapped,
728 		 *       but there's not much else we can do here.
729 		 */
730 		panic("inflate error");
731 	}
732 
733 	/* We should always have extracted a full page ... */
734 	if (hibernate_state->hib_stream.avail_out != 0) {
735 		/*
736 		 * XXX - this will likely reboot/hang most machines
737 		 *       since the console output buffer will be unmapped,
738 		 *       but there's not much else we can do here.
739 		 */
740 		panic("incomplete page");
741 	}
742 
743 	return (i == Z_STREAM_END);
744 }
745 
746 /*
747  * Inflate size bytes from src into dest, skipping any pages in
748  * [src..dest] that are special (see hibernate_inflate_skip)
749  *
750  * This function executes while using the resume-time stack
751  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
752  * will likely hang or reset the machine since the console output buffer
753  * will be unmapped.
754  */
755 void
756 hibernate_inflate_region(union hibernate_info *hib, paddr_t dest,
757     paddr_t src, size_t size)
758 {
759 	int end_stream = 0, rle;
760 	struct hibernate_zlib_state *hibernate_state;
761 
762 	hibernate_state =
763 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
764 
765 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
766 	hibernate_state->hib_stream.avail_in = size;
767 
768 	do {
769 		/*
770 		 * Is this a special page? If yes, redirect the
771 		 * inflate output to a scratch page (eg, discard it)
772 		 */
773 		if (hibernate_inflate_skip(hib, dest)) {
774 			hibernate_enter_resume_mapping(
775 			    HIBERNATE_INFLATE_PAGE,
776 			    HIBERNATE_INFLATE_PAGE, 0);
777 		} else {
778 			hibernate_enter_resume_mapping(
779 			    HIBERNATE_INFLATE_PAGE, dest, 0);
780 		}
781 
782 		hibernate_flush();
783 		end_stream = hibernate_inflate_page(&rle);
784 
785 		if (rle == 0)
786 			dest += PAGE_SIZE;
787 		else
788 			dest += (rle * PAGE_SIZE);
789 	} while (!end_stream);
790 }
791 
792 /*
793  * deflate from src into the I/O page, up to 'remaining' bytes
794  *
795  * Returns number of input bytes consumed, and may reset
796  * the 'remaining' parameter if not all the output space was consumed
797  * (this information is needed to know how much to write to disk
798  */
799 size_t
800 hibernate_deflate(union hibernate_info *hib, paddr_t src,
801     size_t *remaining)
802 {
803 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
804 	struct hibernate_zlib_state *hibernate_state;
805 
806 	hibernate_state =
807 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
808 
809 	/* Set up the stream for deflate */
810 	hibernate_state->hib_stream.next_in = (unsigned char *)src;
811 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
812 	hibernate_state->hib_stream.next_out =
813 		(unsigned char *)hibernate_io_page + (PAGE_SIZE - *remaining);
814 	hibernate_state->hib_stream.avail_out = *remaining;
815 
816 	/* Process next block of data */
817 	if (deflate(&hibernate_state->hib_stream, Z_SYNC_FLUSH) != Z_OK)
818 		panic("hibernate zlib deflate error");
819 
820 	/* Update pointers and return number of bytes consumed */
821 	*remaining = hibernate_state->hib_stream.avail_out;
822 	return (PAGE_SIZE - (src & PAGE_MASK)) -
823 	    hibernate_state->hib_stream.avail_in;
824 }
825 
826 /*
827  * Write the hibernation information specified in hiber_info
828  * to the location in swap previously calculated (last block of
829  * swap), called the "signature block".
830  */
831 int
832 hibernate_write_signature(union hibernate_info *hib)
833 {
834 	/* Write hibernate info to disk */
835 	return (hib->io_func(hib->dev, hib->sig_offset,
836 	    (vaddr_t)hib, DEV_BSIZE, HIB_W,
837 	    hib->io_page));
838 }
839 
840 /*
841  * Write the memory chunk table to the area in swap immediately
842  * preceding the signature block. The chunk table is stored
843  * in the piglet when this function is called.  Returns errno.
844  */
845 int
846 hibernate_write_chunktable(union hibernate_info *hib)
847 {
848 	vaddr_t hibernate_chunk_table_start;
849 	size_t hibernate_chunk_table_size;
850 	int i, err;
851 
852 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
853 
854 	hibernate_chunk_table_start = hib->piglet_va +
855 	    HIBERNATE_CHUNK_SIZE;
856 
857 	/* Write chunk table */
858 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
859 		if ((err = hib->io_func(hib->dev,
860 		    hib->chunktable_offset + (i/DEV_BSIZE),
861 		    (vaddr_t)(hibernate_chunk_table_start + i),
862 		    MAXPHYS, HIB_W, hib->io_page))) {
863 			DPRINTF("chunktable write error: %d\n", err);
864 			return (err);
865 		}
866 	}
867 
868 	return (0);
869 }
870 
871 /*
872  * Write an empty hiber_info to the swap signature block, which is
873  * guaranteed to not match any valid hib.
874  */
875 int
876 hibernate_clear_signature(void)
877 {
878 	union hibernate_info blank_hiber_info;
879 	union hibernate_info hib;
880 
881 	/* Zero out a blank hiber_info */
882 	memset(&blank_hiber_info, 0, sizeof(union hibernate_info));
883 
884 	/* Get the signature block location */
885 	if (get_hibernate_info(&hib, 0))
886 		return (1);
887 
888 	/* Write (zeroed) hibernate info to disk */
889 	DPRINTF("clearing hibernate signature block location: %lld\n",
890 		hib.sig_offset);
891 	if (hibernate_block_io(&hib,
892 	    hib.sig_offset,
893 	    DEV_BSIZE, (vaddr_t)&blank_hiber_info, 1))
894 		printf("Warning: could not clear hibernate signature\n");
895 
896 	return (0);
897 }
898 
899 /*
900  * Compare two hibernate_infos to determine if they are the same (eg,
901  * we should be performing a hibernate resume on this machine.
902  * Not all fields are checked - just enough to verify that the machine
903  * has the same memory configuration and kernel as the one that
904  * wrote the signature previously.
905  */
906 int
907 hibernate_compare_signature(union hibernate_info *mine,
908     union hibernate_info *disk)
909 {
910 	u_int i;
911 
912 	if (mine->nranges != disk->nranges) {
913 		DPRINTF("hibernate memory range count mismatch\n");
914 		return (1);
915 	}
916 
917 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0) {
918 		DPRINTF("hibernate kernel version mismatch\n");
919 		return (1);
920 	}
921 
922 	for (i = 0; i < mine->nranges; i++) {
923 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
924 		    (mine->ranges[i].end != disk->ranges[i].end) ) {
925 			DPRINTF("hib range %d mismatch [%p-%p != %p-%p]\n",
926 				i,
927 				(void *)mine->ranges[i].base,
928 				(void *)mine->ranges[i].end,
929 				(void *)disk->ranges[i].base,
930 				(void *)disk->ranges[i].end);
931 			return (1);
932 		}
933 	}
934 
935 	return (0);
936 }
937 
938 /*
939  * Transfers xfer_size bytes between the hibernate device specified in
940  * hib_info at offset blkctr and the vaddr specified at dest.
941  *
942  * Separate offsets and pages are used to handle misaligned reads (reads
943  * that span a page boundary).
944  *
945  * blkctr specifies a relative offset (relative to the start of swap),
946  * not an absolute disk offset
947  *
948  */
949 int
950 hibernate_block_io(union hibernate_info *hib, daddr_t blkctr,
951     size_t xfer_size, vaddr_t dest, int iswrite)
952 {
953 	struct buf *bp;
954 	struct bdevsw *bdsw;
955 	int error;
956 
957 	bp = geteblk(xfer_size);
958 	bdsw = &bdevsw[major(hib->dev)];
959 
960 	error = (*bdsw->d_open)(hib->dev, FREAD, S_IFCHR, curproc);
961 	if (error) {
962 		printf("hibernate_block_io open failed\n");
963 		return (1);
964 	}
965 
966 	if (iswrite)
967 		bcopy((caddr_t)dest, bp->b_data, xfer_size);
968 
969 	bp->b_bcount = xfer_size;
970 	bp->b_blkno = blkctr;
971 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
972 	SET(bp->b_flags, B_BUSY | (iswrite ? B_WRITE : B_READ) | B_RAW);
973 	bp->b_dev = hib->dev;
974 	(*bdsw->d_strategy)(bp);
975 
976 	error = biowait(bp);
977 	if (error) {
978 		printf("hib block_io biowait error %d blk %lld size %zu\n",
979 			error, (long long)blkctr, xfer_size);
980 		error = (*bdsw->d_close)(hib->dev, 0, S_IFCHR,
981 		    curproc);
982 		if (error)
983 			printf("hibernate_block_io error close failed\n");
984 		return (1);
985 	}
986 
987 	error = (*bdsw->d_close)(hib->dev, FREAD, S_IFCHR, curproc);
988 	if (error) {
989 		printf("hibernate_block_io close failed\n");
990 		return (1);
991 	}
992 
993 	if (!iswrite)
994 		bcopy(bp->b_data, (caddr_t)dest, xfer_size);
995 
996 	bp->b_flags |= B_INVAL;
997 	brelse(bp);
998 
999 	return (0);
1000 }
1001 
1002 /*
1003  * Reads the signature block from swap, checks against the current machine's
1004  * information. If the information matches, perform a resume by reading the
1005  * saved image into the pig area, and unpacking.
1006  */
1007 void
1008 hibernate_resume(void)
1009 {
1010 	union hibernate_info hib;
1011 	int s;
1012 
1013 	/* Get current running machine's hibernate info */
1014 	memset(&hib, 0, sizeof(hib));
1015 	if (get_hibernate_info(&hib, 0)) {
1016 		DPRINTF("couldn't retrieve machine's hibernate info\n");
1017 		return;
1018 	}
1019 
1020 	/* Read hibernate info from disk */
1021 	s = splbio();
1022 
1023 	DPRINTF("reading hibernate signature block location: %lld\n",
1024 		hib.sig_offset);
1025 
1026 	if (hibernate_block_io(&hib,
1027 	    hib.sig_offset,
1028 	    DEV_BSIZE, (vaddr_t)&disk_hib, 0)) {
1029 		DPRINTF("error in hibernate read");
1030 		splx(s);
1031 		return;
1032 	}
1033 
1034 	/* Check magic number */
1035 	if (disk_hib.magic != HIBERNATE_MAGIC) {
1036 		DPRINTF("wrong magic number in hibernate signature: %x\n",
1037 			disk_hib.magic);
1038 		splx(s);
1039 		return;
1040 	}
1041 
1042 	/*
1043 	 * We (possibly) found a hibernate signature. Clear signature first,
1044 	 * to prevent accidental resume or endless resume cycles later.
1045 	 */
1046 	if (hibernate_clear_signature()) {
1047 		DPRINTF("error clearing hibernate signature block\n");
1048 		splx(s);
1049 		return;
1050 	}
1051 
1052 	/*
1053 	 * If on-disk and in-memory hibernate signatures match,
1054 	 * this means we should do a resume from hibernate.
1055 	 */
1056 	if (hibernate_compare_signature(&hib, &disk_hib)) {
1057 		DPRINTF("mismatched hibernate signature block\n");
1058 		splx(s);
1059 		return;
1060 	}
1061 
1062 #ifdef MULTIPROCESSOR
1063 	/* XXX - if we fail later, we may need to rehatch APs on some archs */
1064 	DPRINTF("hibernate: quiescing APs\n");
1065 	hibernate_quiesce_cpus();
1066 #endif /* MULTIPROCESSOR */
1067 
1068 	/* Read the image from disk into the image (pig) area */
1069 	if (hibernate_read_image(&disk_hib))
1070 		goto fail;
1071 
1072 	DPRINTF("hibernate: quiescing devices\n");
1073 	if (config_suspend_all(DVACT_QUIESCE) != 0)
1074 		goto fail;
1075 
1076 	(void) splhigh();
1077 	hibernate_disable_intr_machdep();
1078 	cold = 1;
1079 
1080 	DPRINTF("hibernate: suspending devices\n");
1081 	if (config_suspend_all(DVACT_SUSPEND) != 0) {
1082 		cold = 0;
1083 		hibernate_enable_intr_machdep();
1084 		goto fail;
1085 	}
1086 
1087 	printf("Unpacking image...\n");
1088 
1089 	/* Switch stacks */
1090 	DPRINTF("hibernate: switching stacks\n");
1091 	hibernate_switch_stack_machdep();
1092 
1093 #ifndef NO_PROPOLICE
1094 	/* Start using suspended kernel's propolice guard */
1095 	__guard_local = disk_hib.guard;
1096 #endif /* ! NO_PROPOLICE */
1097 
1098 	/* Unpack and resume */
1099 	hibernate_unpack_image(&disk_hib);
1100 
1101 fail:
1102 	splx(s);
1103 	printf("\nUnable to resume hibernated image\n");
1104 }
1105 
1106 /*
1107  * Unpack image from pig area to original location by looping through the
1108  * list of output chunks in the order they should be restored (fchunks).
1109  *
1110  * Note that due to the stack smash protector and the fact that we have
1111  * switched stacks, it is not permitted to return from this function.
1112  */
1113 void
1114 hibernate_unpack_image(union hibernate_info *hib)
1115 {
1116 	struct hibernate_disk_chunk *chunks;
1117 	union hibernate_info local_hib;
1118 	paddr_t image_cur = global_pig_start;
1119 	short i, *fchunks;
1120 	char *pva;
1121 
1122 	/* Piglet will be identity mapped (VA == PA) */
1123 	pva = (char *)hib->piglet_pa;
1124 
1125 	fchunks = (short *)(pva + (4 * PAGE_SIZE));
1126 
1127 	chunks = (struct hibernate_disk_chunk *)(pva + HIBERNATE_CHUNK_SIZE);
1128 
1129 	/* Can't use hiber_info that's passed in after this point */
1130 	bcopy(hib, &local_hib, sizeof(union hibernate_info));
1131 
1132 	/* VA == PA */
1133 	local_hib.piglet_va = local_hib.piglet_pa;
1134 
1135 	/*
1136 	 * Point of no return. Once we pass this point, only kernel code can
1137 	 * be accessed. No global variables or other kernel data structures
1138 	 * are guaranteed to be coherent after unpack starts.
1139 	 *
1140 	 * The image is now in high memory (pig area), we unpack from the pig
1141 	 * to the correct location in memory. We'll eventually end up copying
1142 	 * on top of ourself, but we are assured the kernel code here is the
1143 	 * same between the hibernated and resuming kernel, and we are running
1144 	 * on our own stack, so the overwrite is ok.
1145 	 */
1146 	DPRINTF("hibernate: activating alt. pagetable and starting unpack\n");
1147 	hibernate_activate_resume_pt_machdep();
1148 
1149 	for (i = 0; i < local_hib.chunk_ctr; i++) {
1150 		/* Reset zlib for inflate */
1151 		if (hibernate_zlib_reset(&local_hib, 0) != Z_OK)
1152 			panic("hibernate failed to reset zlib for inflate");
1153 
1154 		hibernate_process_chunk(&local_hib, &chunks[fchunks[i]],
1155 		    image_cur);
1156 
1157 		image_cur += chunks[fchunks[i]].compressed_size;
1158 
1159 	}
1160 
1161 	/*
1162 	 * Resume the loaded kernel by jumping to the MD resume vector.
1163 	 * We won't be returning from this call.
1164 	 */
1165 	hibernate_resume_machdep();
1166 }
1167 
1168 /*
1169  * Bounce a compressed image chunk to the piglet, entering mappings for the
1170  * copied pages as needed
1171  */
1172 void
1173 hibernate_copy_chunk_to_piglet(paddr_t img_cur, vaddr_t piglet, size_t size)
1174 {
1175 	size_t ct, ofs;
1176 	paddr_t src = img_cur;
1177 	vaddr_t dest = piglet;
1178 
1179 	/* Copy first partial page */
1180 	ct = (PAGE_SIZE) - (src & PAGE_MASK);
1181 	ofs = (src & PAGE_MASK);
1182 
1183 	if (ct < PAGE_SIZE) {
1184 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE,
1185 			(src - ofs), 0);
1186 		hibernate_flush();
1187 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE + ofs), (caddr_t)dest, ct);
1188 		src += ct;
1189 		dest += ct;
1190 	}
1191 
1192 	/* Copy remaining pages */
1193 	while (src < size + img_cur) {
1194 		hibernate_enter_resume_mapping(HIBERNATE_INFLATE_PAGE, src, 0);
1195 		hibernate_flush();
1196 		ct = PAGE_SIZE;
1197 		bcopy((caddr_t)(HIBERNATE_INFLATE_PAGE), (caddr_t)dest, ct);
1198 		hibernate_flush();
1199 		src += ct;
1200 		dest += ct;
1201 	}
1202 }
1203 
1204 /*
1205  * Process a chunk by bouncing it to the piglet, followed by unpacking
1206  */
1207 void
1208 hibernate_process_chunk(union hibernate_info *hib,
1209     struct hibernate_disk_chunk *chunk, paddr_t img_cur)
1210 {
1211 	char *pva = (char *)hib->piglet_va;
1212 
1213 	hibernate_copy_chunk_to_piglet(img_cur,
1214 	 (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)), chunk->compressed_size);
1215 	hibernate_inflate_region(hib, chunk->base,
1216 	    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1217 	    chunk->compressed_size);
1218 }
1219 
1220 /*
1221  * Calculate RLE component for 'inaddr'. Clamps to max RLE pages between
1222  * inaddr and range_end.
1223  */
1224 int
1225 hibernate_calc_rle(paddr_t inaddr, paddr_t range_end)
1226 {
1227 	int rle;
1228 
1229 	rle = uvm_page_rle(inaddr);
1230 	KASSERT(rle >= 0 && rle <= MAX_RLE);
1231 
1232 	/* Clamp RLE to range end */
1233 	if (rle > 0 && inaddr + (rle * PAGE_SIZE) > range_end)
1234 		rle = (range_end - inaddr) / PAGE_SIZE;
1235 
1236 	return (rle);
1237 }
1238 
1239 /*
1240  * Write the RLE byte for page at 'inaddr' to the output stream.
1241  * Returns the number of pages to be skipped at 'inaddr'.
1242  */
1243 int
1244 hibernate_write_rle(union hibernate_info *hib, paddr_t inaddr,
1245 	paddr_t range_end, daddr_t *blkctr,
1246 	size_t *out_remaining)
1247 {
1248 	int rle, err, *rleloc;
1249 	struct hibernate_zlib_state *hibernate_state;
1250 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1251 
1252 	hibernate_state =
1253 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1254 
1255 	rle = hibernate_calc_rle(inaddr, range_end);
1256 
1257 	rleloc = (int *)hibernate_rle_page + MAX_RLE - 1;
1258 	*rleloc = rle;
1259 
1260 	/* Deflate the RLE byte into the stream */
1261 	hibernate_deflate(hib, (paddr_t)rleloc, out_remaining);
1262 
1263 	/* Did we fill the output page? If so, flush to disk */
1264 	if (*out_remaining == 0) {
1265 		if ((err = hib->io_func(hib->dev, *blkctr + hib->image_offset,
1266 			(vaddr_t)hibernate_io_page, PAGE_SIZE, HIB_W,
1267 			hib->io_page))) {
1268 				DPRINTF("hib write error %d\n", err);
1269 				return (err);
1270 		}
1271 
1272 		*blkctr += PAGE_SIZE / DEV_BSIZE;
1273 		*out_remaining = PAGE_SIZE;
1274 
1275 		/* If we didn't deflate the entire RLE byte, finish it now */
1276 		if (hibernate_state->hib_stream.avail_in != 0)
1277 			hibernate_deflate(hib,
1278 				(vaddr_t)hibernate_state->hib_stream.next_in,
1279 				out_remaining);
1280 	}
1281 
1282 	return (rle);
1283 }
1284 
1285 /*
1286  * Write a compressed version of this machine's memory to disk, at the
1287  * precalculated swap offset:
1288  *
1289  * end of swap - signature block size - chunk table size - memory size
1290  *
1291  * The function begins by looping through each phys mem range, cutting each
1292  * one into MD sized chunks. These chunks are then compressed individually
1293  * and written out to disk, in phys mem order. Some chunks might compress
1294  * more than others, and for this reason, each chunk's size is recorded
1295  * in the chunk table, which is written to disk after the image has
1296  * properly been compressed and written (in hibernate_write_chunktable).
1297  *
1298  * When this function is called, the machine is nearly suspended - most
1299  * devices are quiesced/suspended, interrupts are off, and cold has
1300  * been set. This means that there can be no side effects once the
1301  * write has started, and the write function itself can also have no
1302  * side effects. This also means no printfs are permitted (since printf
1303  * has side effects.)
1304  *
1305  * Return values :
1306  *
1307  * 0      - success
1308  * EIO    - I/O error occurred writing the chunks
1309  * EINVAL - Failed to write a complete range
1310  * ENOMEM - Memory allocation failure during preparation of the zlib arena
1311  */
1312 int
1313 hibernate_write_chunks(union hibernate_info *hib)
1314 {
1315 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1316 	size_t nblocks, out_remaining, used;
1317 	struct hibernate_disk_chunk *chunks;
1318 	vaddr_t hibernate_io_page = hib->piglet_va + PAGE_SIZE;
1319 	daddr_t blkctr = 0;
1320 	int i, rle, err;
1321 	struct hibernate_zlib_state *hibernate_state;
1322 
1323 	hibernate_state =
1324 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1325 
1326 	hib->chunk_ctr = 0;
1327 
1328 	/*
1329 	 * Map the utility VAs to the piglet. See the piglet map at the
1330 	 * top of this file for piglet layout information.
1331 	 */
1332 	hibernate_copy_page = hib->piglet_va + 3 * PAGE_SIZE;
1333 	hibernate_rle_page = hib->piglet_va + 28 * PAGE_SIZE;
1334 
1335 	chunks = (struct hibernate_disk_chunk *)(hib->piglet_va +
1336 	    HIBERNATE_CHUNK_SIZE);
1337 
1338 	/* Calculate the chunk regions */
1339 	for (i = 0; i < hib->nranges; i++) {
1340 		range_base = hib->ranges[i].base;
1341 		range_end = hib->ranges[i].end;
1342 
1343 		inaddr = range_base;
1344 
1345 		while (inaddr < range_end) {
1346 			chunks[hib->chunk_ctr].base = inaddr;
1347 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1348 				chunks[hib->chunk_ctr].end = inaddr +
1349 				    HIBERNATE_CHUNK_SIZE;
1350 			else
1351 				chunks[hib->chunk_ctr].end = range_end;
1352 
1353 			inaddr += HIBERNATE_CHUNK_SIZE;
1354 			hib->chunk_ctr ++;
1355 		}
1356 	}
1357 
1358 	uvm_pmr_dirty_everything();
1359 	uvm_pmr_zero_everything();
1360 
1361 	/* Compress and write the chunks in the chunktable */
1362 	for (i = 0; i < hib->chunk_ctr; i++) {
1363 		range_base = chunks[i].base;
1364 		range_end = chunks[i].end;
1365 
1366 		chunks[i].offset = blkctr + hib->image_offset;
1367 
1368 		/* Reset zlib for deflate */
1369 		if (hibernate_zlib_reset(hib, 1) != Z_OK) {
1370 			DPRINTF("hibernate_zlib_reset failed for deflate\n");
1371 			return (ENOMEM);
1372 		}
1373 
1374 		inaddr = range_base;
1375 
1376 		/*
1377 		 * For each range, loop through its phys mem region
1378 		 * and write out the chunks (the last chunk might be
1379 		 * smaller than the chunk size).
1380 		 */
1381 		while (inaddr < range_end) {
1382 			out_remaining = PAGE_SIZE;
1383 			while (out_remaining > 0 && inaddr < range_end) {
1384 				/*
1385 				 * Adjust for regions that are not evenly
1386 				 * divisible by PAGE_SIZE or overflowed
1387 				 * pages from the previous iteration.
1388 				 */
1389 				temp_inaddr = (inaddr & PAGE_MASK) +
1390 				    hibernate_copy_page;
1391 
1392 				/* Deflate from temp_inaddr to IO page */
1393 				if (inaddr != range_end) {
1394 					if (inaddr % PAGE_SIZE == 0) {
1395 						rle = hibernate_write_rle(hib,
1396 							inaddr,
1397 							range_end,
1398 							&blkctr,
1399 							&out_remaining);
1400 					}
1401 
1402 					if (rle == 0) {
1403 						pmap_kenter_pa(hibernate_temp_page,
1404 							inaddr & PMAP_PA_MASK,
1405 							PROT_READ);
1406 
1407 						bcopy((caddr_t)hibernate_temp_page,
1408 							(caddr_t)hibernate_copy_page,
1409 							PAGE_SIZE);
1410 						inaddr += hibernate_deflate(hib,
1411 							temp_inaddr,
1412 							&out_remaining);
1413 					} else {
1414 						inaddr += rle * PAGE_SIZE;
1415 						if (inaddr > range_end)
1416 							inaddr = range_end;
1417 					}
1418 
1419 				}
1420 
1421 				if (out_remaining == 0) {
1422 					/* Filled up the page */
1423 					nblocks = PAGE_SIZE / DEV_BSIZE;
1424 
1425 					if ((err = hib->io_func(hib->dev,
1426 					    blkctr + hib->image_offset,
1427 					    (vaddr_t)hibernate_io_page,
1428 					    PAGE_SIZE, HIB_W, hib->io_page))) {
1429 						DPRINTF("hib write error %d\n",
1430 						    err);
1431 						return (err);
1432 					}
1433 
1434 					blkctr += nblocks;
1435 				}
1436 			}
1437 		}
1438 
1439 		if (inaddr != range_end) {
1440 			DPRINTF("deflate range ended prematurely\n");
1441 			return (EINVAL);
1442 		}
1443 
1444 		/*
1445 		 * End of range. Round up to next secsize bytes
1446 		 * after finishing compress
1447 		 */
1448 		if (out_remaining == 0)
1449 			out_remaining = PAGE_SIZE;
1450 
1451 		/* Finish compress */
1452 		hibernate_state->hib_stream.next_in = (unsigned char *)inaddr;
1453 		hibernate_state->hib_stream.avail_in = 0;
1454 		hibernate_state->hib_stream.next_out =
1455 		    (unsigned char *)hibernate_io_page +
1456 			(PAGE_SIZE - out_remaining);
1457 
1458 		/* We have an extra output page available for finalize */
1459 		hibernate_state->hib_stream.avail_out =
1460 			out_remaining + PAGE_SIZE;
1461 
1462 		if ((err = deflate(&hibernate_state->hib_stream, Z_FINISH)) !=
1463 		    Z_STREAM_END) {
1464 			DPRINTF("deflate error in output stream: %d\n", err);
1465 			return (err);
1466 		}
1467 
1468 		out_remaining = hibernate_state->hib_stream.avail_out;
1469 
1470 		used = 2 * PAGE_SIZE - out_remaining;
1471 		nblocks = used / DEV_BSIZE;
1472 
1473 		/* Round up to next block if needed */
1474 		if (used % DEV_BSIZE != 0)
1475 			nblocks ++;
1476 
1477 		/* Write final block(s) for this chunk */
1478 		if ((err = hib->io_func(hib->dev, blkctr + hib->image_offset,
1479 		    (vaddr_t)hibernate_io_page, nblocks*DEV_BSIZE,
1480 		    HIB_W, hib->io_page))) {
1481 			DPRINTF("hib final write error %d\n", err);
1482 			return (err);
1483 		}
1484 
1485 		blkctr += nblocks;
1486 
1487 		chunks[i].compressed_size = (blkctr + hib->image_offset -
1488 		    chunks[i].offset) * DEV_BSIZE;
1489 	}
1490 
1491 	hib->chunktable_offset = hib->image_offset + blkctr;
1492 	return (0);
1493 }
1494 
1495 /*
1496  * Reset the zlib stream state and allocate a new hiballoc area for either
1497  * inflate or deflate. This function is called once for each hibernate chunk.
1498  * Calling hiballoc_init multiple times is acceptable since the memory it is
1499  * provided is unmanaged memory (stolen). We use the memory provided to us
1500  * by the piglet allocated via the supplied hib.
1501  */
1502 int
1503 hibernate_zlib_reset(union hibernate_info *hib, int deflate)
1504 {
1505 	vaddr_t hibernate_zlib_start;
1506 	size_t hibernate_zlib_size;
1507 	char *pva = (char *)hib->piglet_va;
1508 	struct hibernate_zlib_state *hibernate_state;
1509 
1510 	hibernate_state =
1511 	    (struct hibernate_zlib_state *)HIBERNATE_HIBALLOC_PAGE;
1512 
1513 	if (!deflate)
1514 		pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1515 
1516 	/*
1517 	 * See piglet layout information at the start of this file for
1518 	 * information on the zlib page assignments.
1519 	 */
1520 	hibernate_zlib_start = (vaddr_t)(pva + (29 * PAGE_SIZE));
1521 	hibernate_zlib_size = 80 * PAGE_SIZE;
1522 
1523 	memset((void *)hibernate_zlib_start, 0, hibernate_zlib_size);
1524 	memset(hibernate_state, 0, PAGE_SIZE);
1525 
1526 	/* Set up stream structure */
1527 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1528 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1529 
1530 	/* Initialize the hiballoc arena for zlib allocs/frees */
1531 	hiballoc_init(&hibernate_state->hiballoc_arena,
1532 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1533 
1534 	if (deflate) {
1535 		return deflateInit(&hibernate_state->hib_stream,
1536 		    Z_BEST_SPEED);
1537 	} else
1538 		return inflateInit(&hibernate_state->hib_stream);
1539 }
1540 
1541 /*
1542  * Reads the hibernated memory image from disk, whose location and
1543  * size are recorded in hib. Begin by reading the persisted
1544  * chunk table, which records the original chunk placement location
1545  * and compressed size for each. Next, allocate a pig region of
1546  * sufficient size to hold the compressed image. Next, read the
1547  * chunks into the pig area (calling hibernate_read_chunks to do this),
1548  * and finally, if all of the above succeeds, clear the hibernate signature.
1549  * The function will then return to hibernate_resume, which will proceed
1550  * to unpack the pig image to the correct place in memory.
1551  */
1552 int
1553 hibernate_read_image(union hibernate_info *hib)
1554 {
1555 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1556 	paddr_t image_start, image_end, pig_start, pig_end;
1557 	struct hibernate_disk_chunk *chunks;
1558 	daddr_t blkctr;
1559 	vaddr_t chunktable = (vaddr_t)NULL;
1560 	paddr_t piglet_chunktable = hib->piglet_pa +
1561 	    HIBERNATE_CHUNK_SIZE;
1562 	int i, status;
1563 
1564 	status = 0;
1565 	pmap_activate(curproc);
1566 
1567 	/* Calculate total chunk table size in disk blocks */
1568 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / DEV_BSIZE;
1569 
1570 	blkctr = hib->chunktable_offset;
1571 
1572 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1573 	    &kp_none, &kd_nowait);
1574 
1575 	if (!chunktable)
1576 		return (1);
1577 
1578 	/* Map chunktable pages */
1579 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1580 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i,
1581 		    PROT_READ | PROT_WRITE);
1582 	pmap_update(pmap_kernel());
1583 
1584 	/* Read the chunktable from disk into the piglet chunktable */
1585 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1586 	    i += MAXPHYS, blkctr += MAXPHYS/DEV_BSIZE)
1587 		hibernate_block_io(hib, blkctr, MAXPHYS,
1588 		    chunktable + i, 0);
1589 
1590 	blkctr = hib->image_offset;
1591 	compressed_size = 0;
1592 
1593 	chunks = (struct hibernate_disk_chunk *)chunktable;
1594 
1595 	for (i = 0; i < hib->chunk_ctr; i++)
1596 		compressed_size += chunks[i].compressed_size;
1597 
1598 	disk_size = compressed_size;
1599 
1600 	printf("unhibernating @ block %lld length %lu bytes\n",
1601 	    hib->sig_offset - chunktable_size,
1602 	    compressed_size);
1603 
1604 	/* Allocate the pig area */
1605 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1606 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz, hib->piglet_pa) == ENOMEM) {
1607 		status = 1;
1608 		goto unmap;
1609 	}
1610 
1611 	pig_end = pig_start + pig_sz;
1612 
1613 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1614 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1615 	image_start = image_end - disk_size;
1616 
1617 	hibernate_read_chunks(hib, image_start, image_end, disk_size,
1618 	    chunks);
1619 
1620 	/* Prepare the resume time pmap/page table */
1621 	hibernate_populate_resume_pt(hib, image_start, image_end);
1622 
1623 unmap:
1624 	/* Unmap chunktable pages */
1625 	pmap_kremove(chunktable, HIBERNATE_CHUNK_TABLE_SIZE);
1626 	pmap_update(pmap_kernel());
1627 
1628 	return (status);
1629 }
1630 
1631 /*
1632  * Read the hibernated memory chunks from disk (chunk information at this
1633  * point is stored in the piglet) into the pig area specified by
1634  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1635  * only chunk with overlap possibilities.
1636  */
1637 int
1638 hibernate_read_chunks(union hibernate_info *hib, paddr_t pig_start,
1639     paddr_t pig_end, size_t image_compr_size,
1640     struct hibernate_disk_chunk *chunks)
1641 {
1642 	paddr_t img_cur, piglet_base;
1643 	daddr_t blkctr;
1644 	size_t processed, compressed_size, read_size;
1645 	int nchunks, nfchunks, num_io_pages;
1646 	vaddr_t tempva, hibernate_fchunk_area;
1647 	short *fchunks, i, j;
1648 
1649 	tempva = (vaddr_t)NULL;
1650 	hibernate_fchunk_area = (vaddr_t)NULL;
1651 	nfchunks = 0;
1652 	piglet_base = hib->piglet_pa;
1653 	global_pig_start = pig_start;
1654 
1655 	/*
1656 	 * These mappings go into the resuming kernel's page table, and are
1657 	 * used only during image read. They dissappear from existence
1658 	 * when the suspended kernel is unpacked on top of us.
1659 	 */
1660 	tempva = (vaddr_t)km_alloc(MAXPHYS + PAGE_SIZE, &kv_any, &kp_none,
1661 		&kd_nowait);
1662 	if (!tempva)
1663 		return (1);
1664 	hibernate_fchunk_area = (vaddr_t)km_alloc(24 * PAGE_SIZE, &kv_any,
1665 	    &kp_none, &kd_nowait);
1666 	if (!hibernate_fchunk_area)
1667 		return (1);
1668 
1669 	/* Final output chunk ordering VA */
1670 	fchunks = (short *)hibernate_fchunk_area;
1671 
1672 	/* Map the chunk ordering region */
1673 	for(i = 0; i < 24 ; i++)
1674 		pmap_kenter_pa(hibernate_fchunk_area + (i * PAGE_SIZE),
1675 			piglet_base + ((4 + i) * PAGE_SIZE),
1676 			PROT_READ | PROT_WRITE);
1677 	pmap_update(pmap_kernel());
1678 
1679 	nchunks = hib->chunk_ctr;
1680 
1681 	/* Initially start all chunks as unplaced */
1682 	for (i = 0; i < nchunks; i++)
1683 		chunks[i].flags = 0;
1684 
1685 	/*
1686 	 * Search the list for chunks that are outside the pig area. These
1687 	 * can be placed first in the final output list.
1688 	 */
1689 	for (i = 0; i < nchunks; i++) {
1690 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1691 			fchunks[nfchunks] = i;
1692 			nfchunks++;
1693 			chunks[i].flags |= HIBERNATE_CHUNK_PLACED;
1694 		}
1695 	}
1696 
1697 	/*
1698 	 * Walk the ordering, place the chunks in ascending memory order.
1699 	 */
1700 	for (i = 0; i < nchunks; i++) {
1701 		if (chunks[i].flags != HIBERNATE_CHUNK_PLACED) {
1702 			fchunks[nfchunks] = i;
1703 			nfchunks++;
1704 			chunks[i].flags = HIBERNATE_CHUNK_PLACED;
1705 		}
1706 	}
1707 
1708 	img_cur = pig_start;
1709 
1710 	for (i = 0; i < nfchunks; i++) {
1711 		blkctr = chunks[fchunks[i]].offset;
1712 		processed = 0;
1713 		compressed_size = chunks[fchunks[i]].compressed_size;
1714 
1715 		while (processed < compressed_size) {
1716 			if (compressed_size - processed >= MAXPHYS)
1717 				read_size = MAXPHYS;
1718 			else
1719 				read_size = compressed_size - processed;
1720 
1721 			/*
1722 			 * We're reading read_size bytes, offset from the
1723 			 * start of a page by img_cur % PAGE_SIZE, so the
1724 			 * end will be read_size + (img_cur % PAGE_SIZE)
1725 			 * from the start of the first page.  Round that
1726 			 * up to the next page size.
1727 			 */
1728 			num_io_pages = (read_size + (img_cur % PAGE_SIZE)
1729 				+ PAGE_SIZE - 1) / PAGE_SIZE;
1730 
1731 			KASSERT(num_io_pages <= MAXPHYS/PAGE_SIZE + 1);
1732 
1733 			/* Map pages for this read */
1734 			for (j = 0; j < num_io_pages; j ++)
1735 				pmap_kenter_pa(tempva + j * PAGE_SIZE,
1736 				    img_cur + j * PAGE_SIZE,
1737 				    PROT_READ | PROT_WRITE);
1738 
1739 			pmap_update(pmap_kernel());
1740 
1741 			hibernate_block_io(hib, blkctr, read_size,
1742 			    tempva + (img_cur & PAGE_MASK), 0);
1743 
1744 			blkctr += (read_size / DEV_BSIZE);
1745 
1746 			pmap_kremove(tempva, num_io_pages * PAGE_SIZE);
1747 			pmap_update(pmap_kernel());
1748 
1749 			processed += read_size;
1750 			img_cur += read_size;
1751 		}
1752 	}
1753 
1754 	pmap_kremove(hibernate_fchunk_area, 24 * PAGE_SIZE);
1755 	pmap_update(pmap_kernel());
1756 
1757 	return (0);
1758 }
1759 
1760 /*
1761  * Hibernating a machine comprises the following operations:
1762  *  1. Calculating this machine's hibernate_info information
1763  *  2. Allocating a piglet and saving the piglet's physaddr
1764  *  3. Calculating the memory chunks
1765  *  4. Writing the compressed chunks to disk
1766  *  5. Writing the chunk table
1767  *  6. Writing the signature block (hibernate_info)
1768  *
1769  * On most architectures, the function calling hibernate_suspend would
1770  * then power off the machine using some MD-specific implementation.
1771  */
1772 int
1773 hibernate_suspend(void)
1774 {
1775 	union hibernate_info hib;
1776 	u_long start, end;
1777 
1778 	/*
1779 	 * Calculate memory ranges, swap offsets, etc.
1780 	 * This also allocates a piglet whose physaddr is stored in
1781 	 * hib->piglet_pa and vaddr stored in hib->piglet_va
1782 	 */
1783 	if (get_hibernate_info(&hib, 1)) {
1784 		DPRINTF("failed to obtain hibernate info\n");
1785 		return (1);
1786 	}
1787 
1788 	/* Find a page-addressed region in swap [start,end] */
1789 	if (uvm_hibswap(hib.dev, &start, &end)) {
1790 		printf("hibernate: cannot find any swap\n");
1791 		return (1);
1792 	}
1793 
1794 	if (end - start < 1000) {
1795 		printf("hibernate: insufficient swap (%lu is too small)\n",
1796 			end - start);
1797 		return (1);
1798 	}
1799 
1800 	/* Calculate block offsets in swap */
1801 	hib.image_offset = ctod(start);
1802 
1803 	DPRINTF("hibernate @ block %lld max-length %lu blocks\n",
1804 	    hib.image_offset, ctod(end) - ctod(start));
1805 
1806 	pmap_activate(curproc);
1807 	DPRINTF("hibernate: writing chunks\n");
1808 	if (hibernate_write_chunks(&hib)) {
1809 		DPRINTF("hibernate_write_chunks failed\n");
1810 		return (1);
1811 	}
1812 
1813 	DPRINTF("hibernate: writing chunktable\n");
1814 	if (hibernate_write_chunktable(&hib)) {
1815 		DPRINTF("hibernate_write_chunktable failed\n");
1816 		return (1);
1817 	}
1818 
1819 	DPRINTF("hibernate: writing signature\n");
1820 	if (hibernate_write_signature(&hib)) {
1821 		DPRINTF("hibernate_write_signature failed\n");
1822 		return (1);
1823 	}
1824 
1825 	/* Allow the disk to settle */
1826 	delay(500000);
1827 
1828 	/*
1829 	 * Give the device-specific I/O function a notification that we're
1830 	 * done, and that it can clean up or shutdown as needed.
1831 	 */
1832 	hib.io_func(hib.dev, 0, (vaddr_t)NULL, 0, HIB_DONE, hib.io_page);
1833 	return (0);
1834 }
1835 
1836 int
1837 hibernate_alloc(void)
1838 {
1839 	KASSERT(global_piglet_va == 0);
1840 	KASSERT(hibernate_temp_page == 0);
1841 
1842 	pmap_activate(curproc);
1843 	pmap_kenter_pa(HIBERNATE_HIBALLOC_PAGE, HIBERNATE_HIBALLOC_PAGE,
1844 		PROT_READ | PROT_WRITE);
1845 
1846 	/* Allocate a piglet, store its addresses in the supplied globals */
1847 	if (uvm_pmr_alloc_piglet(&global_piglet_va, &global_piglet_pa,
1848 	    HIBERNATE_CHUNK_SIZE * 4, HIBERNATE_CHUNK_SIZE))
1849 		return (ENOMEM);
1850 
1851 	/*
1852 	 * Allocate VA for the temp page.
1853 	 *
1854 	 * This will become part of the suspended kernel and will
1855 	 * be freed in hibernate_free, upon resume (or hibernate
1856 	 * failure)
1857 	 */
1858 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1859 	    &kp_none, &kd_nowait);
1860 	if (!hibernate_temp_page) {
1861 		DPRINTF("out of memory allocating hibernate_temp_page\n");
1862 		return (ENOMEM);
1863 	}
1864 
1865 	return (0);
1866 }
1867 
1868 /*
1869  * Free items allocated by hibernate_alloc()
1870  */
1871 void
1872 hibernate_free(void)
1873 {
1874 	pmap_activate(curproc);
1875 
1876 	if (global_piglet_va)
1877 		uvm_pmr_free_piglet(global_piglet_va,
1878 		    4 * HIBERNATE_CHUNK_SIZE);
1879 
1880 	if (hibernate_temp_page) {
1881 		pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1882 		km_free((void *)hibernate_temp_page, PAGE_SIZE,
1883 		    &kv_any, &kp_none);
1884 	}
1885 
1886 	global_piglet_va = 0;
1887 	hibernate_temp_page = 0;
1888 	pmap_kremove(HIBERNATE_HIBALLOC_PAGE, PAGE_SIZE);
1889 	pmap_update(pmap_kernel());
1890 }
1891