xref: /openbsd-src/sys/kern/subr_hibernate.c (revision aa7ef211b75aafba4b58b1e7bc7173e89f84a39d)
1 /*	$OpenBSD: subr_hibernate.c,v 1.32 2011/11/29 05:21:08 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <machine/hibernate.h>
34 
35 struct hibernate_zlib_state *hibernate_state;
36 
37 /* Temporary vaddr ranges used during hibernate */
38 vaddr_t hibernate_temp_page;
39 vaddr_t hibernate_copy_page;
40 
41 /* Hibernate info as read from disk during resume */
42 union hibernate_info disk_hiber_info;
43 paddr_t global_pig_start;
44 vaddr_t global_piglet_va;
45 
46 /*
47  * Hib alloc enforced alignment.
48  */
49 #define HIB_ALIGN		8 /* bytes alignment */
50 
51 /*
52  * sizeof builtin operation, but with alignment constraint.
53  */
54 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
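/*
 * For example (illustrative): with HIB_ALIGN == 8, a 13-byte structure
 * yields HIB_SIZEOF() == 16, so entries and the data following them
 * always start on an 8-byte boundary.
 */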
55 
56 struct hiballoc_entry {
57 	size_t			hibe_use;
58 	size_t			hibe_space;
59 	RB_ENTRY(hiballoc_entry) hibe_entry;
60 };
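/*
 * Layout of a managed region (illustrative sketch): each hiballoc_entry
 * is immediately followed by the bytes it hands out and then by any
 * free tail it still owns:
 *
 *	[ hiballoc_entry ][ hibe_use bytes in use ][ hibe_space free bytes ]
 *	^                  ^
 *	entry              hib_entry_to_addr(entry)
 */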
61 
62 /*
63  * Compare hiballoc entries based on the address they manage.
64  *
65  * Since the address is at a fixed offset relative to struct hiballoc_entry,
66  * we just compare the hiballoc_entry pointers.
67  */
68 static __inline int
69 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
70 {
71 	return l < r ? -1 : (l > r);
72 }
73 
74 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
75 
76 /*
77  * Given a hiballoc entry, return the address it manages.
78  */
79 static __inline void *
80 hib_entry_to_addr(struct hiballoc_entry *entry)
81 {
82 	caddr_t addr;
83 
84 	addr = (caddr_t)entry;
85 	addr += HIB_SIZEOF(struct hiballoc_entry);
86 	return addr;
87 }
88 
89 /*
90  * Given an address, find the hiballoc_entry that corresponds.
91  */
92 static __inline struct hiballoc_entry*
93 hib_addr_to_entry(void *addr_param)
94 {
95 	caddr_t addr;
96 
97 	addr = (caddr_t)addr_param;
98 	addr -= HIB_SIZEOF(struct hiballoc_entry);
99 	return (struct hiballoc_entry*)addr;
100 }
101 
102 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
103 
104 /*
105  * Allocate memory from the arena.
106  *
107  * Returns NULL if no memory is available.
108  */
109 void *
110 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
111 {
112 	struct hiballoc_entry *entry, *new_entry;
113 	size_t find_sz;
114 
115 	/*
116 	 * Enforce alignment of HIB_ALIGN bytes.
117 	 *
118 	 * Note that, because the entry is put in front of the allocation,
119 	 * 0-byte allocations are guaranteed a unique address.
120 	 */
121 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
122 
123 	/*
124 	 * Find an entry with hibe_space >= find_sz.
125 	 *
126 	 * If the root node is not large enough, we switch to tree traversal.
127 	 * Because all entries are made at the bottom of the free space,
128 	 * traversal from the end has a slightly better chance of yielding
129 	 * a sufficiently large space.
130 	 */
131 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
132 	entry = RB_ROOT(&arena->hib_addrs);
133 	if (entry != NULL && entry->hibe_space < find_sz) {
134 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
135 			if (entry->hibe_space >= find_sz)
136 				break;
137 		}
138 	}
139 
140 	/*
141 	 * Insufficient or too fragmented memory.
142 	 */
143 	if (entry == NULL)
144 		return NULL;
145 
146 	/*
147 	 * Create new entry in allocated space.
148 	 */
149 	new_entry = (struct hiballoc_entry*)(
150 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
151 	new_entry->hibe_space = entry->hibe_space - find_sz;
152 	new_entry->hibe_use = alloc_sz;
153 
154 	/*
155 	 * Insert entry.
156 	 */
157 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
158 		panic("hib_alloc: insert failure");
159 	entry->hibe_space = 0;
160 
161 	/* Return address managed by entry. */
162 	return hib_entry_to_addr(new_entry);
163 }
164 
165 /*
166  * Free a pointer previously allocated from this arena.
167  *
168  * If addr is NULL, this will be silently accepted.
169  */
170 void
171 hib_free(struct hiballoc_arena *arena, void *addr)
172 {
173 	struct hiballoc_entry *entry, *prev;
174 
175 	if (addr == NULL)
176 		return;
177 
178 	/*
179 	 * Derive entry from addr and check it is really in this arena.
180 	 */
181 	entry = hib_addr_to_entry(addr);
182 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
183 		panic("hib_free: freed item %p not in hib arena", addr);
184 
185 	/*
186 	 * Give the space in entry to its predecessor.
187 	 *
188 	 * If entry has no predecessor, change its used space into free space
189 	 * instead.
190 	 */
191 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
192 	if (prev != NULL &&
193 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
194 	    prev->hibe_use + prev->hibe_space) == entry) {
195 		/* Merge entry. */
196 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
197 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
198 		    entry->hibe_use + entry->hibe_space;
199 	} else {
200 		/* Flip used memory to free space. */
201 		entry->hibe_space += entry->hibe_use;
202 		entry->hibe_use = 0;
203 	}
204 }
205 
206 /*
207  * Initialize hiballoc.
208  *
209  * The allocator will manage the len bytes of memory starting at ptr.
210  */
211 int
212 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
213 {
214 	struct hiballoc_entry *entry;
215 	caddr_t ptr;
216 	size_t len;
217 
218 	RB_INIT(&arena->hib_addrs);
219 
220 	/*
221 	 * Hib allocator enforces HIB_ALIGN alignment.
222 	 * Fixup ptr and len.
223 	 */
224 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
225 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
226 	len &= ~((size_t)HIB_ALIGN - 1);
227 
228 	/*
229 	 * Insufficient memory to be able to allocate and also do bookkeeping.
230 	 */
231 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
232 		return ENOMEM;
233 
234 	/*
235 	 * Create entry describing space.
236 	 */
237 	entry = (struct hiballoc_entry*)ptr;
238 	entry->hibe_use = 0;
239 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
240 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
241 
242 	return 0;
243 }
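/*
 * Example use of the arena (illustrative sketch; "scratch" and the sizes
 * are hypothetical):
 *
 *	struct hiballoc_arena arena;
 *	void *p;
 *
 *	if (hiballoc_init(&arena, scratch, scratch_len) == 0) {
 *		p = hib_alloc(&arena, 128);
 *		...
 *		hib_free(&arena, p);
 *	}
 *
 * This is how hibernate_zlib_reset (below) backs zlib's allocator with
 * piglet memory.
 */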
244 
245 /*
246  * Zero all free memory.
247  */
248 void
249 uvm_pmr_zero_everything(void)
250 {
251 	struct uvm_pmemrange	*pmr;
252 	struct vm_page		*pg;
253 	int			 i;
254 
255 	uvm_lock_fpageq();
256 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
257 		/* Zero single pages. */
258 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
259 		    != NULL) {
260 			uvm_pmr_remove(pmr, pg);
261 			uvm_pagezero(pg);
262 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
263 			uvmexp.zeropages++;
264 			uvm_pmr_insert(pmr, pg, 0);
265 		}
266 
267 		/* Zero multi page ranges. */
268 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
269 		    != NULL) {
270 			pg--; /* Size tree always has second page. */
271 			uvm_pmr_remove(pmr, pg);
272 			for (i = 0; i < pg->fpgsz; i++) {
273 				uvm_pagezero(&pg[i]);
274 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
275 				uvmexp.zeropages++;
276 			}
277 			uvm_pmr_insert(pmr, pg, 0);
278 		}
279 	}
280 	uvm_unlock_fpageq();
281 }
282 
283 /*
284  * Mark all memory as dirty.
285  *
286  * Used to inform the system that the clean memory isn't clean for some
287  * reason, for example because we just came back from hibernate.
288  */
289 void
290 uvm_pmr_dirty_everything(void)
291 {
292 	struct uvm_pmemrange	*pmr;
293 	struct vm_page		*pg;
294 	int			 i;
295 
296 	uvm_lock_fpageq();
297 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
298 		/* Dirty single pages. */
299 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
300 		    != NULL) {
301 			uvm_pmr_remove(pmr, pg);
302 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
303 			uvm_pmr_insert(pmr, pg, 0);
304 		}
305 
306 		/* Dirty multi page ranges. */
307 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
308 		    != NULL) {
309 			pg--; /* Size tree always has second page. */
310 			uvm_pmr_remove(pmr, pg);
311 			for (i = 0; i < pg->fpgsz; i++)
312 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
313 			uvm_pmr_insert(pmr, pg, 0);
314 		}
315 	}
316 
317 	uvmexp.zeropages = 0;
318 	uvm_unlock_fpageq();
319 }
320 
321 /*
322  * Allocate the highest address that can hold sz.
323  *
324  * sz in bytes.
325  */
326 int
327 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
328 {
329 	struct uvm_pmemrange	*pmr;
330 	struct vm_page		*pig_pg, *pg;
331 
332 	/*
333 	 * Convert sz to pages, since that is what pmemrange uses internally.
334 	 */
335 	sz = atop(round_page(sz));
336 
337 	uvm_lock_fpageq();
338 
339 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
340 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
341 			if (pig_pg->fpgsz >= sz) {
342 				goto found;
343 			}
344 		}
345 	}
346 
347 	/*
348 	 * Allocation failure.
349 	 */
350 	uvm_unlock_fpageq();
351 	return ENOMEM;
352 
353 found:
354 	/* Remove page from freelist. */
355 	uvm_pmr_remove_size(pmr, pig_pg);
356 	pig_pg->fpgsz -= sz;
357 	pg = pig_pg + pig_pg->fpgsz;
358 	if (pig_pg->fpgsz == 0)
359 		uvm_pmr_remove_addr(pmr, pig_pg);
360 	else
361 		uvm_pmr_insert_size(pmr, pig_pg);
362 
363 	uvmexp.free -= sz;
364 	*addr = VM_PAGE_TO_PHYS(pg);
365 
366 	/*
367 	 * Update pg flags.
368 	 *
369 	 * Note that we trash the sz argument now.
370 	 */
371 	while (sz > 0) {
372 		KASSERT(pg->pg_flags & PQ_FREE);
373 
374 		atomic_clearbits_int(&pg->pg_flags,
375 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
376 
377 		if (pg->pg_flags & PG_ZERO)
378 			uvmexp.zeropages -= sz;
379 		atomic_clearbits_int(&pg->pg_flags,
380 		    PG_ZERO|PQ_FREE);
381 
382 		pg->uobject = NULL;
383 		pg->uanon = NULL;
384 		pg->pg_version++;
385 
386 		/*
387 		 * Next.
388 		 */
389 		pg++;
390 		sz--;
391 	}
392 
393 	/* Return. */
394 	uvm_unlock_fpageq();
395 	return 0;
396 }
397 
398 /*
399  * Allocate a piglet area.
400  *
401  * This is as low as possible.
402  * Piglets are aligned.
403  *
404  * sz and align in bytes.
405  *
406  * The call may sleep, waiting for the pagedaemon to attempt to free memory.
407  * The pagedaemon may decide it is not possible to free enough memory, causing
408  * the allocation to fail.
409  */
410 int
411 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
412 {
413 	paddr_t			 pg_addr, piglet_addr;
414 	struct uvm_pmemrange	*pmr;
415 	struct vm_page		*pig_pg, *pg;
416 	struct pglist		 pageq;
417 	int			 pdaemon_woken;
418 	vaddr_t			 piglet_va;
419 
420 	KASSERT((align & (align - 1)) == 0);
421 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
422 
423 	/*
424 	 * Fixup arguments: align must be at least PAGE_SIZE,
425 	 * sz will be converted to pagecount, since that is what
426 	 * pmemrange uses internally.
427 	 */
428 	if (align < PAGE_SIZE)
429 		align = PAGE_SIZE;
430 	sz = round_page(sz);
431 
432 	uvm_lock_fpageq();
433 
434 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
435 	    pmr_use) {
436 retry:
437 		/*
438 		 * Search for a range with enough space.
439 		 * Use the address tree, to ensure the range is as low as
440 		 * possible.
441 		 */
442 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
443 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
444 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
445 
446 			if (atop(pg_addr) + pig_pg->fpgsz >=
447 			    atop(piglet_addr) + atop(sz))
448 				goto found;
449 		}
450 	}
451 
452 	/*
453 	 * Try to coerce the pagedaemon into freeing memory
454 	 * for the piglet.
455 	 *
456 	 * pdaemon_woken is set to prevent the code from
457 	 * falling into an endless loop.
458 	 */
459 	if (!pdaemon_woken) {
460 		pdaemon_woken = 1;
461 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
462 		    sz, UVM_PLA_FAILOK) == 0)
463 			goto retry;
464 	}
465 
466 	/* Return failure. */
467 	uvm_unlock_fpageq();
468 	return ENOMEM;
469 
470 found:
471 	/*
472 	 * Extract piglet from pigpen.
473 	 */
474 	TAILQ_INIT(&pageq);
475 	uvm_pmr_extract_range(pmr, pig_pg,
476 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
477 
478 	*pa = piglet_addr;
479 	uvmexp.free -= atop(sz);
480 
481 	/*
482 	 * Update pg flags.
483 	 *
484 	 * Note that we trash the sz argument now.
485 	 */
486 	TAILQ_FOREACH(pg, &pageq, pageq) {
487 		KASSERT(pg->pg_flags & PQ_FREE);
488 
489 		atomic_clearbits_int(&pg->pg_flags,
490 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
491 
492 		if (pg->pg_flags & PG_ZERO)
493 			uvmexp.zeropages--;
494 		atomic_clearbits_int(&pg->pg_flags,
495 		    PG_ZERO|PQ_FREE);
496 
497 		pg->uobject = NULL;
498 		pg->uanon = NULL;
499 		pg->pg_version++;
500 	}
501 
502 	uvm_unlock_fpageq();
503 
504 	/*
505 	 * Now allocate a va.
506 	 * Use direct mappings for the pages.
507 	 */
508 
509 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
510 	if (!piglet_va) {
511 		uvm_pglistfree(&pageq);
512 		return ENOMEM;
513 	}
514 
515 	/*
516 	 * Map piglet to va.
517 	 */
518 	TAILQ_FOREACH(pg, &pageq, pageq) {
519 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
520 		piglet_va += PAGE_SIZE;
521 	}
522 	pmap_update(pmap_kernel());
523 
524 	return 0;
525 }
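/*
 * For example, get_hibernate_info (below) requests a piglet of
 * HIBERNATE_CHUNK_SIZE * 3 bytes aligned to HIBERNATE_CHUNK_SIZE; the
 * layout of that area is described above hibernate_write_chunks and
 * hibernate_read_chunks.
 */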
526 
527 /*
528  * Free a piglet area.
529  */
530 void
531 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
532 {
533 	paddr_t			 pa;
534 	struct vm_page		*pg;
535 
536 	/*
537 	 * Fix parameters.
538 	 */
539 	sz = round_page(sz);
540 
541 	/*
542 	 * Find the first page in piglet.
543 	 * Since piglets are contiguous, the first pg is all we need.
544 	 */
545 	if (!pmap_extract(pmap_kernel(), va, &pa))
546 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
547 	pg = PHYS_TO_VM_PAGE(pa);
548 	if (pg == NULL)
549 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
550 
551 	/*
552 	 * Unmap.
553 	 */
554 	pmap_kremove(va, sz);
555 	pmap_update(pmap_kernel());
556 
557 	/*
558 	 * Free the physical and virtual memory.
559 	 */
560 	uvm_pmr_freepages(pg, atop(sz));
561 	km_free((void *)va, sz, &kv_any, &kp_none);
562 }
563 
564 /*
565  * Physmem RLE compression support.
566  *
567  * Given a physical page address, return the number of consecutive free
568  * pages starting at that address.  Clamps to a max of 255 pages.
569  * Returns 0 if the page at addr is not free.
570  */
571 u_char
572 uvm_page_rle(paddr_t addr)
573 {
574 	struct vm_page		*pg, *pg_end;
575 	struct vm_physseg	*vmp;
576 	int			 pseg_idx, off_idx;
577 
578 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
579 	if (pseg_idx == -1)
580 		return 0;
581 
582 	vmp = &vm_physmem[pseg_idx];
583 	pg = &vmp->pgs[off_idx];
584 	if (!(pg->pg_flags & PQ_FREE))
585 		return 0;
586 
587 	/*
588 	 * Search for the first non-free page after pg.
589 	 * Note that the page may not be the first page in a free pmemrange,
590 	 * therefore pg->fpgsz cannot be used.
591 	 */
592 	for (pg_end = pg; pg_end <= vmp->lastpg &&
593 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++)
594 		;
595 	return min(pg_end - pg, 255);
596 }
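/*
 * For instance (illustrative): if the three pages starting at addr are
 * free and the fourth is not, uvm_page_rle(addr) returns 3; if the page
 * at addr is in use it returns 0.  hibernate_write_chunks uses this to
 * emit a single RLE byte instead of compressing runs of free pages.
 */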
597 
598 /*
599  * Fills out the hibernate_info union pointed to by hiber_info
600  * with information about this machine (swap signature block
601  * offsets, number of memory ranges, kernel in use, etc)
602  */
603 int
604 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
605 {
606 	int chunktable_size;
607 	struct disklabel dl;
608 	char err_string[128], *dl_ret;
609 
610 	/* Determine I/O function to use */
611 	hiber_info->io_func = get_hibernate_io_function();
612 	if (hiber_info->io_func == NULL)
613 		return (1);
614 
615 	/* Calculate hibernate device */
616 	hiber_info->device = swdevt[0].sw_dev;
617 
618 	/* Read disklabel (used to calculate signature and image offsets) */
619 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
620 
621 	if (dl_ret) {
622 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
623 		return (1);
624 	}
625 
626 	hiber_info->secsize = dl.d_secsize;
627 
628 	/* Make sure the signature can fit in one block */
629 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
630 
631 	/* Calculate swap offset from start of disk */
632 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
633 
634 	/* Calculate signature block location */
635 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
636 	    dl.d_partitions[1].p_size -
637 	    sizeof(union hibernate_info)/hiber_info->secsize;
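	/*
	 * Resulting layout of the swap partition (illustrative sketch,
	 * derived from the offsets computed here and below):
	 *
	 *	[ swap ... | compressed image | chunk table | signature ]
	 *	                                               last block ^
	 */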
638 
639 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
640 
641 	/* Stash kernel version information */
642 	bzero(&hiber_info->kernel_version, 128);
643 	bcopy(version, &hiber_info->kernel_version,
644 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
645 
646 	if (suspend) {
647 		/* Allocate piglet region */
648 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
649 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
650 		    HIBERNATE_CHUNK_SIZE)) {
651 			printf("Hibernate failed to allocate the piglet\n");
652 			return (1);
653 		}
654 		hiber_info->io_page = (void *)hiber_info->piglet_va;
655 	} else {
656 		/*
657 		 * Resuming kernels use a regular I/O page since we won't
658 		 * have access to the suspended kernel's piglet VA at this
659 		 * point. No need to free this I/O page as it will vanish
660 		 * as part of the resume.
661 		 */
662 		hiber_info->io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
663 		if (!hiber_info->io_page)
664 			return (1);
665 	}
666 
667 
668 	/*
669 	 * Initialize the hibernate I/O function (for drivers that
670 	 * need it)
671 	 */
672 	if (hiber_info->io_func(hiber_info->device, 0,
673 	    (vaddr_t)NULL, 0, HIB_INIT, hiber_info->io_page))
674 		goto fail;
675 
676 	if (get_hibernate_info_md(hiber_info))
677 		goto fail;
678 
679 	/* Calculate memory image location */
680 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
681 	    dl.d_partitions[1].p_size -
682 	    (hiber_info->image_size / hiber_info->secsize) -
683 	    sizeof(union hibernate_info)/hiber_info->secsize -
684 	    chunktable_size;
685 
686 	return (0);
687 fail:
688 	if (suspend)
689 		uvm_pmr_free_piglet(hiber_info->piglet_va, HIBERNATE_CHUNK_SIZE*3);
690 
691 	return (1);
692 }
693 
694 /*
695  * Allocate nitems*size bytes from the hiballoc area presently in use
696  */
697 void *
698 hibernate_zlib_alloc(void *unused, int nitems, int size)
699 {
700 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
701 }
702 
703 /*
704  * Free the memory pointed to by addr in the hiballoc area presently in
705  * use
706  */
707 void
708 hibernate_zlib_free(void *unused, void *addr)
709 {
710 	hib_free(&hibernate_state->hiballoc_arena, addr);
711 }
712 
713 /*
714  * Inflate size bytes from src into dest, skipping any pages in
715  * [src..dest] that are special (see hibernate_inflate_skip)
716  *
717  * This function executes while using the resume-time stack
718  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
719  * will likely hang or reset the machine.
720  */
721 void
722 hibernate_inflate(union hibernate_info *hiber_info, paddr_t dest,
723     paddr_t src, size_t size)
724 {
725 	int i;
726 	u_char rle;
727 
728 	hibernate_state->hib_stream.next_in = (char *)src;
729 	hibernate_state->hib_stream.avail_in = size;
730 
731 	do {
732 		/* Flush cache and TLB */
733 		hibernate_flush();
734 
735 		/* Read RLE code */
736 		hibernate_state->hib_stream.next_out = (char *)&rle;
737 		hibernate_state->hib_stream.avail_out = sizeof(rle);
738 
739 		i = inflate(&hibernate_state->hib_stream, Z_FULL_FLUSH);
740 		if (i != Z_OK && i != Z_STREAM_END) {
741 			/*
742 			 * XXX - this will likely reboot/hang most machines,
743 			 *       but there's not much else we can do here.
744 			 */
745 			panic("inflate rle error");
746 		}
747 
748 		if (i == Z_STREAM_END)
749 			goto next_page;
750 
751 		/* Skip while RLE code is != 0 */
752 		while (rle != 0) {
753 			dest += (rle * PAGE_SIZE);
754 			hibernate_state->hib_stream.next_out = (char *)&rle;
755 			hibernate_state->hib_stream.avail_out = sizeof(rle);
756 
757 			i = inflate(&hibernate_state->hib_stream,
758 			    Z_FULL_FLUSH);
759 			if (i != Z_OK && i != Z_STREAM_END) {
760 				/*
761 				 * XXX - this will likely reboot/hang most
762 				 *       machines but there's not much else
763 				 *       we can do here.
764 				 */
765 				panic("inflate rle error 2");
766 			}
767 		}
768 
769 		if (i == Z_STREAM_END)
770 			goto next_page;
771 
772 		/*
773 		 * Is this a special page? If yes, redirect the
774 		 * inflate output to a scratch page (eg, discard it)
775 		 */
776 		if (hibernate_inflate_skip(hiber_info, dest))
777 			hibernate_enter_resume_mapping(
778 			    HIBERNATE_INFLATE_PAGE,
779 			    HIBERNATE_INFLATE_PAGE, 0);
780 		else
781 			hibernate_enter_resume_mapping(
782 			    HIBERNATE_INFLATE_PAGE, dest, 0);
783 
784 		hibernate_flush();
785 
786 		/* Set up the stream for inflate */
787 		hibernate_state->hib_stream.next_out =
788 		    (char *)HIBERNATE_INFLATE_PAGE;
789 		hibernate_state->hib_stream.avail_out = PAGE_SIZE;
790 
791 		/* Process next block of data */
792 		i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
793 		if (i != Z_OK && i != Z_STREAM_END) {
794 			/*
795 			 * XXX - this will likely reboot/hang most machines,
796 			 *       but there's not much else we can do here.
797 			 */
798 			panic("inflate error");
799 		}
800 
801 next_page:
802 		dest += PAGE_SIZE - hibernate_state->hib_stream.avail_out;
803 	} while (i != Z_STREAM_END);
804 }
805 
806 /*
807  * deflate from src into the I/O page, up to 'remaining' bytes
808  *
809  * Returns number of input bytes consumed, and may reset
810  * the 'remaining' parameter if not all the output space was consumed
811  * (this information is needed to know how much to write to disk).
812  */
813 size_t
814 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
815     size_t *remaining)
816 {
817 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
818 
819 	/* Set up the stream for deflate */
820 	hibernate_state->hib_stream.next_in = (caddr_t)src;
821 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
822 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
823 	    (PAGE_SIZE - *remaining);
824 	hibernate_state->hib_stream.avail_out = *remaining;
825 
826 	/* Process next block of data */
827 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
828 		panic("hibernate zlib deflate error");
829 
830 	/* Update pointers and return number of bytes consumed */
831 	*remaining = hibernate_state->hib_stream.avail_out;
832 	return (PAGE_SIZE - (src & PAGE_MASK)) -
833 	    hibernate_state->hib_stream.avail_in;
834 }
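/*
 * For example (illustrative): with *remaining == PAGE_SIZE on entry, if
 * deflate emits 100 bytes of compressed output, *remaining is reset to
 * PAGE_SIZE - 100 and the return value is the number of source bytes
 * consumed from the page containing src.
 */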
835 
836 /*
837  * Write the hibernation information specified in hiber_info
838  * to the location in swap previously calculated (last block of
839  * swap), called the "signature block".
843  */
844 int
845 hibernate_write_signature(union hibernate_info *hiber_info)
846 {
847 	/* Write hibernate info to disk */
848 	return (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
849 	    (vaddr_t)hiber_info, hiber_info->secsize, HIB_W,
850 	    hiber_info->io_page));
851 }
852 
853 /*
854  * Write the memory chunk table to the area in swap immediately
855  * preceding the signature block. The chunk table is stored
856  * in the piglet when this function is called.
857  */
858 int
859 hibernate_write_chunktable(union hibernate_info *hiber_info)
860 {
861 	struct hibernate_disk_chunk *chunks;
862 	vaddr_t hibernate_chunk_table_start;
863 	size_t hibernate_chunk_table_size;
864 	daddr_t chunkbase;
865 	int i;
866 
867 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
868 
869 	chunkbase = hiber_info->sig_offset -
870 	    (hibernate_chunk_table_size / hiber_info->secsize);
871 
872 	hibernate_chunk_table_start = hiber_info->piglet_va +
873 	    HIBERNATE_CHUNK_SIZE;
874 
875 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
876 	    HIBERNATE_CHUNK_SIZE);
877 
878 	/* Write chunk table */
879 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
880 		if (hiber_info->io_func(hiber_info->device,
881 		    chunkbase + (i/hiber_info->secsize),
882 		    (vaddr_t)(hibernate_chunk_table_start + i),
883 		    MAXPHYS, HIB_W, hiber_info->io_page))
884 			return (1);
885 	}
886 
887 	return (0);
888 }
889 
890 /*
891  * Write an empty hiber_info to the swap signature block, which is
892  * guaranteed to not match any valid hiber_info.
893  */
894 int
895 hibernate_clear_signature(void)
896 {
897 	union hibernate_info blank_hiber_info;
898 	union hibernate_info hiber_info;
899 
900 	/* Zero out a blank hiber_info */
901 	bzero(&blank_hiber_info, sizeof(blank_hiber_info));
902 
903 	if (get_hibernate_info(&hiber_info, 0))
904 		return (1);
905 
906 	/* Write (zeroed) hibernate info to disk */
907 	/* XXX - use regular kernel write routine for this */
908 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
909 	    (vaddr_t)&blank_hiber_info, hiber_info.secsize, HIB_W,
910 	    hiber_info.io_page))
911 		panic("error hibernate write 6");
912 
913 	return (0);
914 }
915 
916 /*
917  * Check chunk range overlap when calculating whether or not to copy a
918  * compressed chunk to the piglet area before decompressing.
919  *
920  * returns zero if the ranges do not overlap, non-zero otherwise.
921  */
922 int
923 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
924 {
925 	/* case A : end of r1 overlaps start of r2 */
926 	if (r1s < r2s && r1e > r2s)
927 		return (1);
928 
929 	/* case B : r1 entirely inside r2 */
930 	if (r1s >= r2s && r1e <= r2e)
931 		return (1);
932 
933 	/* case C : r2 entirely inside r1 */
934 	if (r2s >= r1s && r2e <= r1e)
935 		return (1);
936 
937 	/* case D : end of r2 overlaps start of r1 */
938 	if (r2s < r1s && r2e > r1s)
939 		return (1);
940 
941 	return (0);
942 }
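/*
 * For example, ranges (2, 5) and (4, 8) overlap, while (0, 2) and (2, 5)
 * merely touch and are not considered overlapping.
 */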
943 
944 /*
945  * Compare two hibernate_infos to determine if they are the same (i.e.,
946  * whether we should be performing a hibernate resume on this machine).
947  * Not all fields are checked - just enough to verify that the machine
948  * has the same memory configuration and kernel as the one that
949  * wrote the signature previously.
950  */
951 int
952 hibernate_compare_signature(union hibernate_info *mine,
953     union hibernate_info *disk)
954 {
955 	u_int i;
956 
957 	if (mine->nranges != disk->nranges)
958 		return (1);
959 
960 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
961 		return (1);
962 
963 	for (i = 0; i < mine->nranges; i++) {
964 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
965 		    (mine->ranges[i].end != disk->ranges[i].end) )
966 			return (1);
967 	}
968 
969 	return (0);
970 }
971 
972 /*
973  * Reads read_size bytes from the hibernate device specified in
974  * hib_info at offset blkctr. Output is placed into the vaddr specified
975  * at dest.
976  *
977  * Separate offsets and pages are used to handle misaligned reads (reads
978  * that span a page boundary).
979  *
980  * blkctr specifies a relative offset (relative to the start of swap),
981  * not an absolute disk offset
982  *
983  */
984 int
985 hibernate_read_block(union hibernate_info *hib_info, daddr_t blkctr,
986     size_t read_size, vaddr_t dest)
987 {
988 	struct buf *bp;
989 	struct bdevsw *bdsw;
990 	int error;
991 
992 	bp = geteblk(read_size);
993 	bdsw = &bdevsw[major(hib_info->device)];
994 
995 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
996 	if (error) {
997 		printf("hibernate_read_block open failed\n");
998 		return (1);
999 	}
1000 
1001 	bp->b_bcount = read_size;
1002 	bp->b_blkno = blkctr;
1003 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
1004 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
1005 	bp->b_dev = hib_info->device;
1006 	bp->b_cylinder = 0;
1007 	(*bdsw->d_strategy)(bp);
1008 
1009 	error = biowait(bp);
1010 	if (error) {
1011 		printf("hibernate_read_block biowait failed %d\n", error);
1012 		error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR,
1013 		    curproc);
1014 		if (error)
1015 			printf("hibernate_read_block error close failed\n");
1016 		return (1);
1017 	}
1018 
1019 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
1020 	if (error) {
1021 		printf("hibernate_read_block close failed\n");
1022 		return (1);
1023 	}
1024 
1025 	bcopy(bp->b_data, (caddr_t)dest, read_size);
1026 
1027 	bp->b_flags |= B_INVAL;
1028 	brelse(bp);
1029 
1030 	return (0);
1031 }
1032 
1033 /*
1034  * Reads the signature block from swap, checks against the current machine's
1035  * information. If the information matches, perform a resume by reading the
1036  * saved image into the pig area, and unpacking.
1037  */
1038 void
1039 hibernate_resume(void)
1040 {
1041 	union hibernate_info hiber_info;
1042 	int s;
1043 
1044 	/* Get current running machine's hibernate info */
1045 	bzero(&hiber_info, sizeof(hiber_info));
1046 	if (get_hibernate_info(&hiber_info, 0))
1047 		return;
1048 
1049 	/* Read hibernate info from disk */
1050 	s = splbio();
1051 
1052 	/* XXX use regular kernel read routine here */
1053 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
1054 	    (vaddr_t)&disk_hiber_info, hiber_info.secsize, HIB_R,
1055 	    hiber_info.io_page))
1056 		panic("error in hibernate read");
1057 
1058 	/*
1059 	 * If the on-disk and in-memory hibernate signatures do not match,
1060 	 * there is no hibernate image to resume, so just return.
1061 	 */
1062 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info))
1063 		return;
1064 
1065 	/* Read the image from disk into the image (pig) area */
1066 	if (hibernate_read_image(&disk_hiber_info))
1067 		goto fail;
1068 
1069 	/* Point of no return ... */
1070 
1071 	disable_intr();
1072 	cold = 1;
1073 
1074 	/* Switch stacks */
1075 	hibernate_switch_stack_machdep();
1076 
1077 	/*
1078 	 * Image is now in high memory (pig area), copy to correct location
1079 	 * in memory. We'll eventually end up copying on top of ourself, but
1080 	 * we are assured the kernel code here is the same between the
1081 	 * hibernated and resuming kernel, and we are running on our own
1082 	 * stack, so the overwrite is ok.
1083 	 */
1084 	hibernate_unpack_image(&disk_hiber_info);
1085 
1086 	/*
1087 	 * Resume the loaded kernel by jumping to the MD resume vector.
1088 	 * We won't be returning from this call.
1089 	 */
1090 	hibernate_resume_machdep();
1091 
1092 fail:
1093 	printf("Unable to resume hibernated image\n");
1094 }
1095 
1096 /*
1097  * Unpack image from pig area to original location by looping through the
1098  * list of output chunks in the order they should be restored (fchunks).
1099  * This ordering is used to avoid having inflate overwrite a chunk in the
1100  * middle of processing that chunk. This will, of course, happen during the
1101  * final output chunk, where we copy the chunk to the piglet area first,
1102  * before inflating.
1103  */
1104 void
1105 hibernate_unpack_image(union hibernate_info *hiber_info)
1106 {
1107 	struct hibernate_disk_chunk *chunks;
1108 	union hibernate_info local_hiber_info;
1109 	paddr_t image_cur = global_pig_start;
1110 	int *fchunks, i;
1111 	char *pva = (char *)hiber_info->piglet_va;
1112 
1113 	/* Mask off based on arch-specific piglet page size */
1114 	pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1115 	fchunks = (int *)(pva + (6 * PAGE_SIZE));
1116 
1117 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1118 
1119 	/* Can't use hiber_info that's passed in after here */
1120 	bcopy(hiber_info, &local_hiber_info, sizeof(union hibernate_info));
1121 
1122 	hibernate_state = (struct hibernate_zlib_state *)
1123 	    (pva + (7 * PAGE_SIZE));
1124 
1125 	hibernate_activate_resume_pt_machdep();
1126 
1127 	for (i = 0; i < local_hiber_info.chunk_ctr; i++) {
1128 		/* Reset zlib for inflate */
1129 		if (hibernate_zlib_reset(&local_hiber_info, 0) != Z_OK)
1130 			panic("hibernate failed to reset zlib for inflate");
1131 
1132 		/*
1133 		 * If there is a conflict, copy the chunk to the piglet area
1134 		 * before unpacking it to its original location.
1135 		 */
1136 		if ((chunks[fchunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1137 			hibernate_inflate(&local_hiber_info,
1138 			    chunks[fchunks[i]].base, image_cur,
1139 			    chunks[fchunks[i]].compressed_size);
1140 		else {
1141 			bcopy((caddr_t)image_cur,
1142 			    pva + (HIBERNATE_CHUNK_SIZE * 2),
1143 			    chunks[fchunks[i]].compressed_size);
1144 			hibernate_inflate(&local_hiber_info,
1145 			    chunks[fchunks[i]].base,
1146 			    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1147 			    chunks[fchunks[i]].compressed_size);
1148 		}
1149 		image_cur += chunks[fchunks[i]].compressed_size;
1150 	}
1151 }
1152 
1153 /*
1154  * Write a compressed version of this machine's memory to disk, at the
1155  * precalculated swap offset:
1156  *
1157  * end of swap - signature block size - chunk table size - memory size
1158  *
1159  * The function begins by looping through each phys mem range, cutting each
1160  * one into 4MB chunks. These chunks are then compressed individually
1161  * and written out to disk, in phys mem order. Some chunks might compress
1162  * more than others, and for this reason, each chunk's size is recorded
1163  * in the chunk table, which is written to disk after the image has
1164  * properly been compressed and written (in hibernate_write_chunktable).
1165  *
1166  * When this function is called, the machine is nearly suspended - most
1167  * devices are quiesced/suspended, interrupts are off, and cold has
1168  * been set. This means that there can be no side effects once the
1169  * write has started, and the write function itself can also have no
1170  * side effects.
1171  *
1172  * This function uses the piglet area during this process as follows:
1173  *
1174  * offset from piglet base	use
1175  * -----------------------	--------------------
1176  * 0				i/o allocation area
1177  * PAGE_SIZE			i/o write area
1178  * 2*PAGE_SIZE			temp/scratch page
1179  * 3*PAGE_SIZE			temp/scratch page
1180  * 7*PAGE_SIZE			hiballoc arena
1181  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1182  * ...
1183  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1184  *
1185  * Some transient piglet content is saved as part of deflate,
1186  * but it is irrelevant during resume as it will be repurposed
1187  * at that time for other things.
1188  */
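/*
 * Sketch of the compressed stream produced below and consumed by
 * hibernate_inflate (illustrative): in the uncompressed view of the
 * stream, each record begins with one RLE byte.  A non-zero RLE byte
 * means "that many free pages were skipped and are not stored"; a zero
 * RLE byte is followed by one full page of data.  hibernate_inflate
 * keeps reading RLE bytes, advancing dest by rle pages each time, until
 * it sees a zero, then inflates a single page into dest.
 */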
1189 int
1190 hibernate_write_chunks(union hibernate_info *hiber_info)
1191 {
1192 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1193 	size_t nblocks, out_remaining, used, offset = 0;
1194 	struct hibernate_disk_chunk *chunks;
1195 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1196 	daddr_t blkctr = hiber_info->image_offset;
1197 	int i;
1198 
1199 	hiber_info->chunk_ctr = 0;
1200 
1201 	/*
1202 	 * Allocate VA for the temp and copy page.
1203 	 * These will become part of the suspended kernel and will
1204 	 * be freed in hibernate_free, upon resume.
1205 	 */
1206 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1207 	    &kp_none, &kd_nowait);
1208 	if (!hibernate_temp_page)
1209 		return (1);
1210 
1211 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1212 	    &kp_none, &kd_nowait);
1213 	if (!hibernate_copy_page)
1214 		return (1);
1215 
1216 	pmap_kenter_pa(hibernate_copy_page,
1217 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1218 
1219 	/* XXX - not needed on all archs */
1220 	pmap_activate(curproc);
1221 
1222 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1223 	    HIBERNATE_CHUNK_SIZE);
1224 
1225 	/* Calculate the chunk regions */
1226 	for (i = 0; i < hiber_info->nranges; i++) {
1227 		range_base = hiber_info->ranges[i].base;
1228 		range_end = hiber_info->ranges[i].end;
1229 
1230 		inaddr = range_base;
1231 
1232 		while (inaddr < range_end) {
1233 			chunks[hiber_info->chunk_ctr].base = inaddr;
1234 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1235 				chunks[hiber_info->chunk_ctr].end = inaddr +
1236 				    HIBERNATE_CHUNK_SIZE;
1237 			else
1238 				chunks[hiber_info->chunk_ctr].end = range_end;
1239 
1240 			inaddr += HIBERNATE_CHUNK_SIZE;
1241 			hiber_info->chunk_ctr++;
1242 		}
1243 	}
1244 
1245 	/* Compress and write the chunks in the chunktable */
1246 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1247 		range_base = chunks[i].base;
1248 		range_end = chunks[i].end;
1249 
1250 		chunks[i].offset = blkctr;
1251 
1252 		/* Reset zlib for deflate */
1253 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1254 			return (1);
1255 
1256 		inaddr = range_base;
1257 
1258 		/*
1259 		 * For each chunk, loop through its phys mem region
1260 		 * and write out the compressed data (the last chunk
1261 		 * might be smaller than the chunk size).
1262 		 */
1263 		while (inaddr < range_end) {
1264 			out_remaining = PAGE_SIZE;
1265 			while (out_remaining > 0 && inaddr < range_end) {
1266 				u_char rle;
1267 
1268 				/*
1269 				 * Adjust for regions that are not evenly
1270 				 * divisible by PAGE_SIZE or overflowed
1271 				 * pages from the previous iteration.
1272 				 */
1273 				temp_inaddr = (inaddr & PAGE_MASK) +
1274 				    hibernate_copy_page;
1275 
1276 				rle = uvm_page_rle(inaddr);
1277 				while (rle != 0 && inaddr < range_end) {
1278 					hibernate_state->hib_stream.next_in =
1279 					    (char *)&rle;
1280 					hibernate_state->hib_stream.avail_in =
1281 					    sizeof(rle);
1282 					hibernate_state->hib_stream.next_out =
1283 					    (caddr_t)hibernate_io_page +
1284 					    (PAGE_SIZE - out_remaining);
1285 					hibernate_state->hib_stream.avail_out =
1286 					    out_remaining;
1287 
1288 					if (deflate(&hibernate_state->hib_stream,
1289 					    Z_PARTIAL_FLUSH) != Z_OK)
1290 						return (1);
1291 
1292 					out_remaining =
1293 					    hibernate_state->hib_stream.avail_out;
1294 					inaddr += (rle * PAGE_SIZE);
1295 					if (inaddr > range_end)
1296 						inaddr = range_end;
1297 					else
1298 						rle = uvm_page_rle(inaddr);
1299 				}
1300 
1301 				if (out_remaining == 0) {
1302 					/* Filled up the page */
1303 					nblocks = PAGE_SIZE / hiber_info->secsize;
1304 
1305 					if (hiber_info->io_func(hiber_info->device,
1306 					    blkctr, (vaddr_t)hibernate_io_page,
1307 					    PAGE_SIZE, HIB_W, hiber_info->io_page))
1308 						return (1);
1309 
1310 					blkctr += nblocks;
1311 					out_remaining = PAGE_SIZE;
1312 				}
1313 
1314 				/* Write '0' RLE code */
1315 				if (inaddr < range_end) {
1316 					hibernate_state->hib_stream.next_in =
1317 					    (char *)&rle;
1318 					hibernate_state->hib_stream.avail_in =
1319 					    sizeof(rle);
1320 					hibernate_state->hib_stream.next_out =
1321 				    	    (caddr_t)hibernate_io_page +
1322 					    (PAGE_SIZE - out_remaining);
1323 					hibernate_state->hib_stream.avail_out =
1324 					    out_remaining;
1325 
1326 					if (deflate(&hibernate_state->hib_stream,
1327 					    Z_PARTIAL_FLUSH) != Z_OK)
1328 						return (1);
1329 
1330 					out_remaining =
1331 					    hibernate_state->hib_stream.avail_out;
1332 				}
1333 
1334 				if (out_remaining == 0) {
1335 					/* Filled up the page */
1336 					nblocks = PAGE_SIZE / hiber_info->secsize;
1337 
1338 					if (hiber_info->io_func(hiber_info->device,
1339 					    blkctr, (vaddr_t)hibernate_io_page,
1340 					    PAGE_SIZE, HIB_W, hiber_info->io_page))
1341 						return (1);
1342 
1343 					blkctr += nblocks;
1344 					out_remaining = PAGE_SIZE;
1345 				}
1346 
1347 				/* Deflate from temp_inaddr to IO page */
1348 				if (inaddr != range_end) {
1349 					pmap_kenter_pa(hibernate_temp_page,
1350 					    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1351 
1352 					/* XXX - not needed on all archs */
1353 					pmap_activate(curproc);
1354 
1355 					bcopy((caddr_t)hibernate_temp_page,
1356 					    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1357 					inaddr += hibernate_deflate(hiber_info,
1358 					    temp_inaddr, &out_remaining);
1359 				}
1360 			}
1361 
1362 			if (out_remaining == 0) {
1363 				/* Filled up the page */
1364 				nblocks = PAGE_SIZE / hiber_info->secsize;
1365 
1366 				if (hiber_info->io_func(hiber_info->device,
1367 				    blkctr, (vaddr_t)hibernate_io_page,
1368 				    PAGE_SIZE, HIB_W, hiber_info->io_page))
1369 					return (1);
1370 
1371 				blkctr += nblocks;
1372 			}
1373 		}
1374 
1375 		if (inaddr != range_end)
1376 			return (1);
1377 
1378 		/*
1379 		 * End of range. Round up to the next secsize boundary
1380 		 * after finishing the compression of this chunk.
1381 		 */
1382 		if (out_remaining == 0)
1383 			out_remaining = PAGE_SIZE;
1384 
1385 		/* Finish compress */
1386 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1387 		hibernate_state->hib_stream.avail_in = 0;
1388 		hibernate_state->hib_stream.next_out =
1389 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1390 		hibernate_state->hib_stream.avail_out = out_remaining;
1391 
1392 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1393 		    Z_STREAM_END)
1394 			return (1);
1395 
1396 		out_remaining = hibernate_state->hib_stream.avail_out;
1397 
1398 		used = PAGE_SIZE - out_remaining;
1399 		nblocks = used / hiber_info->secsize;
1400 
1401 		/* Round up to next block if needed */
1402 		if (used % hiber_info->secsize != 0)
1403 			nblocks++;
1404 
1405 		/* Write final block(s) for this chunk */
1406 		if (hiber_info->io_func(hiber_info->device, blkctr,
1407 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1408 		    HIB_W, hiber_info->io_page))
1409 			return (1);
1410 
1411 		blkctr += nblocks;
1412 
1413 		offset = blkctr;
1414 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1415 		    hiber_info->secsize;
1416 	}
1417 
1418 	return (0);
1419 }
1420 
1421 /*
1422  * Reset the zlib stream state and allocate a new hiballoc area for either
1423  * inflate or deflate. This function is called once for each hibernate chunk.
1424  * Calling hiballoc_init multiple times is acceptable since the memory it is
1425  * provided is unmanaged memory (stolen). We use the memory provided to us
1426  * by the piglet allocated via the supplied hiber_info.
1427  */
1428 int
1429 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1430 {
1431 	vaddr_t hibernate_zlib_start;
1432 	size_t hibernate_zlib_size;
1433 	char *pva = (char *)hiber_info->piglet_va;
1434 
1435 	hibernate_state = (struct hibernate_zlib_state *)
1436 	    (pva + (7 * PAGE_SIZE));
1437 
1438 	hibernate_zlib_start = (vaddr_t)(pva + (8 * PAGE_SIZE));
1439 	hibernate_zlib_size = 80 * PAGE_SIZE;
1440 
1441 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1442 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1443 
1444 	/* Set up stream structure */
1445 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1446 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1447 
1448 	/* Initialize the hiballoc arena for zlib allocs/frees */
1449 	hiballoc_init(&hibernate_state->hiballoc_arena,
1450 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1451 
1452 	if (deflate) {
1453 		return deflateInit(&hibernate_state->hib_stream,
1454 		    Z_BEST_SPEED);
1455 	} else
1456 		return inflateInit(&hibernate_state->hib_stream);
1457 }
1458 
1459 /*
1460  * Reads the hibernated memory image from disk, whose location and
1461  * size are recorded in hiber_info. Begin by reading the persisted
1462  * chunk table, which records the original chunk placement location
1463  * and compressed size for each. Next, allocate a pig region of
1464  * sufficient size to hold the compressed image. Next, read the
1465  * chunks into the pig area (calling hibernate_read_chunks to do this),
1466  * and finally, if all of the above succeeds, clear the hibernate signature.
1467  * The function will then return to hibernate_resume, which will proceed
1468  * to unpack the pig image to the correct place in memory.
1469  */
1470 int
1471 hibernate_read_image(union hibernate_info *hiber_info)
1472 {
1473 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1474 	paddr_t image_start, image_end, pig_start, pig_end;
1475 	struct hibernate_disk_chunk *chunks;
1476 	daddr_t blkctr;
1477 	vaddr_t chunktable = (vaddr_t)NULL;
1478 	paddr_t piglet_chunktable = hiber_info->piglet_pa +
1479 	    HIBERNATE_CHUNK_SIZE;
1480 	int i;
1481 
1482 	/* Calculate total chunk table size in disk blocks */
1483 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1484 
1485 	blkctr = hiber_info->sig_offset - chunktable_size -
1486 			hiber_info->swap_offset;
1487 
1488 	chunktable = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1489 	    &kp_none, &kd_nowait);
1490 
1491 	if (!chunktable)
1492 		return (1);
1493 
1494 	/* Read the chunktable from disk into the piglet chunktable */
1495 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1496 	    i += PAGE_SIZE, blkctr += PAGE_SIZE/hiber_info->secsize) {
1497 		pmap_kenter_pa(chunktable + i, piglet_chunktable + i, VM_PROT_ALL);
1498 		hibernate_read_block(hiber_info, blkctr, PAGE_SIZE,
1499 		    chunktable + i);
1500 	}
1501 
1502 	blkctr = hiber_info->image_offset;
1503 	compressed_size = 0;
1504 	pmap_kenter_pa(chunktable, piglet_chunktable, VM_PROT_ALL);
1505 	chunks = (struct hibernate_disk_chunk *)chunktable;
1506 
1507 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1508 		compressed_size += chunks[i].compressed_size;
1509 
1510 	disk_size = compressed_size;
1511 
1512 	/* Allocate the pig area */
1513 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1514 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1515 		return (1);
1516 
1517 	pig_end = pig_start + pig_sz;
1518 
1519 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1520 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1521 	image_start = pig_start;
1522 
1523 	image_start = image_end - disk_size;
1524 
1525 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size,
1526 	    chunks);
1527 
1528 	/* Prepare the resume time pmap/page table */
1529 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1530 
1531 	/* Read complete, clear the signature and return */
1532 	return hibernate_clear_signature();
1533 }
1534 
1535 /*
1536  * Read the hibernated memory chunks from disk (chunk information at this
1537  * point is stored in the piglet) into the pig area specified by
1538  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1539  * only chunk with overlap possibilities.
1540  *
1541  * This function uses the piglet area during this process as follows:
1542  *
1543  * offset from piglet base	use
1544  * -----------------------	--------------------
1545  * 0				i/o allocation area
1546  * PAGE_SIZE			i/o write area
1547  * 2*PAGE_SIZE			temp/scratch page
1548  * 3*PAGE_SIZE			temp/scratch page
1549  * 4*PAGE_SIZE to 6*PAGE_SIZE	chunk ordering area
1550  * 7*PAGE_SIZE			hiballoc arena
1551  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1552  * ...
1553  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1554  */
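/*
 * Note (summary of the logic below): chunks whose load position in the
 * pig would overlap their original home are flagged
 * HIBERNATE_CHUNK_CONFLICT here; hibernate_unpack_image later copies
 * those chunks from the pig into the piglet before inflating them.
 */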
1555 int
1556 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1557     paddr_t pig_end, size_t image_compr_size,
1558     struct hibernate_disk_chunk *chunks)
1559 {
1560 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1561 	paddr_t copy_start, copy_end, piglet_cur;
1562 	paddr_t piglet_base = hib_info->piglet_pa;
1563 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1564 	daddr_t blkctr;
1565 	size_t processed, compressed_size, read_size;
1566 	int i, j, overlap, found, nchunks;
1567 	int nochunks = 0, nfchunks = 0, npchunks = 0;
1568 	int *ochunks, *pchunks, *fchunks;
1569 	vaddr_t tempva = (vaddr_t)NULL, hibernate_fchunk_area = (vaddr_t)NULL;
1570 
1571 	global_pig_start = pig_start;
1572 
1573 	/* XXX - don't need this on all archs */
1574 	pmap_activate(curproc);
1575 
1576 	/*
1577 	 * These mappings go into the resuming kernel's page table, and are
1578 	 * used only during image read. They disappear from existence
1579 	 * when the suspended kernel is unpacked on top of us.
1580 	 */
1581 	tempva = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any, &kp_none, &kd_nowait);
1582 	if (!tempva)
1583 		return (1);
1584 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1585 	    &kp_none, &kd_nowait);
1586 	if (!hibernate_fchunk_area)
1587 		return (1);
1588 
1589 	/* Temporary output chunk ordering */
1590 	ochunks = (int *)hibernate_fchunk_area;
1591 
1592 	/* Piglet chunk ordering */
1593 	pchunks = (int *)(hibernate_fchunk_area + PAGE_SIZE);
1594 
1595 	/* Final chunk ordering */
1596 	fchunks = (int *)(hibernate_fchunk_area + (2*PAGE_SIZE));
1597 
1598 	/* Map the chunk ordering region */
1599 	pmap_kenter_pa(hibernate_fchunk_area,
1600 	    piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1601 	pmap_kenter_pa((vaddr_t)pchunks, piglet_base + (5*PAGE_SIZE),
1602 	    VM_PROT_ALL);
1603 	pmap_kenter_pa((vaddr_t)fchunks, piglet_base + (6*PAGE_SIZE),
1604 	    VM_PROT_ALL);
1605 
1606 	nchunks = hib_info->chunk_ctr;
1607 
1608 	/* Initially start all chunks as unplaced */
1609 	for (i = 0; i < nchunks; i++)
1610 		chunks[i].flags = 0;
1611 
1612 	/*
1613 	 * Search the list for chunks that are outside the pig area. These
1614 	 * can be placed first in the final output list.
1615 	 */
1616 	for (i = 0; i < nchunks; i++) {
1617 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1618 			ochunks[nochunks] = (u_int8_t)i;
1619 			fchunks[nfchunks] = (u_int8_t)i;
1620 			nochunks++;
1621 			nfchunks++;
1622 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1623 		}
1624 	}
1625 
1626 	/*
1627 	 * Walk the ordering, place the chunks in ascending memory order.
1628 	 * Conflicts might arise, these are handled next.
1629 	 */
1630 	do {
1631 		img_index = -1;
1632 		found = 0;
1633 		j = -1;
1634 		for (i = 0; i < nchunks; i++)
1635 			if (chunks[i].base < img_index &&
1636 			    chunks[i].flags == 0 ) {
1637 				j = i;
1638 				img_index = chunks[i].base;
1639 			}
1640 
1641 		if (j != -1) {
1642 			found = 1;
1643 			ochunks[nochunks] = (short)j;
1644 			nochunks++;
1645 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1646 		}
1647 	} while (found);
1648 
1649 	img_index = pig_start;
1650 
1651 	/*
1652 	 * Identify chunk output conflicts (chunks whose pig load area
1653 	 * corresponds to their original memory placement location)
1654 	 */
1655 	for (i = 0; i < nochunks ; i++) {
1656 		overlap = 0;
1657 		r1s = img_index;
1658 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1659 		r2s = chunks[ochunks[i]].base;
1660 		r2e = chunks[ochunks[i]].end;
1661 
1662 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1663 		if (overlap)
1664 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1665 		img_index += chunks[ochunks[i]].compressed_size;
1666 	}
1667 
1668 	/*
1669 	 * Prepare the final output chunk list. Calculate an output
1670 	 * inflate strategy for overlapping chunks if needed.
1671 	 */
1672 	img_index = pig_start;
1673 	for (i = 0; i < nochunks ; i++) {
1674 		/*
1675 		 * If a conflict is detected, consume enough compressed
1676 		 * output chunks to fill the piglet
1677 		 */
1678 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1679 			copy_start = piglet_base;
1680 			copy_end = piglet_end;
1681 			piglet_cur = piglet_base;
1682 			npchunks = 0;
1683 			j = i;
1684 			while (copy_start < copy_end && j < nochunks) {
1685 				piglet_cur += chunks[ochunks[j]].compressed_size;
1686 				pchunks[npchunks] = ochunks[j];
1687 				npchunks++;
1688 				copy_start += chunks[ochunks[j]].compressed_size;
1689 				img_index += chunks[ochunks[j]].compressed_size;
1690 				i++;
1691 				j++;
1692 			}
1693 
1694 			piglet_cur = piglet_base;
1695 			for (j = 0; j < npchunks; j++) {
1696 				piglet_cur += chunks[pchunks[j]].compressed_size;
1697 				fchunks[nfchunks] = pchunks[j];
1698 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1699 				nfchunks++;
1700 			}
1701 		} else {
1702 			/*
1703 			 * No conflict, chunk can be added without copying
1704 			 */
1705 			if ((chunks[ochunks[i]].flags &
1706 			    HIBERNATE_CHUNK_USED) == 0) {
1707 				fchunks[nfchunks] = ochunks[i];
1708 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1709 				nfchunks++;
1710 			}
1711 			img_index += chunks[ochunks[i]].compressed_size;
1712 		}
1713 	}
1714 
1715 	img_index = pig_start;
1716 	for (i = 0; i < nfchunks; i++) {
1717 		piglet_cur = piglet_base;
1718 		img_index += chunks[fchunks[i]].compressed_size;
1719 	}
1720 
1721 	img_cur = pig_start;
1722 
1723 	for (i = 0; i < nfchunks; i++) {
1724 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1725 		processed = 0;
1726 		compressed_size = chunks[fchunks[i]].compressed_size;
1727 
1728 		while (processed < compressed_size) {
1729 			pmap_kenter_pa(tempva, img_cur, VM_PROT_ALL);
1730 			pmap_kenter_pa(tempva + PAGE_SIZE, img_cur+PAGE_SIZE,
1731 			    VM_PROT_ALL);
1732 
1733 			/* XXX - not needed on all archs */
1734 			pmap_activate(curproc);
1735 			if (compressed_size - processed >= PAGE_SIZE)
1736 				read_size = PAGE_SIZE;
1737 			else
1738 				read_size = compressed_size - processed;
1739 
1740 			hibernate_read_block(hib_info, blkctr, read_size,
1741 			    tempva + (img_cur & PAGE_MASK));
1742 
1743 			blkctr += (read_size / hib_info->secsize);
1744 
1745 			hibernate_flush();
1746 			pmap_kremove(tempva, PAGE_SIZE);
1747 			pmap_kremove(tempva + PAGE_SIZE, PAGE_SIZE);
1748 			processed += read_size;
1749 			img_cur += read_size;
1750 		}
1751 	}
1752 
1753 	return (0);
1754 }
1755 
1756 /*
1757  * Hibernating a machine comprises the following operations:
1758  *  1. Calculating this machine's hibernate_info information
1759  *  2. Allocating a piglet and saving the piglet's physaddr
1760  *  3. Calculating the memory chunks
1761  *  4. Writing the compressed chunks to disk
1762  *  5. Writing the chunk table
1763  *  6. Writing the signature block (hibernate_info)
1764  *
1765  * On most architectures, the function calling hibernate_suspend would
1766  * then power off the machine using some MD-specific implementation.
1767  */
1768 int
1769 hibernate_suspend(void)
1770 {
1771 	union hibernate_info hib_info;
1772 
1773 	/*
1774 	 * Calculate memory ranges, swap offsets, etc.
1775 	 * This also allocates a piglet whose physaddr is stored in
1776 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1777 	 */
1778 	if (get_hibernate_info(&hib_info, 1))
1779 		return (1);
1780 
1781 	global_piglet_va = hib_info.piglet_va;
1782 
1783 	if (hibernate_write_chunks(&hib_info))
1784 		return (1);
1785 
1786 	if (hibernate_write_chunktable(&hib_info))
1787 		return (1);
1788 
1789 	if (hibernate_write_signature(&hib_info))
1790 		return (1);
1791 
1792 	delay(500000);
1793 	return (0);
1794 }
1795 
1796 /*
1797  * Free items allocated during hibernate
1798  */
1799 void
1800 hibernate_free(void)
1801 {
1802 	uvm_pmr_free_piglet(global_piglet_va, 3*HIBERNATE_CHUNK_SIZE);
1803 
1804 	pmap_kremove(hibernate_copy_page, PAGE_SIZE);
1805 	pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1806 	pmap_update(pmap_kernel());
1807 
1808 	km_free((void *)hibernate_copy_page, PAGE_SIZE, &kv_any, &kp_none);
1809 	km_free((void *)hibernate_temp_page, PAGE_SIZE, &kv_any, &kp_none);
1810 }
1811