xref: /openbsd-src/sys/kern/subr_hibernate.c (revision e33b04c70ad42be79427c0cdbab1a17a49cb450e)
1 /*	$OpenBSD: subr_hibernate.c,v 1.18 2011/09/22 22:12:45 deraadt Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <machine/hibernate.h>
34 
35 struct hibernate_zlib_state *hibernate_state;
36 
37 /* Temporary vaddr ranges used during hibernate */
38 vaddr_t hibernate_temp_page;
39 vaddr_t hibernate_copy_page;
40 vaddr_t hibernate_stack_page;
41 vaddr_t hibernate_fchunk_area;
42 vaddr_t hibernate_chunktable_area;
43 vaddr_t hibernate_inflate_page;
44 
45 /* Hibernate info as read from disk during resume */
46 union hibernate_info disk_hiber_info;
47 
48 /*
49  * Hib alloc enforced alignment.
50  */
51 #define HIB_ALIGN		8 /* bytes alignment */
52 
53 /*
54  * sizeof builtin operation, but with alignment constraint.
55  */
56 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
57 
58 struct hiballoc_entry {
59 	size_t			hibe_use;
60 	size_t			hibe_space;
61 	RB_ENTRY(hiballoc_entry) hibe_entry;
62 };
63 
64 /*
65  * Compare hiballoc entries based on the address they manage.
66  *
67  * Since the managed address lies at a fixed offset from the start of
68  * struct hiballoc_entry, we can simply compare the hiballoc_entry pointers.
69  */
70 static __inline int
71 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
72 {
73 	return l < r ? -1 : (l > r);
74 }
75 
76 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
77 
78 /*
79  * Given a hiballoc entry, return the address it manages.
80  */
81 static __inline void *
82 hib_entry_to_addr(struct hiballoc_entry *entry)
83 {
84 	caddr_t addr;
85 
86 	addr = (caddr_t)entry;
87 	addr += HIB_SIZEOF(struct hiballoc_entry);
88 	return addr;
89 }
90 
91 /*
92  * Given an address, find the hiballoc_entry that manages it.
93  */
94 static __inline struct hiballoc_entry*
95 hib_addr_to_entry(void *addr_param)
96 {
97 	caddr_t addr;
98 
99 	addr = (caddr_t)addr_param;
100 	addr -= HIB_SIZEOF(struct hiballoc_entry);
101 	return (struct hiballoc_entry*)addr;
102 }
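
/*
 * Rough layout of a single hiballoc allocation (a sketch; the exact
 * header size is HIB_SIZEOF(struct hiballoc_entry)):
 *
 *	entry                      hib_entry_to_addr(entry)
 *	|                          |
 *	v                          v
 *	+--------------------------+-----------------+------------------+
 *	| struct hiballoc_entry    | hibe_use bytes   | hibe_space bytes |
 *	| (HIB_SIZEOF-rounded)     | (in use)         | (free)           |
 *	+--------------------------+-----------------+------------------+
 *
 * hib_entry_to_addr() and hib_addr_to_entry() simply step over this
 * rounded-up header in opposite directions.
 */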
103 
104 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
105 
106 /*
107  * Allocate memory from the arena.
108  *
109  * Returns NULL if no memory is available.
110  */
111 void *
112 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
113 {
114 	struct hiballoc_entry *entry, *new_entry;
115 	size_t find_sz;
116 
117 	/*
118 	 * Enforce alignment of HIB_ALIGN bytes.
119 	 *
120 	 * Note that, because the entry is put in front of the allocation,
121 	 * 0-byte allocations are guaranteed a unique address.
122 	 */
123 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
124 
125 	/*
126 	 * Find an entry with hibe_space >= find_sz.
127 	 *
128 	 * If the root node is not large enough, we switch to tree traversal.
129 	 * Because all entries are made at the bottom of the free space,
130 	 * traversal from the end has a slightly better chance of yielding
131 	 * a sufficiently large space.
132 	 */
133 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
134 	entry = RB_ROOT(&arena->hib_addrs);
135 	if (entry != NULL && entry->hibe_space < find_sz) {
136 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
137 			if (entry->hibe_space >= find_sz)
138 				break;
139 		}
140 	}
141 
142 	/*
143 	 * Insufficient or too fragmented memory.
144 	 */
145 	if (entry == NULL)
146 		return NULL;
147 
148 	/*
149 	 * Create new entry in allocated space.
150 	 */
151 	new_entry = (struct hiballoc_entry*)(
152 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
153 	new_entry->hibe_space = entry->hibe_space - find_sz;
154 	new_entry->hibe_use = alloc_sz;
155 
156 	/*
157 	 * Insert entry.
158 	 */
159 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
160 		panic("hib_alloc: insert failure");
161 	entry->hibe_space = 0;
162 
163 	/* Return address managed by entry. */
164 	return hib_entry_to_addr(new_entry);
165 }
166 
167 /*
168  * Free a pointer previously allocated from this arena.
169  *
170  * If addr is NULL, this will be silently accepted.
171  */
172 void
173 hib_free(struct hiballoc_arena *arena, void *addr)
174 {
175 	struct hiballoc_entry *entry, *prev;
176 
177 	if (addr == NULL)
178 		return;
179 
180 	/*
181 	 * Derive entry from addr and check it is really in this arena.
182 	 */
183 	entry = hib_addr_to_entry(addr);
184 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
185 		panic("hib_free: freed item %p not in hib arena", addr);
186 
187 	/*
188 	 * Give the space in entry to its predecessor.
189 	 *
190 	 * If entry has no predecessor, change its used space into free space
191 	 * instead.
192 	 */
193 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
194 	if (prev != NULL &&
195 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
196 	    prev->hibe_use + prev->hibe_space) == entry) {
197 		/* Merge entry. */
198 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
199 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
200 		    entry->hibe_use + entry->hibe_space;
201 	} else {
202 		/* Flip used memory to free space. */
203 		entry->hibe_space += entry->hibe_use;
204 		entry->hibe_use = 0;
205 	}
206 }
207 
208 /*
209  * Initialize hiballoc.
210  *
211  * The allocator will manage the memory at ptr, which is len bytes long.
212  */
213 int
214 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
215 {
216 	struct hiballoc_entry *entry;
217 	caddr_t ptr;
218 	size_t len;
219 
220 	RB_INIT(&arena->hib_addrs);
221 
222 	/*
223 	 * Hib allocator enforces HIB_ALIGN alignment.
224 	 * Fixup ptr and len.
225 	 */
226 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
227 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
228 	len &= ~((size_t)HIB_ALIGN - 1);
229 
230 	/*
231 	 * Insufficient memory to be able to allocate and also do bookkeeping.
232 	 */
233 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
234 		return ENOMEM;
235 
236 	/*
237 	 * Create entry describing space.
238 	 */
239 	entry = (struct hiballoc_entry*)ptr;
240 	entry->hibe_use = 0;
241 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
242 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
243 
244 	return 0;
245 }
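
/*
 * Typical use of the hiballoc arena, shown here only as a sketch (the
 * real consumer is the zlib glue further down in this file). 'buf' and
 * 'buflen' stand for any caller-provided unmanaged memory, e.g. a piece
 * of the piglet:
 *
 *	struct hiballoc_arena arena;
 *	void *p;
 *
 *	if (hiballoc_init(&arena, buf, buflen) != 0)
 *		return (ENOMEM);	(arena too small for bookkeeping)
 *	p = hib_alloc(&arena, 128);
 *	if (p == NULL)
 *		return (ENOMEM);	(arena full or too fragmented)
 *	...
 *	hib_free(&arena, p);
 */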
246 
247 /*
248  * Zero all free memory.
249  */
250 void
251 uvm_pmr_zero_everything(void)
252 {
253 	struct uvm_pmemrange	*pmr;
254 	struct vm_page		*pg;
255 	int			 i;
256 
257 	uvm_lock_fpageq();
258 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
259 		/* Zero single pages. */
260 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
261 		    != NULL) {
262 			uvm_pmr_remove(pmr, pg);
263 			uvm_pagezero(pg);
264 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
265 			uvmexp.zeropages++;
266 			uvm_pmr_insert(pmr, pg, 0);
267 		}
268 
269 		/* Zero multi page ranges. */
270 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
271 		    != NULL) {
272 			pg--; /* Size tree always has second page. */
273 			uvm_pmr_remove(pmr, pg);
274 			for (i = 0; i < pg->fpgsz; i++) {
275 				uvm_pagezero(&pg[i]);
276 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
277 				uvmexp.zeropages++;
278 			}
279 			uvm_pmr_insert(pmr, pg, 0);
280 		}
281 	}
282 	uvm_unlock_fpageq();
283 }
284 
285 /*
286  * Mark all memory as dirty.
287  *
288  * Used to inform the system that the clean memory isn't clean for some
289  * reason, for example because we just came back from hibernate.
290  */
291 void
292 uvm_pmr_dirty_everything(void)
293 {
294 	struct uvm_pmemrange	*pmr;
295 	struct vm_page		*pg;
296 	int			 i;
297 
298 	uvm_lock_fpageq();
299 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
300 		/* Dirty single pages. */
301 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
302 		    != NULL) {
303 			uvm_pmr_remove(pmr, pg);
304 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
305 			uvm_pmr_insert(pmr, pg, 0);
306 		}
307 
308 		/* Dirty multi page ranges. */
309 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
310 		    != NULL) {
311 			pg--; /* Size tree always has second page. */
312 			uvm_pmr_remove(pmr, pg);
313 			for (i = 0; i < pg->fpgsz; i++)
314 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
315 			uvm_pmr_insert(pmr, pg, 0);
316 		}
317 	}
318 
319 	uvmexp.zeropages = 0;
320 	uvm_unlock_fpageq();
321 }
322 
323 /*
324  * Allocate the highest address that can hold sz.
325  *
326  * sz in bytes.
327  */
328 int
329 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
330 {
331 	struct uvm_pmemrange	*pmr;
332 	struct vm_page		*pig_pg, *pg;
333 
334 	/*
335 	 * Convert sz to pages, since that is what pmemrange uses internally.
336 	 */
337 	sz = atop(round_page(sz));
338 
339 	uvm_lock_fpageq();
340 
341 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
342 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
343 			if (pig_pg->fpgsz >= sz) {
344 				goto found;
345 			}
346 		}
347 	}
348 
349 	/*
350 	 * Allocation failure.
351 	 */
352 	uvm_unlock_fpageq();
353 	return ENOMEM;
354 
355 found:
356 	/* Remove page from freelist. */
357 	uvm_pmr_remove_size(pmr, pig_pg);
358 	pig_pg->fpgsz -= sz;
359 	pg = pig_pg + pig_pg->fpgsz;
360 	if (pig_pg->fpgsz == 0)
361 		uvm_pmr_remove_addr(pmr, pig_pg);
362 	else
363 		uvm_pmr_insert_size(pmr, pig_pg);
364 
365 	uvmexp.free -= sz;
366 	*addr = VM_PAGE_TO_PHYS(pg);
367 
368 	/*
369 	 * Update pg flags.
370 	 *
371 	 * Note that we trash the sz argument now.
372 	 */
373 	while (sz > 0) {
374 		KASSERT(pg->pg_flags & PQ_FREE);
375 
376 		atomic_clearbits_int(&pg->pg_flags,
377 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
378 
379 		if (pg->pg_flags & PG_ZERO)
380 			uvmexp.zeropages--;
381 		atomic_clearbits_int(&pg->pg_flags,
382 		    PG_ZERO|PQ_FREE);
383 
384 		pg->uobject = NULL;
385 		pg->uanon = NULL;
386 		pg->pg_version++;
387 
388 		/*
389 		 * Next.
390 		 */
391 		pg++;
392 		sz--;
393 	}
394 
395 	/* Return. */
396 	uvm_unlock_fpageq();
397 	return 0;
398 }
399 
400 /*
401  * Allocate a piglet area.
402  *
403  * This is as low as possible.
404  * Piglets are aligned.
405  *
406  * sz and align in bytes.
407  *
408  * The call may sleep while the pagedaemon attempts to free memory.
409  * The pagedaemon may decide it is not possible to free enough memory, causing
410  * the allocation to fail.
411  */
412 int
413 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
414 {
415 	paddr_t			 pg_addr, piglet_addr;
416 	struct uvm_pmemrange	*pmr;
417 	struct vm_page		*pig_pg, *pg;
418 	struct pglist		 pageq;
419 	int			 pdaemon_woken;
420 	vaddr_t			 piglet_va;
421 
422 	KASSERT((align & (align - 1)) == 0);
423 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
424 
425 	/*
426 	 * Fixup arguments: align must be at least PAGE_SIZE,
427 	 * sz will be converted to pagecount, since that is what
428 	 * pmemrange uses internally.
429 	 */
430 	if (align < PAGE_SIZE)
431 		align = PAGE_SIZE;
432 	sz = round_page(sz);
433 
434 	uvm_lock_fpageq();
435 
436 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
437 	    pmr_use) {
438 retry:
439 		/*
440 		 * Search for a range with enough space.
441 		 * Use the address tree, to ensure the range is as low as
442 		 * possible.
443 		 */
444 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
445 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
446 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
447 
448 			if (atop(pg_addr) + pig_pg->fpgsz >=
449 			    atop(piglet_addr) + atop(sz))
450 				goto found;
451 		}
452 	}
453 
454 	/*
455 	 * Try to coerce the pagedaemon into freeing memory
456 	 * for the piglet.
457 	 *
458 	 * pdaemon_woken is set to prevent the code from
459 	 * falling into an endless loop.
460 	 */
461 	if (!pdaemon_woken) {
462 		pdaemon_woken = 1;
463 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
464 		    sz, UVM_PLA_FAILOK) == 0)
465 			goto retry;
466 	}
467 
468 	/* Return failure. */
469 	uvm_unlock_fpageq();
470 	return ENOMEM;
471 
472 found:
473 	/*
474 	 * Extract piglet from pigpen.
475 	 */
476 	TAILQ_INIT(&pageq);
477 	uvm_pmr_extract_range(pmr, pig_pg,
478 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
479 
480 	*pa = piglet_addr;
481 	uvmexp.free -= atop(sz);
482 
483 	/*
484 	 * Update pg flags.
485 	 *
486 	 * Note that we trash the sz argument now.
487 	 */
488 	TAILQ_FOREACH(pg, &pageq, pageq) {
489 		KASSERT(pg->pg_flags & PQ_FREE);
490 
491 		atomic_clearbits_int(&pg->pg_flags,
492 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
493 
494 		if (pg->pg_flags & PG_ZERO)
495 			uvmexp.zeropages--;
496 		atomic_clearbits_int(&pg->pg_flags,
497 		    PG_ZERO|PQ_FREE);
498 
499 		pg->uobject = NULL;
500 		pg->uanon = NULL;
501 		pg->pg_version++;
502 	}
503 
504 	uvm_unlock_fpageq();
505 
506 	/*
507 	 * Now allocate a va.
508 	 * Use direct mappings for the pages.
509 	 */
510 
511 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
512 	if (!piglet_va) {
513 		uvm_pglistfree(&pageq);
514 		return ENOMEM;
515 	}
516 
517 	/*
518 	 * Map piglet to va.
519 	 */
520 	TAILQ_FOREACH(pg, &pageq, pageq) {
521 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
522 		piglet_va += PAGE_SIZE;
523 	}
524 	pmap_update(pmap_kernel());
525 
526 	return 0;
527 }
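
/*
 * Example of the piglet alignment math above (the addresses are made up):
 * with align = HIBERNATE_CHUNK_SIZE (4MB) and a free range starting at
 * pg_addr = 0x00543000, piglet_addr is rounded up to 0x00800000, and the
 * range only qualifies if it still covers piglet_addr + sz.
 */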
528 
529 /*
530  * Free a piglet area.
531  */
532 void
533 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
534 {
535 	paddr_t			 pa;
536 	struct vm_page		*pg;
537 
538 	/*
539 	 * Fix parameters.
540 	 */
541 	sz = round_page(sz);
542 
543 	/*
544 	 * Find the first page in piglet.
545 	 * Since piglets are contiguous, the first pg is all we need.
546 	 */
547 	if (!pmap_extract(pmap_kernel(), va, &pa))
548 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
549 	pg = PHYS_TO_VM_PAGE(pa);
550 	if (pg == NULL)
551 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
552 
553 	/*
554 	 * Unmap.
555 	 */
556 	pmap_kremove(va, sz);
557 	pmap_update(pmap_kernel());
558 
559 	/*
560 	 * Free the physical and virtual memory.
561 	 */
562 	uvm_pmr_freepages(pg, atop(sz));
563 	km_free((void *)va, sz, &kv_any, &kp_none);
564 }
565 
566 /*
567  * Physmem RLE compression support.
568  *
569  * Given a physical page address, return the number of contiguous free
570  * pages starting at that address.
571  * Returns 0 if the page at addr is not free.
572  */
573 psize_t
574 uvm_page_rle(paddr_t addr)
575 {
576 	struct vm_page		*pg, *pg_end;
577 	struct vm_physseg	*vmp;
578 	int			 pseg_idx, off_idx;
579 
580 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
581 	if (pseg_idx == -1)
582 		return 0;
583 
584 	vmp = &vm_physmem[pseg_idx];
585 	pg = &vmp->pgs[off_idx];
586 	if (!(pg->pg_flags & PQ_FREE))
587 		return 0;
588 
589 	/*
590 	 * Search for the first non-free page after pg.
591 	 * Note that the page may not be the first page in a free pmemrange,
592 	 * therefore pg->fpgsz cannot be used.
593 	 */
594 	for (pg_end = pg; pg_end <= vmp->lastpg &&
595 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++);
596 	return pg_end - pg;
597 }
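
/*
 * Example (a sketch only): an RLE encoder of free memory could walk
 * physical memory and skip free runs in one step:
 *
 *	psize_t n;
 *
 *	if ((n = uvm_page_rle(addr)) != 0)
 *		addr += ptoa(n);	(skip n free pages)
 *	else
 *		addr += PAGE_SIZE;	(page is in use, handle it)
 */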
598 
599 /*
600  * Fills out the hibernate_info union pointed to by hiber_info
601  * with information about this machine (swap signature block
602  * offsets, number of memory ranges, kernel in use, etc)
603  * offsets, number of memory ranges, kernel in use, etc).
604 int
605 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
606 {
607 	int chunktable_size;
608 	struct disklabel dl;
609 	char err_string[128], *dl_ret;
610 
611 	/* Determine I/O function to use */
612 	hiber_info->io_func = get_hibernate_io_function();
613 	if (hiber_info->io_func == NULL)
614 		return (1);
615 
616 	/* Calculate hibernate device */
617 	hiber_info->device = swdevt[0].sw_dev;
618 
619 	/* Read disklabel (used to calculate signature and image offsets) */
620 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
621 
622 	if (dl_ret) {
623 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
624 		return (1);
625 	}
626 
627 	hiber_info->secsize = dl.d_secsize;
628 
629 	/* Make sure the signature can fit in one block */
630 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
631 
632 	/* Calculate swap offset from start of disk */
633 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
634 
635 	/* Calculate signature block location */
636 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
637 	    dl.d_partitions[1].p_size -
638 	    sizeof(union hibernate_info)/hiber_info->secsize;
639 
640 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
641 
642 	/* Stash kernel version information */
643 	bzero(&hiber_info->kernel_version, 128);
644 	bcopy(version, &hiber_info->kernel_version,
645 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
646 
647 	if (suspend) {
648 		/* Allocate piglet region */
649 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
650 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
651 		    HIBERNATE_CHUNK_SIZE)) {
652 			printf("Hibernate failed to allocate the piglet\n");
653 			return (1);
654 		}
655 	}
656 
657 	if (get_hibernate_info_md(hiber_info))
658 		return (1);
659 
660 	/* Calculate memory image location */
661 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
662 	    dl.d_partitions[1].p_size -
663 	    (hiber_info->image_size / hiber_info->secsize) -
664 	    sizeof(union hibernate_info)/hiber_info->secsize -
665 	    chunktable_size;
666 
667 	return (0);
668 }
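
/*
 * The resulting layout at the tail of the swap partition, as computed
 * above (not to scale):
 *
 *	... swap ... | compressed image | chunk table | signature block
 *	             ^ image_offset                   ^ sig_offset
 *
 * The signature block occupies the last sector(s) of the partition and
 * the chunk table immediately precedes it.
 */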
669 
670 /*
671  * Allocate nitems*size bytes from the hiballoc area presently in use
672  */
673 void *
674 hibernate_zlib_alloc(void *unused, int nitems, int size)
675 {
676 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
677 }
678 
679 /*
680  * Free the memory pointed to by addr in the hiballoc area presently in
681  * use
682  */
683 void
684 hibernate_zlib_free(void *unused, void *addr)
685 {
686 	hib_free(&hibernate_state->hiballoc_arena, addr);
687 }
688 
689 /*
690  * Inflate size bytes from src into dest, skipping any destination pages
691  * that are special (see hibernate_inflate_skip).
692  *
693  * For each page of output data, we map hibernate_inflate_page
694  * to the current output page, and tell inflate() to inflate
695  * its data there, resulting in the inflated data being placed
696  * at the proper paddr.
697  *
698  * This function executes while using the resume-time stack
699  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
700  * will likely hang or reset the machine.
701  */
702 void
703 hibernate_inflate(union hibernate_info *hiber_info, paddr_t dest,
704     paddr_t src, size_t size)
705 {
706 	int i;
707 
708 	hibernate_state->hib_stream.avail_in = size;
709 	hibernate_state->hib_stream.next_in = (char *)src;
710 
711 	hibernate_inflate_page = hiber_info->piglet_va + 2 * PAGE_SIZE;
712 
713 	do {
714 		/* Flush cache and TLB */
715 		hibernate_flush();
716 
717 		/*
718 		 * Is this a special page? If yes, redirect the
719 		 * inflate output to a scratch page (eg, discard it)
720 		 */
721 		if (hibernate_inflate_skip(hiber_info, dest))
722 			hibernate_enter_resume_mapping(
723 			    hibernate_inflate_page,
724 			    hiber_info->piglet_pa + 2 * PAGE_SIZE, 0);
725 		else
726 			hibernate_enter_resume_mapping(
727 			    hibernate_inflate_page, dest, 0);
728 
729 		/* Set up the stream for inflate */
730 		hibernate_state->hib_stream.avail_out = PAGE_SIZE;
731 		hibernate_state->hib_stream.next_out =
732 		    (char *)hiber_info->piglet_va + 2 * PAGE_SIZE;
733 
734 		/* Process next block of data */
735 		i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
736 		if (i != Z_OK && i != Z_STREAM_END) {
737 			/*
738 			 * XXX - this will likely reboot/hang most machines,
739 			 *       but there's not much else we can do here.
740 			 */
741 			panic("inflate error");
742 		}
743 
744 		dest += PAGE_SIZE - hibernate_state->hib_stream.avail_out;
745 	} while (i != Z_STREAM_END);
746 }
747 
748 /*
749  * Deflate from src into the I/O page, up to 'remaining' bytes.
750  *
751  * Returns the number of input bytes consumed, and may reset
752  * the 'remaining' parameter if not all the output space was consumed
753  * (this information is needed to know how much to write to disk).
754  */
755 size_t
756 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
757     size_t *remaining)
758 {
759 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
760 
761 	/* Set up the stream for deflate */
762 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
763 	hibernate_state->hib_stream.avail_out = *remaining;
764 	hibernate_state->hib_stream.next_in = (caddr_t)src;
765 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
766 	    (PAGE_SIZE - *remaining);
767 
768 	/* Process next block of data */
769 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
770 		panic("hibernate zlib deflate error");
771 
772 	/* Update pointers and return number of bytes consumed */
773 	*remaining = hibernate_state->hib_stream.avail_out;
774 	return (PAGE_SIZE - (src & PAGE_MASK)) -
775 		hibernate_state->hib_stream.avail_in;
776 }
777 
778 /*
779  * Write the hibernation information specified in hiber_info
780  * to the location in swap previously calculated (last block of
781  * swap), called the "signature block".
782  *
783  * Write the memory chunk table to the area in swap immediately
784  * preceding the signature block.
785  */
786 int
787 hibernate_write_signature(union hibernate_info *hiber_info)
788 {
789 	u_int8_t *io_page;
790 	int result = 0;
791 
792 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
793 	if (!io_page)
794 		return (1);
795 
796 	/* Write hibernate info to disk */
797 	if (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
798 	    (vaddr_t)hiber_info, hiber_info->secsize, 1, io_page))
799 		result = 1;
800 
801 	free(io_page, M_DEVBUF);
802 	return (result);
803 }
804 
805 /*
806  * Write the memory chunk table to the area in swap immediately
807  * preceding the signature block. The chunk table is stored
808  * in the piglet when this function is called.
809  */
810 int
811 hibernate_write_chunktable(union hibernate_info *hiber_info)
812 {
813 	struct hibernate_disk_chunk *chunks;
814 	vaddr_t hibernate_chunk_table_start;
815 	size_t hibernate_chunk_table_size;
816 	u_int8_t *io_page;
817 	daddr_t chunkbase;
818 	int i;
819 
820 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
821 	if (!io_page)
822 		return (1);
823 
824 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
825 
826 	chunkbase = hiber_info->sig_offset -
827 	    (hibernate_chunk_table_size / hiber_info->secsize);
828 
829 	hibernate_chunk_table_start = hiber_info->piglet_va +
830 	    HIBERNATE_CHUNK_SIZE;
831 
832 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
833 	    HIBERNATE_CHUNK_SIZE);
834 
835 	/* Write chunk table */
836 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
837 		if (hiber_info->io_func(hiber_info->device,
838 		    chunkbase + (i/hiber_info->secsize),
839 		    (vaddr_t)(hibernate_chunk_table_start + i),
840 		    MAXPHYS, 1, io_page)) {
841 			free(io_page, M_DEVBUF);
842 			return (1);
843 		}
844 	}
845 
846 	free(io_page, M_DEVBUF);
847 	return (0);
848 }
849 
850 /*
851  * Write an empty hiber_info to the swap signature block, which is
852  * guaranteed to not match any valid hiber_info.
853  */
854 int
855 hibernate_clear_signature(void)
856 {
857 	union hibernate_info blank_hiber_info;
858 	union hibernate_info hiber_info;
859 	u_int8_t *io_page;
860 
861 	/* Zero out a blank hiber_info */
862 	bzero(&blank_hiber_info, sizeof(blank_hiber_info));
863 
864 	if (get_hibernate_info(&hiber_info, 0))
865 		return (1);
866 
867 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
868 	if (!io_page)
869 		return (1);
870 
871 	/* Write (zeroed) hibernate info to disk */
872 	/* XXX - use regular kernel write routine for this */
873 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
874 	    (vaddr_t)&blank_hiber_info, hiber_info.secsize, 1, io_page))
875 		panic("error hibernate write 6");
876 
877 	free(io_page, M_DEVBUF);
878 
879 	return (0);
880 }
881 
882 /*
883  * Check chunk range overlap when calculating whether or not to copy a
884  * compressed chunk to the piglet area before decompressing.
885  *
886  * Returns zero if the ranges do not overlap, non-zero otherwise.
887  */
888 int
889 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
890 {
891 	/* case A : end of r1 overlaps start of r2 */
892 	if (r1s < r2s && r1e > r2s)
893 		return (1);
894 
895 	/* case B : r1 entirely inside r2 */
896 	if (r1s >= r2s && r1e <= r2e)
897 		return (1);
898 
899 	/* case C : r2 entirely inside r1 */
900 	if (r2s >= r1s && r2e <= r1e)
901 		return (1);
902 
903 	/* case D : end of r2 overlaps start of r1 */
904 	if (r2s < r1s && r2e > r1s)
905 		return (1);
906 
907 	return (0);
908 }
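
/*
 * For example, [0x1000, 0x3000) and [0x2000, 0x4000) overlap (case A),
 * while [0x1000, 0x2000) and [0x2000, 0x3000) do not: ranges that merely
 * touch at an endpoint are not treated as overlapping by the strict
 * comparisons above.
 */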
909 
910 /*
911  * Compare two hibernate_infos to determine if they are the same (e.g.,
912  * we should be performing a hibernate resume on this machine).
913  * Not all fields are checked - just enough to verify that the machine
914  * has the same memory configuration and kernel as the one that
915  * wrote the signature previously.
916  */
917 int
918 hibernate_compare_signature(union hibernate_info *mine,
919     union hibernate_info *disk)
920 {
921 	u_int i;
922 
923 	if (mine->nranges != disk->nranges)
924 		return (1);
925 
926 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
927 		return (1);
928 
929 	for (i = 0; i < mine->nranges; i++) {
930 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
931 		    (mine->ranges[i].end != disk->ranges[i].end) )
932 			return (1);
933 	}
934 
935 	return (0);
936 }
937 
938 /*
939  * Reads read_size bytes from the hibernate device specified in
940  * hib_info at offset blkctr. Output is placed into the vaddr specified
941  * at dest.
942  *
943  * Separate offsets and pages are used to handle misaligned reads (reads
944  * that span a page boundary).
945  *
946  * blkctr specifies a relative offset (relative to the start of swap),
947  * not an absolute disk offset
948  *
949  */
950 int
951 hibernate_read_block(union hibernate_info *hib_info, daddr_t blkctr,
952     size_t read_size, vaddr_t dest)
953 {
954 	struct buf *bp;
955 	struct bdevsw *bdsw;
956 	int error;
957 
958 	bp = geteblk(read_size);
959 	bdsw = &bdevsw[major(hib_info->device)];
960 
961 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
962 	if (error) {
963 		printf("hibernate_read_block open failed\n");
964 		return (1);
965 	}
966 
967 	bp->b_bcount = read_size;
968 	bp->b_blkno = blkctr;
969 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
970 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
971 	bp->b_dev = hib_info->device;
972 	bp->b_cylinder = 0;
973 	(*bdsw->d_strategy)(bp);
974 
975 	error = biowait(bp);
976 	if (error) {
977 		printf("hibernate_read_block biowait failed %d\n", error);
978 		error = (*bdsw->d_close)(hib_info->device, 0, S_IFCHR,
979 		    curproc);
980 		if (error)
981 			printf("hibernate_read_block error close failed\n");
982 		return (1);
983 	}
984 
985 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
986 	if (error) {
987 		printf("hibernate_read_block close failed\n");
988 		return (1);
989 	}
990 
991 	bcopy(bp->b_data, (caddr_t)dest, read_size);
992 
993 	bp->b_flags |= B_INVAL;
994 	brelse(bp);
995 
996 	return (0);
997 }
998 
999 /*
1000  * Reads the signature block from swap, checks against the current machine's
1001  * information. If the information matches, perform a resume by reading the
1002  * saved image into the pig area, and unpacking.
1003  */
1004 void
1005 hibernate_resume(void)
1006 {
1007 	union hibernate_info hiber_info;
1008 	u_int8_t *io_page;
1009 	int s;
1010 
1011 	/* Scrub temporary vaddr ranges used during resume */
1012 	hibernate_temp_page = (vaddr_t)NULL;
1013 	hibernate_fchunk_area = (vaddr_t)NULL;
1014 	hibernate_chunktable_area = (vaddr_t)NULL;
1015 	hibernate_stack_page = (vaddr_t)NULL;
1016 
1017 	/* Get current running machine's hibernate info */
1018 	bzero(&hiber_info, sizeof(hiber_info));
1019 	if (get_hibernate_info(&hiber_info, 0))
1020 		return;
1021 
1022 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
1023 	if (!io_page)
1024 		return;
1025 
1026 	/* Read hibernate info from disk */
1027 	s = splbio();
1028 
1029 	/* XXX use regular kernel read routine here */
1030 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
1031 	    (vaddr_t)&disk_hiber_info, hiber_info.secsize, 0, io_page))
1032 		panic("error in hibernate read");
1033 
1034 	free(io_page, M_DEVBUF);
1035 
1036 	/*
1037 	 * If on-disk and in-memory hibernate signatures match,
1038 	 * this means we should do a resume from hibernate.
1039 	 */
1040 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info))
1041 		return;
1042 
1043 	/*
1044 	 * Allocate several regions of vaddrs for use during read.
1045 	 * These mappings go into the resuming kernel's page table, and are
1046 	 * used only during image read.
1047 	 */
1048 	hibernate_temp_page = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any,
1049 	    &kp_none, &kd_nowait);
1050 	if (!hibernate_temp_page)
1051 		goto fail;
1052 
1053 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1054 	    &kp_none, &kd_nowait);
1055 	if (!hibernate_fchunk_area)
1056 		goto fail;
1057 
1058 	/* Allocate a temporary chunktable area */
1059 	hibernate_chunktable_area = (vaddr_t)malloc(HIBERNATE_CHUNK_TABLE_SIZE,
1060 					   M_DEVBUF, M_NOWAIT);
1061 	if (!hibernate_chunktable_area)
1062 		goto fail;
1063 
1064 	/* Allocate one temporary page of VAs for the resume time stack */
1065 	hibernate_stack_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1066 	    &kp_none, &kd_nowait);
1067 	if (!hibernate_stack_page)
1068 		goto fail;
1069 
1070 	/* Read the image from disk into the image (pig) area */
1071 	if (hibernate_read_image(&disk_hiber_info))
1072 		goto fail;
1073 
1074 	/* Point of no return ... */
1075 
1076 	disable_intr();
1077 	cold = 1;
1078 
1079 	/* Switch stacks */
1080 	hibernate_switch_stack_machdep();
1081 
1082 	/*
1083 	 * Image is now in high memory (pig area), copy to correct location
1084 	 * in memory. We'll eventually end up copying on top of ourself, but
1085 	 * we are assured the kernel code here is the same between the
1086 	 * hibernated and resuming kernel, and we are running on our own
1087 	 * stack, so the overwrite is ok.
1088 	 */
1089 	hibernate_unpack_image(&disk_hiber_info);
1090 
1091 	/*
1092 	 * Resume the loaded kernel by jumping to the MD resume vector.
1093 	 * We won't be returning from this call.
1094 	 */
1095 	hibernate_resume_machdep();
1096 
1097 fail:
1098 	printf("Unable to resume hibernated image\n");
1099 
1100 	if (hibernate_temp_page)
1101 		km_free((void *)hibernate_temp_page, 2*PAGE_SIZE, &kv_any,
1102 		    &kp_none);
1103 
1104 	if (hibernate_fchunk_area)
1105 		km_free((void *)hibernate_fchunk_area, 3*PAGE_SIZE, &kv_any,
1106 		    &kp_none);
1107 
1111 	if (hibernate_chunktable_area)
1112 		free((void *)hibernate_chunktable_area, M_DEVBUF);
1113 }
1114 
1115 /*
1116  * Unpack image from pig area to original location by looping through the
1117  * list of output chunks in the order they should be restored (fchunks).
1118  * This ordering is used to avoid having inflate overwrite a chunk in the
1119  * middle of processing that chunk. This will, of course, happen during the
1120  * final output chunk, where we copy the chunk to the piglet area first,
1121  * before inflating.
1122  */
1123 void
1124 hibernate_unpack_image(union hibernate_info *hiber_info)
1125 {
1126 	struct hibernate_disk_chunk *chunks;
1127 	paddr_t image_cur;
1128 	vaddr_t tempva;
1129 	int *fchunks, i;
1130 	char *pva;
1131 
1132 	pva = (char *)hiber_info->piglet_va;
1133 
1134 	fchunks = (int *)(pva + (4 * PAGE_SIZE));
1135 
1136 	/* Copy temporary chunktable to piglet */
1137 	tempva = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1138 	    &kp_none, &kd_nowait);
1139 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1140 		pmap_kenter_pa(tempva + i, hiber_info->piglet_pa +
1141 		    HIBERNATE_CHUNK_SIZE + i, VM_PROT_ALL);
1142 
1143 	bcopy((caddr_t)hibernate_chunktable_area, (caddr_t)tempva,
1144 	    HIBERNATE_CHUNK_TABLE_SIZE);
1145 
1146 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1147 
1148 	hibernate_activate_resume_pt_machdep();
1149 
1150 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1151 		/* Reset zlib for inflate */
1152 		if (hibernate_zlib_reset(hiber_info, 0) != Z_OK)
1153 			panic("hibernate failed to reset zlib for inflate");
1154 
1155 		/*
1156 		 * If there is a conflict, copy the chunk to the piglet area
1157 		 * before unpacking it to its original location.
1158 		 */
1159 		if ((chunks[fchunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1160 			hibernate_inflate(hiber_info, chunks[fchunks[i]].base,
1161 			    image_cur, chunks[fchunks[i]].compressed_size);
1162 		else {
1163 			bcopy((caddr_t)image_cur,
1164 			    (caddr_t)hiber_info->piglet_va +
1165 			    HIBERNATE_CHUNK_SIZE * 2,
1166 			    chunks[fchunks[i]].compressed_size);
1167 			hibernate_inflate(hiber_info, chunks[fchunks[i]].base,
1168 			    hiber_info->piglet_va + HIBERNATE_CHUNK_SIZE * 2,
1169 			    chunks[fchunks[i]].compressed_size);
1170 		}
1171 		image_cur += chunks[fchunks[i]].compressed_size;
1172 	}
1173 }
1174 
1175 /*
1176  * Write a compressed version of this machine's memory to disk, at the
1177  * precalculated swap offset:
1178  *
1179  * end of swap - signature block size - chunk table size - memory size
1180  *
1181  * The function begins by looping through each phys mem range, cutting each
1182  * one into 4MB chunks. These chunks are then compressed individually
1183  * and written out to disk, in phys mem order. Some chunks might compress
1184  * more than others, and for this reason, each chunk's size is recorded
1185  * in the chunk table, which is written to disk after the image has
1186  * properly been compressed and written (in hibernate_write_chunktable).
1187  *
1188  * When this function is called, the machine is nearly suspended - most
1189  * devices are quiesced/suspended, interrupts are off, and cold has
1190  * been set. This means that there can be no side effects once the
1191  * write has started, and the write function itself can also have no
1192  * side effects.
1193  *
1194  * This function uses the piglet area during this process as follows:
1195  *
1196  * offset from piglet base	use
1197  * -----------------------	--------------------
1198  * 0				i/o allocation area
1199  * PAGE_SIZE			i/o write area
1200  * 2*PAGE_SIZE			temp/scratch page
1201  * 3*PAGE_SIZE			temp/scratch page
1202  * 4*PAGE_SIZE			hiballoc arena
1203  * 5*PAGE_SIZE to 85*PAGE_SIZE	zlib deflate area
1204  * ...
1205  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1206  *
1207  * Some transient piglet content is saved as part of deflate,
1208  * but it is irrelevant during resume as it will be repurposed
1209  * at that time for other things.
1210  */
1211 int
1212 hibernate_write_chunks(union hibernate_info *hiber_info)
1213 {
1214 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1215 	size_t nblocks, out_remaining, used, offset = 0;
1216 	struct hibernate_disk_chunk *chunks;
1217 	vaddr_t hibernate_alloc_page = hiber_info->piglet_va;
1218 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1219 	daddr_t blkctr = hiber_info->image_offset;
1220 	int i;
1221 
1222 	hiber_info->chunk_ctr = 0;
1223 
1224 	/*
1225 	 * Allocate VA for the temp and copy page.
1226 	 */
1227 
1228 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1229 	    &kp_none, &kd_nowait);
1230 	if (!hibernate_temp_page)
1231 		return (1);
1232 
1233 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1234 	    &kp_none, &kd_nowait);
1235 	if (!hibernate_copy_page)
1236 		return (1);
1237 
1238 	pmap_kenter_pa(hibernate_copy_page,
1239 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1240 
1241 	/* XXX - needed on i386. check other archs */
1242 	pmap_activate(curproc);
1243 
1244 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1245 	    HIBERNATE_CHUNK_SIZE);
1246 
1247 	/* Calculate the chunk regions */
1248 	for (i = 0; i < hiber_info->nranges; i++) {
1249 		range_base = hiber_info->ranges[i].base;
1250 		range_end = hiber_info->ranges[i].end;
1251 
1252 		inaddr = range_base;
1253 
1254 		while (inaddr < range_end) {
1255 			chunks[hiber_info->chunk_ctr].base = inaddr;
1256 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1257 				chunks[hiber_info->chunk_ctr].end = inaddr +
1258 				    HIBERNATE_CHUNK_SIZE;
1259 			else
1260 				chunks[hiber_info->chunk_ctr].end = range_end;
1261 
1262 			inaddr += HIBERNATE_CHUNK_SIZE;
1263 			hiber_info->chunk_ctr++;
1264 		}
1265 	}
1266 
1267 	/* Compress and write the chunks in the chunktable */
1268 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1269 		range_base = chunks[i].base;
1270 		range_end = chunks[i].end;
1271 
1272 		chunks[i].offset = blkctr;
1273 
1274 		/* Reset zlib for deflate */
1275 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1276 			return (1);
1277 
1278 		inaddr = range_base;
1279 
1280 		/*
1281 		 * For each range, loop through its phys mem region
1282 		 * and write out the chunks (the last chunk might be
1283 		 * smaller than the chunk size).
1284 		 */
1285 		while (inaddr < range_end) {
1286 			out_remaining = PAGE_SIZE;
1287 			while (out_remaining > 0 && inaddr < range_end) {
1288 				pmap_kenter_pa(hibernate_temp_page,
1289 				    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1290 				pmap_activate(curproc);
1291 
1292 				bcopy((caddr_t)hibernate_temp_page,
1293 				    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1294 
1295 				/* Adjust for non page-sized regions */
1296 				temp_inaddr = (inaddr & PAGE_MASK) +
1297 				    hibernate_copy_page;
1298 
1299 				/* Deflate from temp_inaddr to IO page */
1300 				inaddr += hibernate_deflate(hiber_info,
1301 				    temp_inaddr, &out_remaining);
1302 			}
1303 
1304 			if (out_remaining == 0) {
1305 				/* Filled up the page */
1306 				nblocks = PAGE_SIZE / hiber_info->secsize;
1307 
1308 				if (hiber_info->io_func(hiber_info->device,
1309 				    blkctr, (vaddr_t)hibernate_io_page,
1310 				    PAGE_SIZE, 1, (void *)hibernate_alloc_page))
1311 					return (1);
1312 
1313 				blkctr += nblocks;
1314 			}
1315 		}
1316 
1317 		if (inaddr != range_end)
1318 			return (1);
1319 
1320 		/*
1321 		 * End of range. Round up to next secsize bytes
1322 		 * after finishing compress
1323 		 */
1324 		if (out_remaining == 0)
1325 			out_remaining = PAGE_SIZE;
1326 
1327 		/* Finish compress */
1328 		hibernate_state->hib_stream.avail_in = 0;
1329 		hibernate_state->hib_stream.avail_out = out_remaining;
1330 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1331 		hibernate_state->hib_stream.next_out =
1332 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1333 
1334 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1335 		    Z_STREAM_END)
1336 			return (1);
1337 
1338 		out_remaining = hibernate_state->hib_stream.avail_out;
1339 
1340 		used = PAGE_SIZE - out_remaining;
1341 		nblocks = used / hiber_info->secsize;
1342 
1343 		/* Round up to next block if needed */
1344 		if (used % hiber_info->secsize != 0)
1345 			nblocks++;
1346 
1347 		/* Write final block(s) for this chunk */
1348 		if (hiber_info->io_func(hiber_info->device, blkctr,
1349 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1350 		    1, (void *)hibernate_alloc_page))
1351 			return (1);
1352 
1353 		blkctr += nblocks;
1354 
1355 		offset = blkctr;
1356 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1357 		    hiber_info->secsize;
1358 	}
1359 
1360 	return (0);
1361 }
1362 
1363 /*
1364  * Reset the zlib stream state and allocate a new hiballoc area for either
1365  * inflate or deflate. This function is called once for each hibernate chunk.
1366  * Calling hiballoc_init multiple times is acceptable since the memory it is
1367  * provided is unmanaged memory (stolen). We use the memory provided to us
1368  * by the piglet allocated via the supplied hiber_info.
1369  */
1370 int
1371 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1372 {
1373 	vaddr_t hibernate_zlib_start;
1374 	size_t hibernate_zlib_size;
1375 
1376 	hibernate_state = (struct hibernate_zlib_state *)
1377 	    (hiber_info->piglet_va + (4 * PAGE_SIZE));
1378 
1379 	hibernate_zlib_start = hiber_info->piglet_va + (5 * PAGE_SIZE);
1380 	hibernate_zlib_size = 80 * PAGE_SIZE;
1381 
1382 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1383 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1384 
1385 	/* Set up stream structure */
1386 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1387 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1388 
1389 	/* Initialize the hiballoc arena for zlib allocs/frees */
1390 	hiballoc_init(&hibernate_state->hiballoc_arena,
1391 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1392 
1393 	if (deflate) {
1394 		return deflateInit(&hibernate_state->hib_stream,
1395 		    Z_DEFAULT_COMPRESSION);
1396 	} else
1397 		return inflateInit(&hibernate_state->hib_stream);
1398 }
1399 
1400 /*
1401  * Reads the hibernated memory image from disk, whose location and
1402  * size are recorded in hiber_info. Begin by reading the persisted
1403  * chunk table, which records the original chunk placement location
1404  * and compressed size for each. Next, allocate a pig region of
1405  * sufficient size to hold the compressed image. Next, read the
1406  * chunks into the pig area (calling hibernate_read_chunks to do this),
1407  * and finally, if all of the above succeeds, clear the hibernate signature.
1408  * The function will then return to hibernate_resume, which will proceed
1409  * to unpack the pig image to the correct place in memory.
1410  */
1411 int
1412 hibernate_read_image(union hibernate_info *hiber_info)
1413 {
1414 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1415 	paddr_t image_start, image_end, pig_start, pig_end;
1416 	struct hibernate_disk_chunk *chunks;
1417 	daddr_t blkctr;
1418 	int i;
1419 
1420 	/* Calculate total chunk table size in disk blocks */
1421 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1422 
1423 	blkctr = hiber_info->sig_offset - chunktable_size -
1424 			hiber_info->swap_offset;
1425 
1426 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1427 	    i += MAXPHYS, blkctr += MAXPHYS/hiber_info->secsize)
1428 		hibernate_read_block(hiber_info, blkctr, MAXPHYS,
1429 		    hibernate_chunktable_area + i);
1430 
1431 	blkctr = hiber_info->image_offset;
1432 	compressed_size = 0;
1433 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1434 
1435 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1436 		compressed_size += chunks[i].compressed_size;
1437 
1438 	disk_size = compressed_size;
1439 
1440 	/* Allocate the pig area */
1441 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1442 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1443 		return (1);
1444 
1445 	pig_end = pig_start + pig_sz;
1446 
1447 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1448 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1451 	image_start = image_end - disk_size;
1452 
1453 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size);
1454 
1455 	/* Prepare the resume time pmap/page table */
1456 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1457 
1458 	/* Read complete, clear the signature and return */
1459 	return hibernate_clear_signature();
1460 }
1461 
1462 /*
1463  * Read the hibernated memory chunks from disk (chunk information at this
1464  * point is stored in the piglet) into the pig area specified by
1465  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1466  * only chunk with overlap possibilities.
1467  *
1468  * This function uses the piglet area during this process as follows:
1469  *
1470  * offset from piglet base	use
1471  * -----------------------	--------------------
1472  * 0				i/o allocation area
1473  * PAGE_SIZE			i/o write area
1474  * 2*PAGE_SIZE			temp/scratch page
1475  * 3*PAGE_SIZE			temp/scratch page
1476  * 4*PAGE_SIZE to 6*PAGE_SIZE	chunk ordering area
1477  * 7*PAGE_SIZE			hiballoc arena
1478  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1479  * ...
1480  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1481  */
1482 int
1483 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1484     paddr_t pig_end, size_t image_compr_size)
1485 {
1486 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1487 	paddr_t copy_start, copy_end, piglet_cur;
1488 	paddr_t piglet_base = hib_info->piglet_pa;
1489 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1490 	daddr_t blkctr;
1491 	size_t processed, compressed_size, read_size;
1492 	int i, j, overlap, found, nchunks, nochunks = 0, nfchunks = 0, npchunks = 0;
1493 	struct hibernate_disk_chunk *chunks;
1494 	u_int8_t *ochunks, *pchunks, *fchunks;
1495 
1496 	/* Map the chunk ordering region */
1497 	pmap_kenter_pa(hibernate_fchunk_area,
1498 	    piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1499 	pmap_kenter_pa(hibernate_fchunk_area + PAGE_SIZE,
1500 	    piglet_base + (5*PAGE_SIZE), VM_PROT_ALL);
1501 	pmap_kenter_pa(hibernate_fchunk_area + 2*PAGE_SIZE,
1502 	    piglet_base + (6*PAGE_SIZE), VM_PROT_ALL);
1503 
1504 	/* Temporary output chunk ordering */
1505 	ochunks = (u_int8_t *)hibernate_fchunk_area;
1506 
1507 	/* Piglet chunk ordering */
1508 	pchunks = (u_int8_t *)hibernate_fchunk_area + PAGE_SIZE;
1509 
1510 	/* Final chunk ordering */
1511 	fchunks = (u_int8_t *)hibernate_fchunk_area + 2*PAGE_SIZE;
1512 
1513 	nchunks = hib_info->chunk_ctr;
1514 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1515 
1516 	/* Initially start all chunks as unplaced */
1517 	for (i = 0; i < nchunks; i++)
1518 		chunks[i].flags = 0;
1519 
1520 	/*
1521 	 * Search the list for chunks that are outside the pig area. These
1522 	 * can be placed first in the final output list.
1523 	 */
1524 	for (i = 0; i < nchunks; i++) {
1525 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1526 			ochunks[nochunks] = (u_int8_t)i;
1527 			fchunks[nfchunks] = (u_int8_t)i;
1528 			nochunks++;
1529 			nfchunks++;
1530 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1531 		}
1532 	}
1533 
1534 	/*
1535 	 * Walk the ordering, place the chunks in ascending memory order.
1536 	 * Conflicts might arise, these are handled next.
1537 	 */
1538 	do {
1539 		img_index = -1;
1540 		found = 0;
1541 		j = -1;
1542 		for (i = 0; i < nchunks; i++)
1543 			if (chunks[i].base < img_index &&
1544 			    chunks[i].flags == 0 ) {
1545 				j = i;
1546 				img_index = chunks[i].base;
1547 			}
1548 
1549 		if (j != -1) {
1550 			found = 1;
1551 			ochunks[nochunks] = (u_int8_t)j;
1552 			nochunks++;
1553 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1554 		}
1555 	} while (found);
1556 
1557 	img_index = pig_start;
1558 
1559 	/*
1560 	 * Identify chunk output conflicts (chunks whose pig load area
1561 	 * corresponds to their original memory placement location)
1562 	 */
1563 	for (i = 0; i < nochunks ; i++) {
1564 		overlap = 0;
1565 		r1s = img_index;
1566 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1567 		r2s = chunks[ochunks[i]].base;
1568 		r2e = chunks[ochunks[i]].end;
1569 
1570 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1571 		if (overlap)
1572 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1573 		img_index += chunks[ochunks[i]].compressed_size;
1574 	}
1575 
1576 	/*
1577 	 * Prepare the final output chunk list. Calculate an output
1578 	 * inflate strategy for overlapping chunks if needed.
1579 	 */
1580 	img_index = pig_start;
1581 	for (i = 0; i < nochunks ; i++) {
1582 		/*
1583 		 * If a conflict is detected, consume enough compressed
1584 		 * output chunks to fill the piglet
1585 		 */
1586 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1587 			copy_start = piglet_base;
1588 			copy_end = piglet_end;
1589 			piglet_cur = piglet_base;
1590 			npchunks = 0;
1591 			j = i;
1592 			while (copy_start < copy_end && j < nochunks) {
1593 				piglet_cur += chunks[ochunks[j]].compressed_size;
1594 				pchunks[npchunks] = ochunks[j];
1595 				npchunks++;
1596 				copy_start += chunks[ochunks[j]].compressed_size;
1597 				img_index += chunks[ochunks[j]].compressed_size;
1598 				i++;
1599 				j++;
1600 			}
1601 
1602 			piglet_cur = piglet_base;
1603 			for (j = 0; j < npchunks; j++) {
1604 				piglet_cur += chunks[pchunks[j]].compressed_size;
1605 				fchunks[nfchunks] = pchunks[j];
1606 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1607 				nfchunks++;
1608 			}
1609 		} else {
1610 			/*
1611 			 * No conflict, chunk can be added without copying
1612 			 */
1613 			if ((chunks[ochunks[i]].flags &
1614 			    HIBERNATE_CHUNK_USED) == 0) {
1615 				fchunks[nfchunks] = ochunks[i];
1616 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1617 				nfchunks++;
1618 			}
1619 			img_index += chunks[ochunks[i]].compressed_size;
1620 		}
1621 	}
1622 
1623 	img_index = pig_start;
1624 	for (i = 0; i < nfchunks; i++) {
1625 		piglet_cur = piglet_base;
1626 		img_index += chunks[fchunks[i]].compressed_size;
1627 	}
1628 
1629 	img_cur = pig_start;
1630 
1631 	for (i = 0; i < nfchunks; i++) {
1632 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1633 		processed = 0;
1634 		compressed_size = chunks[fchunks[i]].compressed_size;
1635 
1636 		while (processed < compressed_size) {
1637 			pmap_kenter_pa(hibernate_temp_page, img_cur,
1638 			    VM_PROT_ALL);
1639 			pmap_kenter_pa(hibernate_temp_page + PAGE_SIZE,
1640 			    img_cur+PAGE_SIZE, VM_PROT_ALL);
1641 
1642 			/* XXX - needed on i386. check other archs */
1643 			pmap_activate(curproc);
1644 			if (compressed_size - processed >= PAGE_SIZE)
1645 				read_size = PAGE_SIZE;
1646 			else
1647 				read_size = compressed_size - processed;
1648 
1649 			hibernate_read_block(hib_info, blkctr, read_size,
1650 			    hibernate_temp_page + (img_cur & PAGE_MASK));
1651 
1652 			blkctr += (read_size / hib_info->secsize);
1653 
1654 			hibernate_flush();
1655 			pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1656 			pmap_kremove(hibernate_temp_page + PAGE_SIZE,
1657 			    PAGE_SIZE);
1658 			processed += read_size;
1659 			img_cur += read_size;
1660 		}
1661 	}
1662 
1663 	return (0);
1664 }
1665 
1666 /*
1667  * Hibernating a machine comprises the following operations:
1668  *  1. Calculating this machine's hibernate_info information
1669  *  2. Allocating a piglet and saving the piglet's physaddr
1670  *  3. Calculating the memory chunks
1671  *  4. Writing the compressed chunks to disk
1672  *  5. Writing the chunk table
1673  *  6. Writing the signature block (hibernate_info)
1674  *
1675  * On most architectures, the function calling hibernate_suspend would
1676  * then power off the machine using some MD-specific implementation.
1677  */
1678 int
1679 hibernate_suspend(void)
1680 {
1681 	union hibernate_info hib_info;
1682 
1683 	/*
1684 	 * Calculate memory ranges, swap offsets, etc.
1685 	 * This also allocates a piglet whose physaddr is stored in
1686 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1687 	 */
1688 	if (get_hibernate_info(&hib_info, 1))
1689 		return (1);
1690 
1691 	/* XXX - Won't need to zero everything with RLE */
1692 	uvm_pmr_zero_everything();
1693 
1694 	if (hibernate_write_chunks(&hib_info))
1695 		return (1);
1696 
1697 	if (hibernate_write_chunktable(&hib_info))
1698 		return (1);
1699 
1700 	return hibernate_write_signature(&hib_info);
1701 }
1702
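
/*
 * A sketch of how a machine-dependent suspend path might drive this
 * function (the power-off hook named here is hypothetical and not part
 * of this file):
 *
 *	if (hibernate_suspend() == 0)
 *		md_powerdown();		(hypothetical MD routine)
 *	else
 *		printf("hibernate: suspend failed\n");
 */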