1 /*	$OpenBSD: subr_hibernate.c,v 1.19 2011/11/13 18:38:10 mlarkin Exp $	*/
2 
3 /*
4  * Copyright (c) 2011 Ariane van der Steldt <ariane@stack.nl>
5  * Copyright (c) 2011 Mike Larkin <mlarkin@openbsd.org>
6  *
7  * Permission to use, copy, modify, and distribute this software for any
8  * purpose with or without fee is hereby granted, provided that the above
9  * copyright notice and this permission notice appear in all copies.
10  *
11  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
12  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
14  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18  */
19 
20 #include <sys/hibernate.h>
21 #include <sys/malloc.h>
22 #include <sys/param.h>
23 #include <sys/tree.h>
24 #include <sys/types.h>
25 #include <sys/systm.h>
26 #include <sys/disklabel.h>
27 #include <sys/disk.h>
28 #include <sys/conf.h>
29 #include <sys/buf.h>
30 #include <sys/fcntl.h>
31 #include <sys/stat.h>
32 #include <uvm/uvm.h>
33 #include <machine/hibernate.h>
34 
35 struct hibernate_zlib_state *hibernate_state;
36 
37 /* Temporary vaddr ranges used during hibernate */
38 vaddr_t hibernate_temp_page;
39 vaddr_t hibernate_copy_page;
40 vaddr_t hibernate_stack_page;
41 vaddr_t hibernate_fchunk_area;
42 vaddr_t	hibernate_chunktable_area;
43 
44 /* Hibernate info as read from disk during resume */
45 union hibernate_info disk_hiber_info;
46 paddr_t global_pig_start;
47 
48 /*
49  * Hib alloc enforced alignment.
50  */
51 #define HIB_ALIGN		8 /* bytes alignment */
52 
53 /*
54  * sizeof builtin operation, but with alignment constraint.
55  */
56 #define HIB_SIZEOF(_type)	roundup(sizeof(_type), HIB_ALIGN)
57 
58 struct hiballoc_entry {
59 	size_t			hibe_use;
60 	size_t			hibe_space;
61 	RB_ENTRY(hiballoc_entry) hibe_entry;
62 };
63 
64 /*
65  * Compare hiballoc entries based on the address they manage.
66  *
67  * Since the address is fixed, relative to struct hiballoc_entry,
68  * we just compare the hiballoc_entry pointers.
69  */
70 static __inline int
71 hibe_cmp(struct hiballoc_entry *l, struct hiballoc_entry *r)
72 {
73 	return l < r ? -1 : (l > r);
74 }
75 
76 RB_PROTOTYPE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
77 
78 /*
79  * Given a hiballoc entry, return the address it manages.
80  */
81 static __inline void *
82 hib_entry_to_addr(struct hiballoc_entry *entry)
83 {
84 	caddr_t addr;
85 
86 	addr = (caddr_t)entry;
87 	addr += HIB_SIZEOF(struct hiballoc_entry);
88 	return addr;
89 }
90 
91 /*
92  * Given an address, find the hiballoc entry that corresponds.
93  */
94 static __inline struct hiballoc_entry*
95 hib_addr_to_entry(void *addr_param)
96 {
97 	caddr_t addr;
98 
99 	addr = (caddr_t)addr_param;
100 	addr -= HIB_SIZEOF(struct hiballoc_entry);
101 	return (struct hiballoc_entry*)addr;
102 }
103 
104 RB_GENERATE(hiballoc_addr, hiballoc_entry, hibe_entry, hibe_cmp)
105 
106 /*
107  * Allocate memory from the arena.
108  *
109  * Returns NULL if no memory is available.
110  */
111 void *
112 hib_alloc(struct hiballoc_arena *arena, size_t alloc_sz)
113 {
114 	struct hiballoc_entry *entry, *new_entry;
115 	size_t find_sz;
116 
117 	/*
118 	 * Enforce alignment of HIB_ALIGN bytes.
119 	 *
120 	 * Note that, because the entry is put in front of the allocation,
121 	 * 0-byte allocations are guaranteed a unique address.
122 	 */
123 	alloc_sz = roundup(alloc_sz, HIB_ALIGN);
124 
125 	/*
126 	 * Find an entry with hibe_space >= find_sz.
127 	 *
128 	 * If the root node is not large enough, we switch to tree traversal.
129 	 * Because all entries are made at the bottom of the free space,
130 	 * traversal from the end has a slightly better chance of yielding
131 	 * a sufficiently large space.
132 	 */
133 	find_sz = alloc_sz + HIB_SIZEOF(struct hiballoc_entry);
134 	entry = RB_ROOT(&arena->hib_addrs);
135 	if (entry != NULL && entry->hibe_space < find_sz) {
136 		RB_FOREACH_REVERSE(entry, hiballoc_addr, &arena->hib_addrs) {
137 			if (entry->hibe_space >= find_sz)
138 				break;
139 		}
140 	}
141 
142 	/*
143 	 * Insufficient or too fragmented memory.
144 	 */
145 	if (entry == NULL)
146 		return NULL;
147 
148 	/*
149 	 * Create new entry in allocated space.
150 	 */
151 	new_entry = (struct hiballoc_entry*)(
152 	    (caddr_t)hib_entry_to_addr(entry) + entry->hibe_use);
153 	new_entry->hibe_space = entry->hibe_space - find_sz;
154 	new_entry->hibe_use = alloc_sz;
155 
156 	/*
157 	 * Insert entry.
158 	 */
159 	if (RB_INSERT(hiballoc_addr, &arena->hib_addrs, new_entry) != NULL)
160 		panic("hib_alloc: insert failure");
161 	entry->hibe_space = 0;
162 
163 	/* Return address managed by entry. */
164 	return hib_entry_to_addr(new_entry);
165 }
166 
167 /*
168  * Free a pointer previously allocated from this arena.
169  *
170  * If addr is NULL, this will be silently accepted.
171  */
172 void
173 hib_free(struct hiballoc_arena *arena, void *addr)
174 {
175 	struct hiballoc_entry *entry, *prev;
176 
177 	if (addr == NULL)
178 		return;
179 
180 	/*
181 	 * Derive entry from addr and check it is really in this arena.
182 	 */
183 	entry = hib_addr_to_entry(addr);
184 	if (RB_FIND(hiballoc_addr, &arena->hib_addrs, entry) != entry)
185 		panic("hib_free: freed item %p not in hib arena", addr);
186 
187 	/*
188 	 * Give the space in entry to its predecessor.
189 	 *
190 	 * If entry has no predecessor, change its used space into free space
191 	 * instead.
192 	 */
193 	prev = RB_PREV(hiballoc_addr, &arena->hib_addrs, entry);
194 	if (prev != NULL &&
195 	    (void *)((caddr_t)prev + HIB_SIZEOF(struct hiballoc_entry) +
196 	    prev->hibe_use + prev->hibe_space) == entry) {
197 		/* Merge entry. */
198 		RB_REMOVE(hiballoc_addr, &arena->hib_addrs, entry);
199 		prev->hibe_space += HIB_SIZEOF(struct hiballoc_entry) +
200 		    entry->hibe_use + entry->hibe_space;
201 	} else {
202 		/* Flip used memory to free space. */
203 		entry->hibe_space += entry->hibe_use;
204 		entry->hibe_use = 0;
205 	}
206 }
207 
208 /*
209  * Initialize hiballoc.
210  *
211  * The allocator will manage the memory at ptr, which is len bytes in size.
212  */
213 int
214 hiballoc_init(struct hiballoc_arena *arena, void *p_ptr, size_t p_len)
215 {
216 	struct hiballoc_entry *entry;
217 	caddr_t ptr;
218 	size_t len;
219 
220 	RB_INIT(&arena->hib_addrs);
221 
222 	/*
223 	 * Hib allocator enforces HIB_ALIGN alignment.
224 	 * Fixup ptr and len.
225 	 */
226 	ptr = (caddr_t)roundup((vaddr_t)p_ptr, HIB_ALIGN);
227 	len = p_len - ((size_t)ptr - (size_t)p_ptr);
228 	len &= ~((size_t)HIB_ALIGN - 1);
229 
230 	/*
231 	 * Insufficient memory to be able to allocate and also do bookkeeping.
232 	 */
233 	if (len <= HIB_SIZEOF(struct hiballoc_entry))
234 		return ENOMEM;
235 
236 	/*
237 	 * Create entry describing space.
238 	 */
239 	entry = (struct hiballoc_entry*)ptr;
240 	entry->hibe_use = 0;
241 	entry->hibe_space = len - HIB_SIZEOF(struct hiballoc_entry);
242 	RB_INSERT(hiballoc_addr, &arena->hib_addrs, entry);
243 
244 	return 0;
245 }
246 
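/*
 * Example (illustrative only, not part of this file): a minimal sketch
 * of how the hiballoc arena above can be used. The backing buffer, the
 * sizes and the function name are hypothetical; in this file the arena
 * is actually backed by piglet pages (see hibernate_zlib_reset below).
 */
#if 0
static void
hiballoc_example(void)
{
	static char backing[16 * 1024];	/* hypothetical unmanaged memory */
	struct hiballoc_arena arena;
	void *p;

	if (hiballoc_init(&arena, backing, sizeof(backing)) != 0)
		return;			/* not enough room for bookkeeping */

	p = hib_alloc(&arena, 100);	/* rounded up to a multiple of HIB_ALIGN */
	if (p != NULL)
		hib_free(&arena, p);	/* a NULL addr would also be accepted */
}
#endif
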
247 /*
248  * Zero all free memory.
249  */
250 void
251 uvm_pmr_zero_everything(void)
252 {
253 	struct uvm_pmemrange	*pmr;
254 	struct vm_page		*pg;
255 	int			 i;
256 
257 	uvm_lock_fpageq();
258 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
259 		/* Zero single pages. */
260 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_DIRTY]))
261 		    != NULL) {
262 			uvm_pmr_remove(pmr, pg);
263 			uvm_pagezero(pg);
264 			atomic_setbits_int(&pg->pg_flags, PG_ZERO);
265 			uvmexp.zeropages++;
266 			uvm_pmr_insert(pmr, pg, 0);
267 		}
268 
269 		/* Zero multi page ranges. */
270 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_DIRTY]))
271 		    != NULL) {
272 			pg--; /* Size tree always has second page. */
273 			uvm_pmr_remove(pmr, pg);
274 			for (i = 0; i < pg->fpgsz; i++) {
275 				uvm_pagezero(&pg[i]);
276 				atomic_setbits_int(&pg[i].pg_flags, PG_ZERO);
277 				uvmexp.zeropages++;
278 			}
279 			uvm_pmr_insert(pmr, pg, 0);
280 		}
281 	}
282 	uvm_unlock_fpageq();
283 }
284 
285 /*
286  * Mark all memory as dirty.
287  *
288  * Used to inform the system that the clean memory isn't clean for some
289  * reason, for example because we just came back from hibernate.
290  */
291 void
292 uvm_pmr_dirty_everything(void)
293 {
294 	struct uvm_pmemrange	*pmr;
295 	struct vm_page		*pg;
296 	int			 i;
297 
298 	uvm_lock_fpageq();
299 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
300 		/* Dirty single pages. */
301 		while ((pg = TAILQ_FIRST(&pmr->single[UVM_PMR_MEMTYPE_ZERO]))
302 		    != NULL) {
303 			uvm_pmr_remove(pmr, pg);
304 			atomic_clearbits_int(&pg->pg_flags, PG_ZERO);
305 			uvm_pmr_insert(pmr, pg, 0);
306 		}
307 
308 		/* Dirty multi page ranges. */
309 		while ((pg = RB_ROOT(&pmr->size[UVM_PMR_MEMTYPE_ZERO]))
310 		    != NULL) {
311 			pg--; /* Size tree always has second page. */
312 			uvm_pmr_remove(pmr, pg);
313 			for (i = 0; i < pg->fpgsz; i++)
314 				atomic_clearbits_int(&pg[i].pg_flags, PG_ZERO);
315 			uvm_pmr_insert(pmr, pg, 0);
316 		}
317 	}
318 
319 	uvmexp.zeropages = 0;
320 	uvm_unlock_fpageq();
321 }
322 
323 /*
324  * Allocate the highest address that can hold sz.
325  *
326  * sz in bytes.
327  */
328 int
329 uvm_pmr_alloc_pig(paddr_t *addr, psize_t sz)
330 {
331 	struct uvm_pmemrange	*pmr;
332 	struct vm_page		*pig_pg, *pg;
333 
334 	/*
335 	 * Convert sz to pages, since that is what pmemrange uses internally.
336 	 */
337 	sz = atop(round_page(sz));
338 
339 	uvm_lock_fpageq();
340 
341 	TAILQ_FOREACH(pmr, &uvm.pmr_control.use, pmr_use) {
342 		RB_FOREACH_REVERSE(pig_pg, uvm_pmr_addr, &pmr->addr) {
343 			if (pig_pg->fpgsz >= sz) {
344 				goto found;
345 			}
346 		}
347 	}
348 
349 	/*
350 	 * Allocation failure.
351 	 */
352 	uvm_unlock_fpageq();
353 	return ENOMEM;
354 
355 found:
356 	/* Remove page from freelist. */
357 	uvm_pmr_remove_size(pmr, pig_pg);
358 	pig_pg->fpgsz -= sz;
359 	pg = pig_pg + pig_pg->fpgsz;
360 	if (pig_pg->fpgsz == 0)
361 		uvm_pmr_remove_addr(pmr, pig_pg);
362 	else
363 		uvm_pmr_insert_size(pmr, pig_pg);
364 
365 	uvmexp.free -= sz;
366 	*addr = VM_PAGE_TO_PHYS(pg);
367 
368 	/*
369 	 * Update pg flags.
370 	 *
371 	 * Note that we trash the sz argument now.
372 	 */
373 	while (sz > 0) {
374 		KASSERT(pg->pg_flags & PQ_FREE);
375 
376 		atomic_clearbits_int(&pg->pg_flags,
377 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
378 
379 		if (pg->pg_flags & PG_ZERO)
380 			uvmexp.zeropages--;
381 		atomic_clearbits_int(&pg->pg_flags,
382 		    PG_ZERO|PQ_FREE);
383 
384 		pg->uobject = NULL;
385 		pg->uanon = NULL;
386 		pg->pg_version++;
387 
388 		/*
389 		 * Next.
390 		 */
391 		pg++;
392 		sz--;
393 	}
394 
395 	/* Return. */
396 	uvm_unlock_fpageq();
397 	return 0;
398 }
399 
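/*
 * Worked example of the size conversion above (illustrative, assuming a
 * hypothetical 4 KB page size): a request of sz = 10000 bytes becomes
 * round_page(10000) == 12288 bytes, and atop(12288) == 3 pages, which is
 * the page count that uvm_pmr_alloc_pig() compares against fpgsz while
 * scanning the address trees.
 */
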
400 /*
401  * Allocate a piglet area.
402  *
403  * The piglet is allocated as low in physical memory as possible.
404  * Piglets are aligned to 'align'.
405  *
406  * sz and align in bytes.
407  *
408  * The call may sleep to let the pagedaemon attempt to free memory.
409  * The pagedaemon may decide it's not possible to free enough memory, causing
410  * the allocation to fail.
411  */
412 int
413 uvm_pmr_alloc_piglet(vaddr_t *va, paddr_t *pa, vsize_t sz, paddr_t align)
414 {
415 	paddr_t			 pg_addr, piglet_addr;
416 	struct uvm_pmemrange	*pmr;
417 	struct vm_page		*pig_pg, *pg;
418 	struct pglist		 pageq;
419 	int			 pdaemon_woken;
420 	vaddr_t			 piglet_va;
421 
422 	KASSERT((align & (align - 1)) == 0);
423 	pdaemon_woken = 0; /* Didn't wake the pagedaemon. */
424 
425 	/*
426 	 * Fixup arguments: align must be at least PAGE_SIZE,
427 	 * sz will be rounded up to a page multiple (pmemrange works in
428 	 * page counts internally, so atop() is applied where needed).
429 	 */
430 	if (align < PAGE_SIZE)
431 		align = PAGE_SIZE;
432 	sz = round_page(sz);
433 
434 	uvm_lock_fpageq();
435 
436 	TAILQ_FOREACH_REVERSE(pmr, &uvm.pmr_control.use, uvm_pmemrange_use,
437 	    pmr_use) {
438 retry:
439 		/*
440 		 * Search for a range with enough space.
441 		 * Use the address tree, to ensure the range is as low as
442 		 * possible.
443 		 */
444 		RB_FOREACH(pig_pg, uvm_pmr_addr, &pmr->addr) {
445 			pg_addr = VM_PAGE_TO_PHYS(pig_pg);
446 			piglet_addr = (pg_addr + (align - 1)) & ~(align - 1);
447 
448 			if (atop(pg_addr) + pig_pg->fpgsz >=
449 			    atop(piglet_addr) + atop(sz))
450 				goto found;
451 		}
452 	}
453 
454 	/*
455 	 * Try to coerce the pagedaemon into freeing memory
456 	 * for the piglet.
457 	 *
458 	 * pdaemon_woken is set to prevent the code from
459 	 * falling into an endless loop.
460 	 */
461 	if (!pdaemon_woken) {
462 		pdaemon_woken = 1;
463 		if (uvm_wait_pla(ptoa(pmr->low), ptoa(pmr->high) - 1,
464 		    sz, UVM_PLA_FAILOK) == 0)
465 			goto retry;
466 	}
467 
468 	/* Return failure. */
469 	uvm_unlock_fpageq();
470 	return ENOMEM;
471 
472 found:
473 	/*
474 	 * Extract piglet from pigpen.
475 	 */
476 	TAILQ_INIT(&pageq);
477 	uvm_pmr_extract_range(pmr, pig_pg,
478 	    atop(piglet_addr), atop(piglet_addr) + atop(sz), &pageq);
479 
480 	*pa = piglet_addr;
481 	uvmexp.free -= atop(sz);
482 
483 	/*
484 	 * Update pg flags.
485 	 *
486 	 * Note that we trash the sz argument now.
487 	 */
488 	TAILQ_FOREACH(pg, &pageq, pageq) {
489 		KASSERT(pg->pg_flags & PQ_FREE);
490 
491 		atomic_clearbits_int(&pg->pg_flags,
492 		    PG_PMAP0|PG_PMAP1|PG_PMAP2|PG_PMAP3);
493 
494 		if (pg->pg_flags & PG_ZERO)
495 			uvmexp.zeropages--;
496 		atomic_clearbits_int(&pg->pg_flags,
497 		    PG_ZERO|PQ_FREE);
498 
499 		pg->uobject = NULL;
500 		pg->uanon = NULL;
501 		pg->pg_version++;
502 	}
503 
504 	uvm_unlock_fpageq();
505 
506 	/*
507 	 * Now allocate a va.
508 	 * Use direct mappings for the pages.
509 	 */
510 
511 	piglet_va = *va = (vaddr_t)km_alloc(sz, &kv_any, &kp_none, &kd_waitok);
512 	if (!piglet_va) {
513 		uvm_pglistfree(&pageq);
514 		return ENOMEM;
515 	}
516 
517 	/*
518 	 * Map piglet to va.
519 	 */
520 	TAILQ_FOREACH(pg, &pageq, pageq) {
521 		pmap_kenter_pa(piglet_va, VM_PAGE_TO_PHYS(pg), UVM_PROT_RW);
522 		piglet_va += PAGE_SIZE;
523 	}
524 	pmap_update(pmap_kernel());
525 
526 	return 0;
527 }
528 
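/*
 * Worked example of the piglet placement round-up in
 * uvm_pmr_alloc_piglet() (illustrative numbers only): with a hypothetical
 * align of 0x400000 (4 MB) and a candidate range starting at
 * pg_addr = 0x00401000,
 *
 *	piglet_addr = (0x00401000 + 0x3fffff) & ~0x3fffff == 0x00800000
 *
 * so the piglet starts at the next 4 MB boundary, provided the range
 * still extends at least atop(sz) pages past that boundary.
 */
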
529 /*
530  * Free a piglet area.
531  */
532 void
533 uvm_pmr_free_piglet(vaddr_t va, vsize_t sz)
534 {
535 	paddr_t			 pa;
536 	struct vm_page		*pg;
537 
538 	/*
539 	 * Fix parameters.
540 	 */
541 	sz = round_page(sz);
542 
543 	/*
544 	 * Find the first page in piglet.
545 	 * Since piglets are contiguous, the first pg is all we need.
546 	 */
547 	if (!pmap_extract(pmap_kernel(), va, &pa))
548 		panic("uvm_pmr_free_piglet: piglet 0x%lx has no pages", va);
549 	pg = PHYS_TO_VM_PAGE(pa);
550 	if (pg == NULL)
551 		panic("uvm_pmr_free_piglet: unmanaged page 0x%lx", pa);
552 
553 	/*
554 	 * Unmap.
555 	 */
556 	pmap_kremove(va, sz);
557 	pmap_update(pmap_kernel());
558 
559 	/*
560 	 * Free the physical and virtual memory.
561 	 */
562 	uvm_pmr_freepages(pg, atop(sz));
563 	km_free((void *)va, sz, &kv_any, &kp_none);
564 }
565 
566 /*
567  * Physmem RLE compression support.
568  *
569  * Given a physical page address, it will return the number of pages
570  * starting at that address that are free.
571  * Returns 0 if the page at addr is not free.
572  */
573 psize_t
574 uvm_page_rle(paddr_t addr)
575 {
576 	struct vm_page		*pg, *pg_end;
577 	struct vm_physseg	*vmp;
578 	int			 pseg_idx, off_idx;
579 
580 	pseg_idx = vm_physseg_find(atop(addr), &off_idx);
581 	if (pseg_idx == -1)
582 		return 0;
583 
584 	vmp = &vm_physmem[pseg_idx];
585 	pg = &vmp->pgs[off_idx];
586 	if (!(pg->pg_flags & PQ_FREE))
587 		return 0;
588 
589 	/*
590 	 * Search for the first non-free page after pg.
591 	 * Note that the page may not be the first page in a free pmemrange,
592 	 * therefore pg->fpgsz cannot be used.
593 	 */
594 	for (pg_end = pg; pg_end <= vmp->lastpg &&
595 	    (pg_end->pg_flags & PQ_FREE) == PQ_FREE; pg_end++);
596 	return pg_end - pg;
597 }
598 
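/*
 * Illustrative only: a hypothetical caller of uvm_page_rle() that counts
 * the free pages inside a physical range by advancing run-by-run. The
 * helper name and the range handling are made up; this only shows the
 * calling convention, it is not how this file currently uses the RLE.
 */
#if 0
static psize_t
count_free_pages(paddr_t start, paddr_t end)
{
	paddr_t pa = start;
	psize_t run, total = 0;

	while (pa < end) {
		run = uvm_page_rle(pa);
		if (run == 0) {
			/* page at pa is not free, step over it */
			pa += PAGE_SIZE;
			continue;
		}
		total += run;
		pa += ptoa(run);	/* skip the whole free run */
	}
	return total;
}
#endif
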
599 /*
600  * Fills out the hibernate_info union pointed to by hiber_info
601  * with information about this machine (swap signature block
602  * offsets, number of memory ranges, kernel in use, etc)
603  */
604 int
605 get_hibernate_info(union hibernate_info *hiber_info, int suspend)
606 {
607 	int chunktable_size;
608 	struct disklabel dl;
609 	char err_string[128], *dl_ret;
610 
611 	/* Determine I/O function to use */
612 	hiber_info->io_func = get_hibernate_io_function();
613 	if (hiber_info->io_func == NULL)
614 		return (1);
615 
616 	/* Calculate hibernate device */
617 	hiber_info->device = swdevt[0].sw_dev;
618 
619 	/* Read disklabel (used to calculate signature and image offsets) */
620 	dl_ret = disk_readlabel(&dl, hiber_info->device, err_string, 128);
621 
622 	if (dl_ret) {
623 		printf("Hibernate error reading disklabel: %s\n", dl_ret);
624 		return (1);
625 	}
626 
627 	hiber_info->secsize = dl.d_secsize;
628 
629 	/* Make sure the signature can fit in one block */
630 	KASSERT(sizeof(union hibernate_info)/hiber_info->secsize == 1);
631 
632 	/* Calculate swap offset from start of disk */
633 	hiber_info->swap_offset = dl.d_partitions[1].p_offset;
634 
635 	/* Calculate signature block location */
636 	hiber_info->sig_offset = dl.d_partitions[1].p_offset +
637 	    dl.d_partitions[1].p_size -
638 	    sizeof(union hibernate_info)/hiber_info->secsize;
639 
640 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
641 
642 	/* Stash kernel version information */
643 	bzero(&hiber_info->kernel_version, 128);
644 	bcopy(version, &hiber_info->kernel_version,
645 	    min(strlen(version), sizeof(hiber_info->kernel_version)-1));
646 
647 	if (suspend) {
648 		/* Allocate piglet region */
649 		if (uvm_pmr_alloc_piglet(&hiber_info->piglet_va,
650 		    &hiber_info->piglet_pa, HIBERNATE_CHUNK_SIZE*3,
651 		    HIBERNATE_CHUNK_SIZE)) {
652 			printf("Hibernate failed to allocate the piglet\n");
653 			return (1);
654 		}
655 	}
656 
657 	if (get_hibernate_info_md(hiber_info))
658 		return (1);
659 
660 	/* Calculate memory image location */
661 	hiber_info->image_offset = dl.d_partitions[1].p_offset +
662 	    dl.d_partitions[1].p_size -
663 	    (hiber_info->image_size / hiber_info->secsize) -
664 	    sizeof(union hibernate_info)/hiber_info->secsize -
665 	    chunktable_size;
666 
667 	return (0);
668 }
669 
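/*
 * Worked example of the offset arithmetic above (all numbers are made
 * up): assume a 512-byte sector size, a swap partition starting at
 * block 1024 with 2000000 sectors, a signature occupying exactly one
 * sector, a chunk table of 0x100000 bytes (2048 sectors) and an image
 * of 0x20000000 bytes (1048576 sectors). Then:
 *
 *	sig_offset   = 1024 + 2000000 - 1                  = 2001023
 *	image_offset = 1024 + 2000000 - 1048576 - 1 - 2048 =  950399
 *
 * i.e. the image ends right below the chunk table, which in turn ends
 * right below the signature block in the last sector of swap.
 */
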
670 /*
671  * Allocate nitems*size bytes from the hiballoc area presently in use
672  */
673 void *
674 hibernate_zlib_alloc(void *unused, int nitems, int size)
675 {
676 	return hib_alloc(&hibernate_state->hiballoc_arena, nitems*size);
677 }
678 
679 /*
680  * Free the memory pointed to by addr in the hiballoc area presently in
681  * use
682  */
683 void
684 hibernate_zlib_free(void *unused, void *addr)
685 {
686 	hib_free(&hibernate_state->hiballoc_arena, addr);
687 }
688 
689 /*
690  * Inflate size bytes from src into dest, skipping any pages in
691  * the destination range that are special (see hibernate_inflate_skip).
692  *
693  * For each page of output data, we map HIBERNATE_INFLATE_PAGE
694  * to the current output page, and tell inflate() to inflate
695  * its data there, resulting in the inflated data being placed
696  * at the proper paddr.
697  *
698  * This function executes while using the resume-time stack
699  * and pmap, and therefore cannot use ddb/printf/etc. Doing so
700  * will likely hang or reset the machine.
701  */
702 void
703 hibernate_inflate(union hibernate_info *hiber_info, paddr_t dest,
704     paddr_t src, size_t size)
705 {
706 	int i;
707 
708 	hibernate_state->hib_stream.avail_in = size;
709 	hibernate_state->hib_stream.next_in = (char *)src;
710 
711 	do {
712 		/* Flush cache and TLB */
713 		hibernate_flush();
714 
715 		/*
716 		 * Is this a special page? If yes, redirect the
717 		 * inflate output to a scratch page (eg, discard it)
718 		 */
719 		if (hibernate_inflate_skip(hiber_info, dest))
720 			hibernate_enter_resume_mapping(
721 			    HIBERNATE_INFLATE_PAGE,
722 			    HIBERNATE_INFLATE_PAGE, 0);
723 		else
724 			hibernate_enter_resume_mapping(
725 			    HIBERNATE_INFLATE_PAGE, dest, 0);
726 
727 		/* Set up the stream for inflate */
728 		hibernate_state->hib_stream.avail_out = PAGE_SIZE;
729 		hibernate_state->hib_stream.next_out =
730 		    (char *)HIBERNATE_INFLATE_PAGE;
731 
732 		/* Process next block of data */
733 		i = inflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH);
734 		if (i != Z_OK && i != Z_STREAM_END) {
735 			/*
736 			 * XXX - this will likely reboot/hang most machines,
737 			 *       but there's not much else we can do here.
738 			 */
739 			panic("inflate error");
740 		}
741 
742 		dest += PAGE_SIZE - hibernate_state->hib_stream.avail_out;
743 	} while (i != Z_STREAM_END);
744 }
745 
746 /*
747  * Deflate from src into the I/O page, up to 'remaining' bytes.
748  *
749  * Returns the number of input bytes consumed, and may reset
750  * the 'remaining' parameter if not all the output space was consumed
751  * (this information is needed to know how much to write to disk).
752  */
753 size_t
754 hibernate_deflate(union hibernate_info *hiber_info, paddr_t src,
755     size_t *remaining)
756 {
757 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
758 
759 	/* Set up the stream for deflate */
760 	hibernate_state->hib_stream.avail_in = PAGE_SIZE - (src & PAGE_MASK);
761 	hibernate_state->hib_stream.avail_out = *remaining;
762 	hibernate_state->hib_stream.next_in = (caddr_t)src;
763 	hibernate_state->hib_stream.next_out = (caddr_t)hibernate_io_page +
764 	    (PAGE_SIZE - *remaining);
765 
766 	/* Process next block of data */
767 	if (deflate(&hibernate_state->hib_stream, Z_PARTIAL_FLUSH) != Z_OK)
768 		panic("hibernate zlib deflate error\n");
769 
770 	/* Update pointers and return number of bytes consumed */
771 	*remaining = hibernate_state->hib_stream.avail_out;
772 	return (PAGE_SIZE - (src & PAGE_MASK)) -
773 		hibernate_state->hib_stream.avail_in;
774 }
775 
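/*
 * Worked example of the return value above (hypothetical numbers, 4 KB
 * pages): for a src with page offset 0x180, avail_in starts at
 * 4096 - 0x180 = 3712 bytes. If deflate() consumes all of it, the
 * function reports 3712 bytes consumed and the caller advances inaddr
 * to the next page boundary; if deflate() runs out of output space
 * first, only the smaller difference is reported and the caller
 * retries from where it left off.
 */
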
776 /*
777  * Write the hibernation information specified in hiber_info
778  * to the location in swap previously calculated (last block of
779  * swap), called the "signature block".
780  *
781  * Write the memory chunk table to the area in swap immediately
782  * preceding the signature block.
783  */
784 int
785 hibernate_write_signature(union hibernate_info *hiber_info)
786 {
787 	u_int8_t *io_page;
788 	int result = 0;
789 
790 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
791 	if (!io_page)
792 		return (1);
793 
794 	/* Write hibernate info to disk */
795 	if (hiber_info->io_func(hiber_info->device, hiber_info->sig_offset,
796 	    (vaddr_t)hiber_info, hiber_info->secsize, 1, io_page))
797 		result = 1;
798 
799 	free(io_page, M_DEVBUF);
800 	return (result);
801 }
802 
803 /*
804  * Write the memory chunk table to the area in swap immediately
805  * preceding the signature block. The chunk table is stored
806  * in the piglet when this function is called.
807  */
808 int
809 hibernate_write_chunktable(union hibernate_info *hiber_info)
810 {
811 	struct hibernate_disk_chunk *chunks;
812 	vaddr_t hibernate_chunk_table_start;
813 	size_t hibernate_chunk_table_size;
814 	u_int8_t *io_page;
815 	daddr_t chunkbase;
816 	int i;
817 
818 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
819 	if (!io_page)
820 		return (1);
821 
822 	hibernate_chunk_table_size = HIBERNATE_CHUNK_TABLE_SIZE;
823 
824 	chunkbase = hiber_info->sig_offset -
825 	    (hibernate_chunk_table_size / hiber_info->secsize);
826 
827 	hibernate_chunk_table_start = hiber_info->piglet_va +
828 	    HIBERNATE_CHUNK_SIZE;
829 
830 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
831 	    HIBERNATE_CHUNK_SIZE);
832 
833 	/* Write chunk table */
834 	for (i = 0; i < hibernate_chunk_table_size; i += MAXPHYS) {
835 		if (hiber_info->io_func(hiber_info->device,
836 		    chunkbase + (i/hiber_info->secsize),
837 		    (vaddr_t)(hibernate_chunk_table_start + i),
838 		    MAXPHYS, 1, io_page)) {
839 			free(io_page, M_DEVBUF);
840 			return (1);
841 		}
842 	}
843 
844 	free(io_page, M_DEVBUF);
845 	return (0);
846 }
847 
848 /*
849  * Write an empty hiber_info to the swap signature block, which is
850  * guaranteed to not match any valid hiber_info.
851  */
852 int
853 hibernate_clear_signature(void)
854 {
855 	union hibernate_info blank_hiber_info;
856 	union hibernate_info hiber_info;
857 	u_int8_t *io_page;
858 
859 	/* Zero out a blank hiber_info */
860 	bzero(&blank_hiber_info, sizeof(blank_hiber_info));
861 
862 	if (get_hibernate_info(&hiber_info, 0))
863 		return (1);
864 
865 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
866 	if (!io_page)
867 		return (1);
868 
869 	/* Write (zeroed) hibernate info to disk */
870 	/* XXX - use regular kernel write routine for this */
871 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
872 	    (vaddr_t)&blank_hiber_info, hiber_info.secsize, 1, io_page))
873 		panic("error hibernate write 6\n");
874 
875 	free(io_page, M_DEVBUF);
876 
877 	return (0);
878 }
879 
880 /*
881  * Check chunk range overlap when calculating whether or not to copy a
882  * compressed chunk to the piglet area before decompressing.
883  *
884  * returns zero if the ranges do not overlap, non-zero otherwise.
885  */
886 int
887 hibernate_check_overlap(paddr_t r1s, paddr_t r1e, paddr_t r2s, paddr_t r2e)
888 {
889 	/* case A : end of r1 overlaps start of r2 */
890 	if (r1s < r2s && r1e > r2s)
891 		return (1);
892 
893 	/* case B : r1 entirely inside r2 */
894 	if (r1s >= r2s && r1e <= r2e)
895 		return (1);
896 
897 	/* case C : r2 entirely inside r1 */
898 	if (r2s >= r1s && r2e <= r1e)
899 		return (1);
900 
901 	/* case D : end of r2 overlaps start of r1 */
902 	if (r2s < r1s && r2e > r1s)
903 		return (1);
904 
905 	return (0);
906 }
907 
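/*
 * Illustrative only: expected results of the overlap check for a few
 * made-up page-aligned ranges (the values are hypothetical).
 */
#if 0
static void
hibernate_overlap_examples(void)
{
	KASSERT(hibernate_check_overlap(0x1000, 0x3000, 0x2000, 0x4000) != 0);
	KASSERT(hibernate_check_overlap(0x2000, 0x3000, 0x1000, 0x4000) != 0);
	KASSERT(hibernate_check_overlap(0x1000, 0x4000, 0x2000, 0x3000) != 0);
	KASSERT(hibernate_check_overlap(0x2000, 0x4000, 0x1000, 0x3000) != 0);
	/* ranges that merely touch do not overlap */
	KASSERT(hibernate_check_overlap(0x1000, 0x2000, 0x2000, 0x3000) == 0);
}
#endif
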
908 /*
909  * Compare two hibernate_infos to determine if they are the same (eg,
910  * we should be performing a hibernate resume on this machine).
911  * Not all fields are checked - just enough to verify that the machine
912  * has the same memory configuration and kernel as the one that
913  * wrote the signature previously.
914  */
915 int
916 hibernate_compare_signature(union hibernate_info *mine,
917     union hibernate_info *disk)
918 {
919 	u_int i;
920 
921 	if (mine->nranges != disk->nranges)
922 		return (1);
923 
924 	if (strcmp(mine->kernel_version, disk->kernel_version) != 0)
925 		return (1);
926 
927 	for (i = 0; i < mine->nranges; i++) {
928 		if ((mine->ranges[i].base != disk->ranges[i].base) ||
929 		    (mine->ranges[i].end != disk->ranges[i].end) )
930 			return (1);
931 	}
932 
933 	return (0);
934 }
935 
936 /*
937  * Reads read_size bytes from the hibernate device specified in
938  * hib_info at offset blkctr. Output is placed into the vaddr specified
939  * at dest.
940  *
941  * Separate offsets and pages are used to handle misaligned reads (reads
942  * that span a page boundary).
943  *
944  * blkctr specifies a relative offset (relative to the start of swap),
945  * not an absolute disk offset
946  *
947  */
948 int
949 hibernate_read_block(union hibernate_info *hib_info, daddr_t blkctr,
950     size_t read_size, vaddr_t dest)
951 {
952 	struct buf *bp;
953 	struct bdevsw *bdsw;
954 	int error;
955 
956 	bp = geteblk(read_size);
957 	bdsw = &bdevsw[major(hib_info->device)];
958 
959 	error = (*bdsw->d_open)(hib_info->device, FREAD, S_IFCHR, curproc);
960 	if (error) {
961 		printf("hibernate_read_block open failed\n");
962 		return (1);
963 	}
964 
965 	bp->b_bcount = read_size;
966 	bp->b_blkno = blkctr;
967 	CLR(bp->b_flags, B_READ | B_WRITE | B_DONE);
968 	SET(bp->b_flags, B_BUSY | B_READ | B_RAW);
969 	bp->b_dev = hib_info->device;
970 	bp->b_cylinder = 0;
971 	(*bdsw->d_strategy)(bp);
972 
973 	error = biowait(bp);
974 	if (error) {
975 		printf("hibernate_read_block biowait failed %d\n", error);
976 		error = (*bdsw->d_close)(hib_info->device, 0, S_IFCHR,
977 		    curproc);
978 		if (error)
979 			printf("hibernate_read_block error close failed\n");
980 		return (1);
981 	}
982 
983 	error = (*bdsw->d_close)(hib_info->device, FREAD, S_IFCHR, curproc);
984 	if (error) {
985 		printf("hibernate_read_block close failed\n");
986 		return (1);
987 	}
988 
989 	bcopy(bp->b_data, (caddr_t)dest, read_size);
990 
991 	bp->b_flags |= B_INVAL;
992 	brelse(bp);
993 
994 	return (0);
995 }
996 
997 /*
998  * Reads the signature block from swap, checks against the current machine's
999  * information. If the information matches, perform a resume by reading the
1000  * saved image into the pig area, and unpacking.
1001  */
1002 void
1003 hibernate_resume(void)
1004 {
1005 	union hibernate_info hiber_info;
1006 	u_int8_t *io_page;
1007 	int s;
1008 
1009 	/* Scrub temporary vaddr ranges used during resume */
1010 	hibernate_temp_page = (vaddr_t)NULL;
1011 	hibernate_fchunk_area = (vaddr_t)NULL;
1012 	hibernate_chunktable_area = (vaddr_t)NULL;
1013 	hibernate_stack_page = (vaddr_t)NULL;
1014 
1015 	/* Get current running machine's hibernate info */
1016 	bzero(&hiber_info, sizeof(hiber_info));
1017 	if (get_hibernate_info(&hiber_info, 0))
1018 		return;
1019 
1020 	io_page = malloc(PAGE_SIZE, M_DEVBUF, M_NOWAIT);
1021 	if (!io_page)
1022 		return;
1023 
1024 	/* Read hibernate info from disk */
1025 	s = splbio();
1026 
1027 	/* XXX use regular kernel read routine here */
1028 	if (hiber_info.io_func(hiber_info.device, hiber_info.sig_offset,
1029 	    (vaddr_t)&disk_hiber_info, hiber_info.secsize, 0, io_page))
1030 		panic("error in hibernate read\n");
1031 
1032 	free(io_page, M_DEVBUF);
1033 
1034 	/*
1035 	 * If on-disk and in-memory hibernate signatures match,
1036 	 * this means we should do a resume from hibernate.
1037 	 */
1038 	if (hibernate_compare_signature(&hiber_info, &disk_hiber_info))
1039 		return;
1040 
1041 	/*
1042 	 * Allocate several regions of vaddrs for use during read.
1043 	 * These mappings go into the resuming kernel's page table, and are
1044 	 * used only during image read.
1045 	 */
1046 	hibernate_temp_page = (vaddr_t)km_alloc(2*PAGE_SIZE, &kv_any,
1047 	    &kp_none, &kd_nowait);
1048 	if (!hibernate_temp_page)
1049 		goto fail;
1050 
1051 	hibernate_fchunk_area = (vaddr_t)km_alloc(3*PAGE_SIZE, &kv_any,
1052 	    &kp_none, &kd_nowait);
1053 	if (!hibernate_fchunk_area)
1054 		goto fail;
1055 
1056 	/* Allocate a temporary chunktable area */
1057 	hibernate_chunktable_area = (vaddr_t)malloc(HIBERNATE_CHUNK_TABLE_SIZE,
1058 					   M_DEVBUF, M_NOWAIT);
1059 	if (!hibernate_chunktable_area)
1060 		goto fail;
1061 
1062 	/* Allocate one temporary page of VAs for the resume time stack */
1063 	hibernate_stack_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1064 	    &kp_none, &kd_nowait);
1065 	if (!hibernate_stack_page)
1066 		goto fail;
1067 
1068 	/* Read the image from disk into the image (pig) area */
1069 	if (hibernate_read_image(&disk_hiber_info))
1070 		goto fail;
1071 
1072 	/* Point of no return ... */
1073 
1074 	disable_intr();
1075 	cold = 1;
1076 
1077 	/* Switch stacks */
1078 	hibernate_switch_stack_machdep();
1079 
1080 	/*
1081 	 * Image is now in high memory (pig area), copy to correct location
1082 	 * in memory. We'll eventually end up copying on top of ourself, but
1083 	 * we are assured the kernel code here is the same between the
1084 	 * hibernated and resuming kernel, and we are running on our own
1085 	 * stack, so the overwrite is ok.
1086 	 */
1087 	hibernate_unpack_image(&disk_hiber_info);
1088 
1089 	/*
1090 	 * Resume the loaded kernel by jumping to the MD resume vector.
1091 	 * We won't be returning from this call.
1092 	 */
1093 	hibernate_resume_machdep();
1094 
1095 fail:
1096 	printf("Unable to resume hibernated image\n");
1097 
1098 	if (hibernate_temp_page)
1099 		km_free((void *)hibernate_temp_page, 2*PAGE_SIZE, &kv_any,
1100 		    &kp_none);
1101 
1102 	if (hibernate_fchunk_area)
1103 		km_free((void *)hibernate_fchunk_area, 3*PAGE_SIZE, &kv_any,
1104 		    &kp_none);
1105 
1108 
1109 	if (hibernate_chunktable_area)
1110 		free((void *)hibernate_chunktable_area, M_DEVBUF);
1111 }
1112 
1113 /*
1114  * Unpack image from pig area to original location by looping through the
1115  * list of output chunks in the order they should be restored (fchunks).
1116  * This ordering is used to avoid having inflate overwrite a chunk in the
1117  * middle of processing that chunk. This will, of course, happen during the
1118  * final output chunk, where we copy the chunk to the piglet area first,
1119  * before inflating.
1120  */
1121 void
1122 hibernate_unpack_image(union hibernate_info *hiber_info)
1123 {
1124 	struct hibernate_disk_chunk *chunks;
1125 	union hibernate_info local_hiber_info;
1126 	paddr_t image_cur = global_pig_start;
1127 	vaddr_t tempva;
1128 	int *fchunks, i;
1129 	char *pva = (char *)hiber_info->piglet_va;
1130 
1131 	/* Mask off based on arch-specific piglet page size */
1132 	pva = (char *)((paddr_t)pva & (PIGLET_PAGE_MASK));
1133 	fchunks = (int *)(pva + (6 * PAGE_SIZE));
1134 
1135 	/* Copy temporary chunktable to piglet */
1136 	tempva = (vaddr_t)km_alloc(HIBERNATE_CHUNK_TABLE_SIZE, &kv_any,
1137 	    &kp_none, &kd_nowait);
1138 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE; i += PAGE_SIZE)
1139 		pmap_kenter_pa(tempva + i, hiber_info->piglet_pa +
1140 		    HIBERNATE_CHUNK_SIZE + i, VM_PROT_ALL);
1141 
1142 	bcopy((caddr_t)hibernate_chunktable_area, (caddr_t)tempva,
1143 	    HIBERNATE_CHUNK_TABLE_SIZE);
1144 
1145 	chunks = (struct hibernate_disk_chunk *)(pva +  HIBERNATE_CHUNK_SIZE);
1146 
1147 	/* Can't use hiber_info that's passed in after here */
1148 	bcopy(hiber_info, &local_hiber_info, sizeof(union hibernate_info));
1149 
1150 	hibernate_activate_resume_pt_machdep();
1151 
1152 	for (i = 0; i < local_hiber_info.chunk_ctr; i++) {
1153 		/* Reset zlib for inflate */
1154 		if (hibernate_zlib_reset(&local_hiber_info, 0) != Z_OK)
1155 			panic("hibernate failed to reset zlib for inflate\n");
1156 
1157 		/*
1158 		 * If there is a conflict, copy the chunk to the piglet area
1159 		 * before unpacking it to its original location.
1160 		 */
1161 		if ((chunks[fchunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) == 0)
1162 			hibernate_inflate(&local_hiber_info,
1163 			    chunks[fchunks[i]].base, image_cur,
1164 			    chunks[fchunks[i]].compressed_size);
1165 		else {
1166 			bcopy((caddr_t)image_cur,
1167 			    pva + (HIBERNATE_CHUNK_SIZE * 2),
1168 			    chunks[fchunks[i]].compressed_size);
1169 			hibernate_inflate(&local_hiber_info,
1170 			    chunks[fchunks[i]].base,
1171 			    (vaddr_t)(pva + (HIBERNATE_CHUNK_SIZE * 2)),
1172 			    chunks[fchunks[i]].compressed_size);
1173 		}
1174 		image_cur += chunks[fchunks[i]].compressed_size;
1175 	}
1176 }
1177 
1178 /*
1179  * Write a compressed version of this machine's memory to disk, at the
1180  * precalculated swap offset:
1181  *
1182  * end of swap - signature block size - chunk table size - memory size
1183  *
1184  * The function begins by looping through each phys mem range, cutting each
1185  * one into 4MB chunks. These chunks are then compressed individually
1186  * and written out to disk, in phys mem order. Some chunks might compress
1187  * more than others, and for this reason, each chunk's size is recorded
1188  * in the chunk table, which is written to disk after the image has
1189  * properly been compressed and written (in hibernate_write_chunktable).
1190  *
1191  * When this function is called, the machine is nearly suspended - most
1192  * devices are quiesced/suspended, interrupts are off, and cold has
1193  * been set. This means that there can be no side effects once the
1194  * write has started, and the write function itself can also have no
1195  * side effects.
1196  *
1197  * This function uses the piglet area during this process as follows:
1198  *
1199  * offset from piglet base	use
1200  * -----------------------	--------------------
1201  * 0				i/o allocation area
1202  * PAGE_SIZE			i/o write area
1203  * 2*PAGE_SIZE			temp/scratch page
1204  * 3*PAGE_SIZE			temp/scratch page
1205  * 7*PAGE_SIZE			hiballoc arena
1206  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1207  * ...
1208  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1209  *
1210  * Some transient piglet content is saved as part of deflate,
1211  * but it is irrelevant during resume as it will be repurposed
1212  * at that time for other things.
1213  */
1214 int
1215 hibernate_write_chunks(union hibernate_info *hiber_info)
1216 {
1217 	paddr_t range_base, range_end, inaddr, temp_inaddr;
1218 	size_t nblocks, out_remaining, used, offset = 0;
1219 	struct hibernate_disk_chunk *chunks;
1220 	vaddr_t hibernate_alloc_page = hiber_info->piglet_va;
1221 	vaddr_t hibernate_io_page = hiber_info->piglet_va + PAGE_SIZE;
1222 	daddr_t blkctr = hiber_info->image_offset;
1223 	int i;
1224 
1225 	hiber_info->chunk_ctr = 0;
1226 
1227 	/*
1228 	 * Allocate VA for the temp and copy page.
1229 	 */
1230 
1231 	hibernate_temp_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1232 	    &kp_none, &kd_nowait);
1233 	if (!hibernate_temp_page)
1234 		return (1);
1235 
1236 	hibernate_copy_page = (vaddr_t)km_alloc(PAGE_SIZE, &kv_any,
1237 	    &kp_none, &kd_nowait);
1238 	if (!hibernate_copy_page)
1239 		return (1);
1240 
1241 	pmap_kenter_pa(hibernate_copy_page,
1242 	    (hiber_info->piglet_pa + 3*PAGE_SIZE), VM_PROT_ALL);
1243 
1244 	/* XXX - not needed on all archs */
1245 	pmap_activate(curproc);
1246 
1247 	chunks = (struct hibernate_disk_chunk *)(hiber_info->piglet_va +
1248 	    HIBERNATE_CHUNK_SIZE);
1249 
1250 	/* Calculate the chunk regions */
1251 	for (i = 0; i < hiber_info->nranges; i++) {
1252 		range_base = hiber_info->ranges[i].base;
1253 		range_end = hiber_info->ranges[i].end;
1254 
1255 		inaddr = range_base;
1256 
1257 		while (inaddr < range_end) {
1258 			chunks[hiber_info->chunk_ctr].base = inaddr;
1259 			if (inaddr + HIBERNATE_CHUNK_SIZE < range_end)
1260 				chunks[hiber_info->chunk_ctr].end = inaddr +
1261 				    HIBERNATE_CHUNK_SIZE;
1262 			else
1263 				chunks[hiber_info->chunk_ctr].end = range_end;
1264 
1265 			inaddr += HIBERNATE_CHUNK_SIZE;
1266 			hiber_info->chunk_ctr ++;
1267 		}
1268 	}
1269 
1270 	/* Compress and write the chunks in the chunktable */
1271 	for (i = 0; i < hiber_info->chunk_ctr; i++) {
1272 		range_base = chunks[i].base;
1273 		range_end = chunks[i].end;
1274 
1275 		chunks[i].offset = blkctr;
1276 
1277 		/* Reset zlib for deflate */
1278 		if (hibernate_zlib_reset(hiber_info, 1) != Z_OK)
1279 			return (1);
1280 
1281 		inaddr = range_base;
1282 
1283 		/*
1284 		 * For each range, loop through its phys mem region
1285 		 * and write out the chunks (the last chunk might be
1286 		 * smaller than the chunk size).
1287 		 */
1288 		while (inaddr < range_end) {
1289 			out_remaining = PAGE_SIZE;
1290 			while (out_remaining > 0 && inaddr < range_end) {
1291 				pmap_kenter_pa(hibernate_temp_page,
1292 				    inaddr & PMAP_PA_MASK, VM_PROT_ALL);
1293 
1294 				/* XXX - not needed on all archs */
1295 				pmap_activate(curproc);
1296 
1297 				bcopy((caddr_t)hibernate_temp_page,
1298 				    (caddr_t)hibernate_copy_page, PAGE_SIZE);
1299 
1300 				/*
1301 				 * Adjust for regions that are not evenly
1302 				 * divisible by PAGE_SIZE
1303 				 */
1304 				temp_inaddr = (inaddr & PAGE_MASK) +
1305 				    hibernate_copy_page;
1306 
1307 				/* Deflate from temp_inaddr to IO page */
1308 				inaddr += hibernate_deflate(hiber_info,
1309 				    temp_inaddr, &out_remaining);
1310 			}
1311 
1312 			if (out_remaining == 0) {
1313 				/* Filled up the page */
1314 				nblocks = PAGE_SIZE / hiber_info->secsize;
1315 
1316 				if (hiber_info->io_func(hiber_info->device,
1317 				    blkctr, (vaddr_t)hibernate_io_page,
1318 				    PAGE_SIZE, 1, (void *)hibernate_alloc_page))
1319 					return (1);
1320 
1321 				blkctr += nblocks;
1322 			}
1323 		}
1324 
1325 		if (inaddr != range_end)
1326 			return (1);
1327 
1328 		/*
1329 		 * End of range. Round up to next secsize bytes
1330 		 * after finishing compress
1331 		 */
1332 		if (out_remaining == 0)
1333 			out_remaining = PAGE_SIZE;
1334 
1335 		/* Finish compress */
1336 		hibernate_state->hib_stream.avail_in = 0;
1337 		hibernate_state->hib_stream.avail_out = out_remaining;
1338 		hibernate_state->hib_stream.next_in = (caddr_t)inaddr;
1339 		hibernate_state->hib_stream.next_out =
1340 		    (caddr_t)hibernate_io_page + (PAGE_SIZE - out_remaining);
1341 
1342 		if (deflate(&hibernate_state->hib_stream, Z_FINISH) !=
1343 		    Z_STREAM_END)
1344 			return (1);
1345 
1346 		out_remaining = hibernate_state->hib_stream.avail_out;
1347 
1348 		used = PAGE_SIZE - out_remaining;
1349 		nblocks = used / hiber_info->secsize;
1350 
1351 		/* Round up to next block if needed */
1352 		if (used % hiber_info->secsize != 0)
1353 			nblocks ++;
1354 
1355 		/* Write final block(s) for this chunk */
1356 		if (hiber_info->io_func(hiber_info->device, blkctr,
1357 		    (vaddr_t)hibernate_io_page, nblocks*hiber_info->secsize,
1358 		    1, (void *)hibernate_alloc_page))
1359 			return (1);
1360 
1361 		blkctr += nblocks;
1362 
1363 		offset = blkctr;
1364 		chunks[i].compressed_size = (offset - chunks[i].offset) *
1365 		    hiber_info->secsize;
1366 	}
1367 
1368 	return (0);
1369 }
1370 
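/*
 * Worked example of the chunk split above (illustrative, assuming a
 * hypothetical HIBERNATE_CHUNK_SIZE of 4 MB): a physical range
 * [0x10000000, 0x10900000) of 9 MB is cut into three chunks,
 *
 *	[0x10000000, 0x10400000)	4 MB
 *	[0x10400000, 0x10800000)	4 MB
 *	[0x10800000, 0x10900000)	1 MB (tail, smaller than the rest)
 *
 * each of which is deflated and written out separately, with its
 * compressed size recorded in the chunk table.
 */
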
1371 /*
1372  * Reset the zlib stream state and allocate a new hiballoc area for either
1373  * inflate or deflate. This function is called once for each hibernate chunk.
1374  * Calling hiballoc_init multiple times is acceptable since the memory it is
1375  * provided is unmanaged memory (stolen). We use the memory provided to us
1376  * by the piglet allocated via the supplied hiber_info.
1377  */
1378 int
1379 hibernate_zlib_reset(union hibernate_info *hiber_info, int deflate)
1380 {
1381 	vaddr_t hibernate_zlib_start;
1382 	size_t hibernate_zlib_size;
1383 	char *pva = (char *)hiber_info->piglet_va;
1384 
1385 	hibernate_state = (struct hibernate_zlib_state *)
1386 	    (pva + (7 * PAGE_SIZE));
1387 
1388 	hibernate_zlib_start = (vaddr_t)(pva + (8 * PAGE_SIZE));
1389 	hibernate_zlib_size = 80 * PAGE_SIZE;
1390 
1391 	bzero((caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1392 	bzero((caddr_t)hibernate_state, PAGE_SIZE);
1393 
1394 	/* Set up stream structure */
1395 	hibernate_state->hib_stream.zalloc = (alloc_func)hibernate_zlib_alloc;
1396 	hibernate_state->hib_stream.zfree = (free_func)hibernate_zlib_free;
1397 
1398 	/* Initialize the hiballoc arena for zlib allocs/frees */
1399 	hiballoc_init(&hibernate_state->hiballoc_arena,
1400 	    (caddr_t)hibernate_zlib_start, hibernate_zlib_size);
1401 
1402 	if (deflate) {
1403 		return deflateInit(&hibernate_state->hib_stream,
1404 		    Z_DEFAULT_COMPRESSION);
1405 	} else
1406 		return inflateInit(&hibernate_state->hib_stream);
1407 }
1408 
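/*
 * Illustrative layout recap for the function above (assuming a
 * hypothetical 4 KB page size): hibernate_zlib_reset() places the zlib
 * state at piglet offset 7*PAGE_SIZE = 0x7000 and hands the 80 pages at
 * offsets 0x8000..0x58000 to hiballoc_init(), so that the zalloc/zfree
 * callbacks issued by zlib while inflating or deflating each chunk are
 * serviced from piglet memory rather than from the regular kernel
 * allocators.
 */
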
1409 /*
1410  * Reads the hibernated memory image from disk, whose location and
1411  * size are recorded in hiber_info. Begin by reading the persisted
1412  * chunk table, which records the original chunk placement location
1413  * and compressed size for each. Next, allocate a pig region of
1414  * sufficient size to hold the compressed image. Next, read the
1415  * chunks into the pig area (calling hibernate_read_chunks to do this),
1416  * and finally, if all of the above succeeds, clear the hibernate signature.
1417  * The function will then return to hibernate_resume, which will proceed
1418  * to unpack the pig image to the correct place in memory.
1419  */
1420 int
1421 hibernate_read_image(union hibernate_info *hiber_info)
1422 {
1423 	size_t compressed_size, disk_size, chunktable_size, pig_sz;
1424 	paddr_t image_start, image_end, pig_start, pig_end;
1425 	struct hibernate_disk_chunk *chunks;
1426 	daddr_t blkctr;
1427 	int i;
1428 
1429 	/* Calculate total chunk table size in disk blocks */
1430 	chunktable_size = HIBERNATE_CHUNK_TABLE_SIZE / hiber_info->secsize;
1431 
1432 	blkctr = hiber_info->sig_offset - chunktable_size -
1433 			hiber_info->swap_offset;
1434 
1435 	for (i = 0; i < HIBERNATE_CHUNK_TABLE_SIZE;
1436 	    i += MAXPHYS, blkctr += MAXPHYS/hiber_info->secsize)
1437 		hibernate_read_block(hiber_info, blkctr, MAXPHYS,
1438 		    hibernate_chunktable_area + i);
1439 
1440 	blkctr = hiber_info->image_offset;
1441 	compressed_size = 0;
1442 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1443 
1444 	for (i = 0; i < hiber_info->chunk_ctr; i++)
1445 		compressed_size += chunks[i].compressed_size;
1446 
1447 	disk_size = compressed_size;
1448 
1449 	/* Allocate the pig area */
1450 	pig_sz = compressed_size + HIBERNATE_CHUNK_SIZE;
1451 	if (uvm_pmr_alloc_pig(&pig_start, pig_sz) == ENOMEM)
1452 		return (1);
1453 
1454 	pig_end = pig_start + pig_sz;
1455 
1456 	/* Calculate image extents. Pig image must end on a chunk boundary. */
1457 	image_end = pig_end & ~(HIBERNATE_CHUNK_SIZE - 1);
1460 	image_start = image_end - disk_size;
1461 
1462 	hibernate_read_chunks(hiber_info, image_start, image_end, disk_size);
1463 
1464 	/* Prepare the resume time pmap/page table */
1465 	hibernate_populate_resume_pt(hiber_info, image_start, image_end);
1466 
1467 	/* Read complete, clear the signature and return */
1468 	return hibernate_clear_signature();
1469 }
1470 
1471 /*
1472  * Read the hibernated memory chunks from disk (chunk information at this
1473  * point is stored in the piglet) into the pig area specified by
1474  * [pig_start .. pig_end]. Order the chunks so that the final chunk is the
1475  * only chunk with overlap possibilities.
1476  *
1477  * This function uses the piglet area during this process as follows:
1478  *
1479  * offset from piglet base	use
1480  * -----------------------	--------------------
1481  * 0				i/o allocation area
1482  * PAGE_SIZE			i/o write area
1483  * 2*PAGE_SIZE			temp/scratch page
1484  * 3*PAGE_SIZE			temp/scratch page
1485  * 4*PAGE_SIZE to 6*PAGE_SIZE	chunk ordering area
1486  * 7*PAGE_SIZE			hiballoc arena
1487  * 8*PAGE_SIZE to 88*PAGE_SIZE	zlib deflate area
1488  * ...
1489  * HIBERNATE_CHUNK_SIZE		chunk table temporary area
1490  */
1491 int
1492 hibernate_read_chunks(union hibernate_info *hib_info, paddr_t pig_start,
1493     paddr_t pig_end, size_t image_compr_size)
1494 {
1495 	paddr_t img_index, img_cur, r1s, r1e, r2s, r2e;
1496 	paddr_t copy_start, copy_end, piglet_cur;
1497 	paddr_t piglet_base = hib_info->piglet_pa;
1498 	paddr_t piglet_end = piglet_base + HIBERNATE_CHUNK_SIZE;
1499 	daddr_t blkctr;
1500 	size_t processed, compressed_size, read_size;
1501 	int i, j, overlap, found, nchunks;
1502 	int nochunks = 0, nfchunks = 0, npchunks = 0;
1503 	struct hibernate_disk_chunk *chunks;
1504 	int *ochunks, *pchunks, *fchunks;
1505 
1506 	global_pig_start = pig_start;
1507 
1508 	/* XXX - don't need this on all archs */
1509 	pmap_activate(curproc);
1510 
1511 	/* Temporary output chunk ordering */
1512 	ochunks = (int *)hibernate_fchunk_area;
1513 
1514 	/* Piglet chunk ordering */
1515 	pchunks = (int *)(hibernate_fchunk_area + PAGE_SIZE);
1516 
1517 	/* Final chunk ordering */
1518 	fchunks = (int *)(hibernate_fchunk_area + (2*PAGE_SIZE));
1519 
1520 	/* Map the chunk ordering region */
1521 	pmap_kenter_pa(hibernate_fchunk_area,
1522 	    piglet_base + (4*PAGE_SIZE), VM_PROT_ALL);
1523 	pmap_kenter_pa((vaddr_t)pchunks, piglet_base + (5*PAGE_SIZE),
1524 	    VM_PROT_ALL);
1525 	pmap_kenter_pa((vaddr_t)fchunks, piglet_base + (6*PAGE_SIZE),
1526 	    VM_PROT_ALL);
1527 
1528 	nchunks = hib_info->chunk_ctr;
1529 	chunks = (struct hibernate_disk_chunk *)hibernate_chunktable_area;
1530 
1531 	/* Initially start all chunks as unplaced */
1532 	for (i = 0; i < nchunks; i++)
1533 		chunks[i].flags = 0;
1534 
1535 	/*
1536 	 * Search the list for chunks that are outside the pig area. These
1537 	 * can be placed first in the final output list.
1538 	 */
1539 	for (i = 0; i < nchunks; i++) {
1540 		if (chunks[i].end <= pig_start || chunks[i].base >= pig_end) {
1541 			ochunks[nochunks] = (u_int8_t)i;
1542 			fchunks[nfchunks] = (u_int8_t)i;
1543 			nochunks++;
1544 			nfchunks++;
1545 			chunks[i].flags |= HIBERNATE_CHUNK_USED;
1546 		}
1547 	}
1548 
1549 	/*
1550 	 * Walk the ordering, place the chunks in ascending memory order.
1551 	 * Conflicts might arise; these are handled next.
1552 	 */
1553 	do {
1554 		img_index = -1;
1555 		found = 0;
1556 		j = -1;
1557 		for (i = 0; i < nchunks; i++)
1558 			if (chunks[i].base < img_index &&
1559 			    chunks[i].flags == 0 ) {
1560 				j = i;
1561 				img_index = chunks[i].base;
1562 			}
1563 
1564 		if (j != -1) {
1565 			found = 1;
1566 			ochunks[nochunks] = (short)j;
1567 			nochunks++;
1568 			chunks[j].flags |= HIBERNATE_CHUNK_PLACED;
1569 		}
1570 	} while (found);
1571 
1572 	img_index = pig_start;
1573 
1574 	/*
1575 	 * Identify chunk output conflicts (chunks whose pig load area
1576 	 * corresponds to their original memory placement location)
1577 	 */
1578 	for (i = 0; i < nochunks ; i++) {
1579 		overlap = 0;
1580 		r1s = img_index;
1581 		r1e = img_index + chunks[ochunks[i]].compressed_size;
1582 		r2s = chunks[ochunks[i]].base;
1583 		r2e = chunks[ochunks[i]].end;
1584 
1585 		overlap = hibernate_check_overlap(r1s, r1e, r2s, r2e);
1586 		if (overlap)
1587 			chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_CONFLICT;
1588 		img_index += chunks[ochunks[i]].compressed_size;
1589 	}
1590 
1591 	/*
1592 	 * Prepare the final output chunk list. Calculate an output
1593 	 * inflate strategy for overlapping chunks if needed.
1594 	 */
1595 	img_index = pig_start;
1596 	for (i = 0; i < nochunks ; i++) {
1597 		/*
1598 		 * If a conflict is detected, consume enough compressed
1599 		 * output chunks to fill the piglet
1600 		 */
1601 		if (chunks[ochunks[i]].flags & HIBERNATE_CHUNK_CONFLICT) {
1602 			copy_start = piglet_base;
1603 			copy_end = piglet_end;
1604 			piglet_cur = piglet_base;
1605 			npchunks = 0;
1606 			j = i;
1607 			while (copy_start < copy_end && j < nochunks) {
1608 				piglet_cur += chunks[ochunks[j]].compressed_size;
1609 				pchunks[npchunks] = ochunks[j];
1610 				npchunks++;
1611 				copy_start += chunks[ochunks[j]].compressed_size;
1612 				img_index += chunks[ochunks[j]].compressed_size;
1613 				i++;
1614 				j++;
1615 			}
1616 
1617 			piglet_cur = piglet_base;
1618 			for (j = 0; j < npchunks; j++) {
1619 				piglet_cur += chunks[pchunks[j]].compressed_size;
1620 				fchunks[nfchunks] = pchunks[j];
1621 				chunks[pchunks[j]].flags |= HIBERNATE_CHUNK_USED;
1622 				nfchunks++;
1623 			}
1624 		} else {
1625 			/*
1626 			 * No conflict, chunk can be added without copying
1627 			 */
1628 			if ((chunks[ochunks[i]].flags &
1629 			    HIBERNATE_CHUNK_USED) == 0) {
1630 				fchunks[nfchunks] = ochunks[i];
1631 				chunks[ochunks[i]].flags |= HIBERNATE_CHUNK_USED;
1632 				nfchunks++;
1633 			}
1634 			img_index += chunks[ochunks[i]].compressed_size;
1635 		}
1636 	}
1637 
1638 	img_index = pig_start;
1639 	for (i = 0; i < nfchunks; i++) {
1640 		piglet_cur = piglet_base;
1641 		img_index += chunks[fchunks[i]].compressed_size;
1642 	}
1643 
1644 	img_cur = pig_start;
1645 
1646 	for (i = 0; i < nfchunks; i++) {
1647 		blkctr = chunks[fchunks[i]].offset - hib_info->swap_offset;
1648 		processed = 0;
1649 		compressed_size = chunks[fchunks[i]].compressed_size;
1650 
1651 		while (processed < compressed_size) {
1652 			pmap_kenter_pa(hibernate_temp_page, img_cur,
1653 			    VM_PROT_ALL);
1654 			pmap_kenter_pa(hibernate_temp_page + PAGE_SIZE,
1655 			    img_cur+PAGE_SIZE, VM_PROT_ALL);
1656 
1657 			/* XXX - not needed on all archs */
1658 			pmap_activate(curproc);
1659 			if (compressed_size - processed >= PAGE_SIZE)
1660 				read_size = PAGE_SIZE;
1661 			else
1662 				read_size = compressed_size - processed;
1663 
1664 			hibernate_read_block(hib_info, blkctr, read_size,
1665 			    hibernate_temp_page + (img_cur & PAGE_MASK));
1666 
1667 			blkctr += (read_size / hib_info->secsize);
1668 
1669 			hibernate_flush();
1670 			pmap_kremove(hibernate_temp_page, PAGE_SIZE);
1671 			pmap_kremove(hibernate_temp_page + PAGE_SIZE,
1672 			    PAGE_SIZE);
1673 			processed += read_size;
1674 			img_cur += read_size;
1675 		}
1676 	}
1677 
1678 	return (0);
1679 }
1680 
1681 /*
1682  * Hibernating a machine comprises the following operations:
1683  *  1. Calculating this machine's hibernate_info information
1684  *  2. Allocating a piglet and saving the piglet's physaddr
1685  *  3. Calculating the memory chunks
1686  *  4. Writing the compressed chunks to disk
1687  *  5. Writing the chunk table
1688  *  6. Writing the signature block (hibernate_info)
1689  *
1690  * On most architectures, the function calling hibernate_suspend would
1691  * then power off the machine using some MD-specific implementation.
1692  */
1693 int
1694 hibernate_suspend(void)
1695 {
1696 	union hibernate_info hib_info;
1697 
1698 	/*
1699 	 * Calculate memory ranges, swap offsets, etc.
1700 	 * This also allocates a piglet whose physaddr is stored in
1701 	 * hib_info->piglet_pa and vaddr stored in hib_info->piglet_va
1702 	 */
1703 	if (get_hibernate_info(&hib_info, 1))
1704 		return (1);
1705 
1706 	/* XXX - Won't need to zero everything with RLE */
1707 	uvm_pmr_zero_everything();
1708 
1709 	if (hibernate_write_chunks(&hib_info))
1710 		return (1);
1711 
1712 	if (hibernate_write_chunktable(&hib_info))
1713 		return (1);
1714 
1715 	if (hibernate_write_signature(&hib_info))
1716 		return (1);
1717 
1718 	delay(100000);
1719 	return (0);
1720 }
1721