/*	$OpenBSD: hibernate_machdep.c,v 1.52 2024/06/19 13:27:26 jsg Exp $	*/

/*
 * Copyright (c) 2012 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/hibernate.h>
#include <sys/timeout.h>
#include <sys/malloc.h>
#include <sys/kcore.h>
#include <sys/atomic.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_pmemrange.h>

#include <machine/biosvar.h>
#include <machine/cpu.h>
#include <machine/hibernate.h>
#include <machine/pte.h>
#include <machine/pmap.h>

#ifdef MULTIPROCESSOR
#include <machine/mpbiosvar.h>
#endif /* MULTIPROCESSOR */

#include <dev/acpi/acpivar.h>

#include "acpi.h"
#include "wd.h"
#include "ahci.h"
#include "softraid.h"
#include "sd.h"
#include "nvme.h"
#include "sdmmc.h"
#include "ufshci.h"

/* Hibernate support */
void	hibernate_enter_resume_4k_pte(vaddr_t, paddr_t);
void	hibernate_enter_resume_2m_pde(vaddr_t, paddr_t);

extern	caddr_t start, end;
extern	int mem_cluster_cnt;
extern	phys_ram_seg_t mem_clusters[];
extern	bios_memmap_t *bios_memmap;

/*
 * amd64 MD Hibernate functions
 *
 * see amd64 hibernate.h for lowmem layout used during hibernate
 */

/*
 * Returns the hibernate write I/O function to use on this machine
 */
hibio_fn
get_hibernate_io_function(dev_t dev)
{
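	/*
	 * Resolve the device major back to its block driver name
	 * ("wd", "sd", ...).
	 */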
	char *blkname = findblkname(major(dev));

	if (blkname == NULL)
		return NULL;

#if NWD > 0
	if (strcmp(blkname, "wd") == 0) {
		extern int wd_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		return wd_hibernate_io;
	}
#endif
#if NSD > 0
	if (strcmp(blkname, "sd") == 0) {
		extern struct cfdriver sd_cd;
		extern int ahci_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int nvme_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int sr_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int sdmmc_scsi_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int ufshci_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		struct device *dv = disk_lookup(&sd_cd, DISKUNIT(dev));
		struct {
			const char *driver;
			hibio_fn io_func;
		} sd_io_funcs[] = {
#if NAHCI > 0
			{ "ahci", ahci_hibernate_io },
#endif
#if NNVME > 0
			{ "nvme", nvme_hibernate_io },
#endif
#if NSOFTRAID > 0
			{ "softraid", sr_hibernate_io },
#endif
#if NSDMMC > 0
			{ "sdmmc", sdmmc_scsi_hibernate_io },
#endif
#if NUFSHCI > 0
			{ "ufshci", ufshci_hibernate_io },
#endif
		};

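		/*
		 * sd(4) attaches at scsibus(4), which attaches at the
		 * controller (ahci, nvme, softraid, sdmmc, ufshci), so
		 * the controller driver name is found two levels up
		 * the device tree.
		 */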
		if (dv && dv->dv_parent && dv->dv_parent->dv_parent) {
			const char *driver = dv->dv_parent->dv_parent->dv_cfdata->
			    cf_driver->cd_name;
			int i;

			for (i = 0; i < nitems(sd_io_funcs); i++) {
				if (strcmp(driver, sd_io_funcs[i].driver) == 0)
					return sd_io_funcs[i].io_func;
			}
		}
	}
#endif /* NSD > 0 */
	return NULL;
}

/*
 * Gather MD-specific data and store into hiber_info
 */
int
get_hibernate_info_md(union hibernate_info *hiber_info)
{
	int i;
	bios_memmap_t *bmp;

	/* Calculate memory ranges */
	hiber_info->nranges = mem_cluster_cnt;
	hiber_info->image_size = 0;

	for (i = 0; i < mem_cluster_cnt; i++) {
		hiber_info->ranges[i].base = mem_clusters[i].start;
		hiber_info->ranges[i].end = mem_clusters[i].start +
		    mem_clusters[i].size;
		hiber_info->image_size += hiber_info->ranges[i].end -
		    hiber_info->ranges[i].base;
	}

#if NACPI > 0
	/* Record ACPI trampoline code page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = ACPI_TRAMPOLINE;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;

	/* Record ACPI trampoline data page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = ACPI_TRAMP_DATA;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;
#endif
#ifdef MULTIPROCESSOR
	/* Record MP trampoline code page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = MP_TRAMPOLINE;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;

	/* Record MP trampoline data page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = MP_TRAMP_DATA;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;
#endif

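	/*
	 * BIOS-reported ACPI NVS ranges must be preserved across
	 * hibernate as well; include them, trimmed inward to whole
	 * pages.
	 */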
	for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
		/* Skip non-NVS ranges (already processed) */
		if (bmp->type != BIOS_MAP_NVS)
			continue;
		if (hiber_info->nranges >= nitems(hiber_info->ranges))
			return (1);

		i = hiber_info->nranges;
		hiber_info->ranges[i].base = round_page(bmp->addr);
		hiber_info->ranges[i].end = trunc_page(bmp->addr + bmp->size);
		hiber_info->image_size += hiber_info->ranges[i].end -
		    hiber_info->ranges[i].base;
		hiber_info->nranges++;
	}

	hibernate_sort_ranges(hiber_info);

	return (0);
}

/*
 * Enter a mapping for va->pa in the resume pagetable, using
 * the specified size.
 *
 * size : 0 if a 4KB mapping is desired
 *        1 if a 2MB mapping is desired
 */
void
hibernate_enter_resume_mapping(vaddr_t va, paddr_t pa, int size)
{
	if (size)
		hibernate_enter_resume_2m_pde(va, pa);
	else
		hibernate_enter_resume_4k_pte(va, pa);
}

/*
 * Enter a 2MB PDE mapping for the supplied VA/PA into the resume-time pmap
 */
void
hibernate_enter_resume_2m_pde(vaddr_t va, paddr_t pa)
{
	pt_entry_t *pde, npde;

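	/*
	 * The PML4E and PDPTE covering the first 512GB/1GB of VA are
	 * preinstalled by hibernate_populate_resume_pt; for higher VAs
	 * the intermediate levels are entered here first.
	 */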
	if (va < NBPD_L4) {
		if (va < NBPD_L3) {
			/* First 512GB and 1GB are already mapped */
			pde = (pt_entry_t *)(HIBERNATE_PD_LOW +
			    (pl2_pi(va) * sizeof(pt_entry_t)));
			npde = (pa & PG_LGFRAME) |
			    PG_RW | PG_V | PG_M | PG_PS | PG_U;
			*pde = npde;
		} else {
			/* Map the 1GB containing region */
			pde = (pt_entry_t *)(HIBERNATE_PDPT_LOW +
			    (pl3_pi(va) * sizeof(pt_entry_t)));
			npde = (HIBERNATE_PD_LOW2) | PG_RW | PG_V;
			*pde = npde;

			/* Map 2MB page */
			pde = (pt_entry_t *)(HIBERNATE_PD_LOW2 +
			    (pl2_pi(va) * sizeof(pt_entry_t)));
			npde = (pa & PG_LGFRAME) |
			    PG_RW | PG_V | PG_M | PG_PS | PG_U;
			*pde = npde;
		}
	} else {
		/* First map the 512GB containing region */
		pde = (pt_entry_t *)(HIBERNATE_PML4T +
		    (pl4_pi(va) * sizeof(pt_entry_t)));
		npde = (HIBERNATE_PDPT_HI) | PG_RW | PG_V;
		*pde = npde;

		/* Map the 1GB containing region */
		pde = (pt_entry_t *)(HIBERNATE_PDPT_HI +
		    (pl3_pi(va) * sizeof(pt_entry_t)));
		npde = (HIBERNATE_PD_HI) | PG_RW | PG_V;
		*pde = npde;

		/* Map the 2MB page */
		pde = (pt_entry_t *)(HIBERNATE_PD_HI +
		    (pl2_pi(va) * sizeof(pt_entry_t)));
		npde = (pa & PG_LGFRAME) | PG_RW | PG_V | PG_PS;
		*pde = npde;
	}
}

/*
 * Enter a 4KB PTE mapping for the supplied VA/PA into the resume-time pmap.
 */
void
hibernate_enter_resume_4k_pte(vaddr_t va, paddr_t pa)
{
	pt_entry_t *pde, npde;

	/* Mappings entered here must be in the first 2MB VA */
	KASSERT(va < NBPD_L2);

	/* Map the page */
	pde = (pt_entry_t *)(HIBERNATE_PT_LOW +
	    (pl1_pi(va) * sizeof(pt_entry_t)));
	npde = (pa & PMAP_PA_MASK) | PG_RW | PG_V | PG_M | PG_U;
	*pde = npde;
}

/*
 * Create the resume-time page table. This table maps the image (pig) area,
 * the kernel text area, and various utility pages for use during resume,
 * since we cannot overwrite the resuming kernel's page table during inflate
 * and expect things to work properly.
 */
void
hibernate_populate_resume_pt(union hibernate_info *hib_info,
    paddr_t image_start, paddr_t image_end)
{
	int phys_page_number, i;
	paddr_t pa;
	vaddr_t kern_start_2m_va, kern_end_2m_va, page;
	vaddr_t piglet_start_va, piglet_end_va;
	pt_entry_t *pde, npde;

	/* Identity map MMU pages */
	pmap_kenter_pa(HIBERNATE_PML4T, HIBERNATE_PML4T, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PDPT_LOW, HIBERNATE_PDPT_LOW, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PDPT_HI, HIBERNATE_PDPT_HI, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PD_LOW, HIBERNATE_PD_LOW, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PD_LOW2, HIBERNATE_PD_LOW2, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PD_HI, HIBERNATE_PD_HI, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PT_LOW, HIBERNATE_PT_LOW, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PT_LOW2, HIBERNATE_PT_LOW2, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PT_HI, HIBERNATE_PT_HI, PROT_MASK);

	/* Identity map 3 pages for stack */
	pmap_kenter_pa(HIBERNATE_STACK_PAGE, HIBERNATE_STACK_PAGE, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_STACK_PAGE - PAGE_SIZE,
	    HIBERNATE_STACK_PAGE - PAGE_SIZE, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_STACK_PAGE - 2 * PAGE_SIZE,
	    HIBERNATE_STACK_PAGE - 2 * PAGE_SIZE, PROT_MASK);
	pmap_activate(curproc);

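	/* Start with empty page table pages and a zeroed stack. */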
	bzero((caddr_t)HIBERNATE_PML4T, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PDPT_LOW, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PDPT_HI, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PD_LOW, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PD_LOW2, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PD_HI, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PT_LOW, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PT_LOW2, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PT_HI, PAGE_SIZE);
	bzero((caddr_t)(HIBERNATE_STACK_PAGE - 3 * PAGE_SIZE), 3 * PAGE_SIZE);

	/* First 512GB PML4E */
	pde = (pt_entry_t *)(HIBERNATE_PML4T +
	    (pl4_pi(0) * sizeof(pt_entry_t)));
	npde = (HIBERNATE_PDPT_LOW) | PG_RW | PG_V;
	*pde = npde;

	/* First 1GB PDPTE */
	pde = (pt_entry_t *)(HIBERNATE_PDPT_LOW +
	    (pl3_pi(0) * sizeof(pt_entry_t)));
	npde = (HIBERNATE_PD_LOW) | PG_RW | PG_V;
	*pde = npde;

	/* PD for first 2MB */
	pde = (pt_entry_t *)(HIBERNATE_PD_LOW +
	    (pl2_pi(0) * sizeof(pt_entry_t)));
	npde = (HIBERNATE_PT_LOW) | PG_RW | PG_V;
	*pde = npde;

	/*
	 * Identity map low physical pages.
	 * See arch/amd64/include/hibernate_var.h for page ranges used here.
	 */
	for (i = ACPI_TRAMPOLINE; i <= HIBERNATE_HIBALLOC_PAGE; i += PAGE_SIZE)
		hibernate_enter_resume_mapping(i, i, 0);

	/*
	 * Map current kernel VA range using 2MB pages
	 */
	kern_start_2m_va = (vaddr_t)&start & ~(PAGE_MASK_L2);
	kern_end_2m_va = (vaddr_t)&end & ~(PAGE_MASK_L2);

	/* amd64 kernels load at 16MB phys (on the 8th 2MB page) */
	phys_page_number = 8;

	for (page = kern_start_2m_va; page <= kern_end_2m_va;
	    page += NBPD_L2, phys_page_number++) {
		pa = (paddr_t)(phys_page_number * NBPD_L2);
		hibernate_enter_resume_mapping(page, pa, 1);
	}

	/*
	 * Identity map the piglet using 2MB pages.
	 */
	phys_page_number = hib_info->piglet_pa / NBPD_L2;

	/* VA == PA */
	piglet_start_va = hib_info->piglet_pa;
	piglet_end_va = piglet_start_va + HIBERNATE_CHUNK_SIZE * 4;

	for (page = piglet_start_va; page <= piglet_end_va;
	    page += NBPD_L2, phys_page_number++) {
		pa = (paddr_t)(phys_page_number * NBPD_L2);
		hibernate_enter_resume_mapping(page, pa, 1);
	}

	/* Unmap MMU pages (stack remains mapped) */
	pmap_kremove(HIBERNATE_PML4T, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PDPT_LOW, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PDPT_HI, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PD_LOW, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PD_LOW2, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PD_HI, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PT_LOW, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PT_LOW2, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PT_HI, PAGE_SIZE);

	pmap_activate(curproc);
}

/*
 * During inflate, certain pages that contain our bookkeeping information
 * (eg, the chunk table, scratch pages, retguard region, etc) need to be
 * skipped over and not inflated into.
 *
 * Return values:
 *  HIB_MOVE: if the physical page at dest should be moved to the retguard save
 *    region in the piglet
 *  HIB_SKIP: if the physical page at dest should be skipped
 *  0: otherwise (no special treatment needed)
 */
int
hibernate_inflate_skip(union hibernate_info *hib_info, paddr_t dest)
{
	extern paddr_t retguard_start_phys, retguard_end_phys;

	if (dest >= hib_info->piglet_pa &&
	    dest <= (hib_info->piglet_pa + 4 * HIBERNATE_CHUNK_SIZE))
		return (HIB_SKIP);

	if (dest >= retguard_start_phys && dest <= retguard_end_phys)
		return (HIB_MOVE);

	return (0);
}

void
hibernate_enable_intr_machdep(void)
{
	intr_enable();
}

void
hibernate_disable_intr_machdep(void)
{
	intr_disable();
}

#ifdef MULTIPROCESSOR
/*
 * Quiesce CPUs in a multiprocessor machine before resuming. We need to do
 * this since the APs will be hatched (but waiting for CPUF_GO), and we don't
 * want the APs to be executing code and causing side effects during the
 * unpack operation.
 */
void
hibernate_quiesce_cpus(void)
{
	struct cpu_info *ci;
	u_long i;

	KASSERT(CPU_IS_PRIMARY(curcpu()));

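	/* Map the low-memory ACPI trampoline pages used while the APs park. */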
	pmap_kenter_pa(ACPI_TRAMPOLINE, ACPI_TRAMPOLINE, PROT_READ | PROT_EXEC);
	pmap_kenter_pa(ACPI_TRAMP_DATA, ACPI_TRAMP_DATA,
	    PROT_READ | PROT_WRITE);

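	/* The parked-AP path is not IBT-safe; disable CET if IBT is on. */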
	if (curcpu()->ci_feature_sefflags_edx & SEFF0EDX_IBT)
		lcr4(rcr4() & ~CR4_CET);

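	/*
	 * Release each hatched AP from its spin loop (CPUF_GO), with
	 * instructions to park itself (CPUF_PARK).
	 */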
	for (i = 0; i < MAXCPUS; i++) {
		ci = cpu_info[i];
		if (ci == NULL)
			continue;
		if (ci->ci_idle_pcb == NULL)
			continue;
		if ((ci->ci_flags & CPUF_PRESENT) == 0)
			continue;
		if (ci->ci_flags & (CPUF_BSP | CPUF_SP | CPUF_PRIMARY))
			continue;
		atomic_setbits_int(&ci->ci_flags, CPUF_GO | CPUF_PARK);
	}

	/* Wait a bit for the APs to park themselves */
	delay(500000);

	pmap_kremove(ACPI_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(ACPI_TRAMP_DATA, PAGE_SIZE);
}
#endif /* MULTIPROCESSOR */