/*	$OpenBSD: hibernate_machdep.c,v 1.52 2024/06/19 13:27:26 jsg Exp $	*/

/*
 * Copyright (c) 2012 Mike Larkin <mlarkin@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/buf.h>
#include <sys/conf.h>
#include <sys/device.h>
#include <sys/disk.h>
#include <sys/disklabel.h>
#include <sys/hibernate.h>
#include <sys/timeout.h>
#include <sys/malloc.h>
#include <sys/kcore.h>
#include <sys/atomic.h>

#include <uvm/uvm_extern.h>
#include <uvm/uvm_pmemrange.h>

#include <machine/biosvar.h>
#include <machine/cpu.h>
#include <machine/hibernate.h>
#include <machine/pte.h>
#include <machine/pmap.h>

#ifdef MULTIPROCESSOR
#include <machine/mpbiosvar.h>
#endif /* MULTIPROCESSOR */

#include <dev/acpi/acpivar.h>

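/* Driver-count headers generated by config(8); these define NWD, NSD, etc. */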
#include "acpi.h"
#include "wd.h"
#include "ahci.h"
#include "softraid.h"
#include "sd.h"
#include "nvme.h"
#include "sdmmc.h"
#include "ufshci.h"

/* Hibernate support */
void	hibernate_enter_resume_4k_pte(vaddr_t, paddr_t);
void	hibernate_enter_resume_2m_pde(vaddr_t, paddr_t);

extern	caddr_t start, end;
extern	int mem_cluster_cnt;
extern	phys_ram_seg_t mem_clusters[];
extern	bios_memmap_t *bios_memmap;

/*
 * amd64 MD Hibernate functions
 *
 * see amd64 hibernate.h for lowmem layout used during hibernate
 */

/*
 * Returns the hibernate write I/O function to use on this machine
 */
hibio_fn
get_hibernate_io_function(dev_t dev)
{
	char *blkname = findblkname(major(dev));

	if (blkname == NULL)
		return NULL;

#if NWD > 0
	if (strcmp(blkname, "wd") == 0) {
		extern int wd_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		return wd_hibernate_io;
	}
#endif
#if NSD > 0
	if (strcmp(blkname, "sd") == 0) {
		extern struct cfdriver sd_cd;
		extern int ahci_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int nvme_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int sr_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int sdmmc_scsi_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		extern int ufshci_hibernate_io(dev_t dev, daddr_t blkno,
		    vaddr_t addr, size_t size, int op, void *page);
		struct device *dv = disk_lookup(&sd_cd, DISKUNIT(dev));
		struct {
			const char *driver;
			hibio_fn io_func;
		} sd_io_funcs[] = {
#if NAHCI > 0
			{ "ahci", ahci_hibernate_io },
#endif
#if NNVME > 0
			{ "nvme", nvme_hibernate_io },
#endif
#if NSOFTRAID > 0
			{ "softraid", sr_hibernate_io },
#endif
#if NSDMMC > 0
			{ "sdmmc", sdmmc_scsi_hibernate_io },
#endif
#if NUFSHCI > 0
			{ "ufshci", ufshci_hibernate_io },
#endif
		};

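		/*
		 * An sd device hangs off scsibus, which in turn hangs off
		 * the controller (e.g. sd0 at scsibus0 at ahci0), so the
		 * grandparent's driver name identifies the controller and
		 * selects the matching polled hibernate I/O routine.
		 */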
		if (dv && dv->dv_parent && dv->dv_parent->dv_parent) {
			const char *driver = dv->dv_parent->dv_parent->dv_cfdata->
			    cf_driver->cd_name;
			int i;

			for (i = 0; i < nitems(sd_io_funcs); i++) {
				if (strcmp(driver, sd_io_funcs[i].driver) == 0)
					return sd_io_funcs[i].io_func;
			}
		}
	}
#endif /* NSD > 0 */
	return NULL;
}
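
/*
 * Usage sketch (hypothetical caller, for illustration only; the MI
 * hibernate code does the equivalent for the partition that holds the
 * image, where 'swap_dev' stands for that partition's dev_t and
 * 'io_page' is a dedicated scratch page for the driver):
 *
 *	hibio_fn io_func = get_hibernate_io_function(swap_dev);
 *	if (io_func == NULL)
 *		return (1);	no polled I/O path; cannot hibernate
 *	io_func(swap_dev, blkno, (vaddr_t)buf, size, HIB_W, io_page);
 */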

/*
 * Gather MD-specific data and store into hiber_info
 */
int
get_hibernate_info_md(union hibernate_info *hiber_info)
{
	int i;
	bios_memmap_t *bmp;

	/* Calculate memory ranges */
	hiber_info->nranges = mem_cluster_cnt;
	hiber_info->image_size = 0;

	for (i = 0; i < mem_cluster_cnt; i++) {
		hiber_info->ranges[i].base = mem_clusters[i].start;
		hiber_info->ranges[i].end = mem_clusters[i].size +
		    mem_clusters[i].start;
		hiber_info->image_size += hiber_info->ranges[i].end -
		    hiber_info->ranges[i].base;
	}

#if NACPI > 0
	/* Record ACPI trampoline code page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = ACPI_TRAMPOLINE;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;

	/* Record ACPI trampoline data page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = ACPI_TRAMP_DATA;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;
#endif
#ifdef MULTIPROCESSOR
	/* Record MP trampoline code page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = MP_TRAMPOLINE;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;

	/* Record MP trampoline data page */
	if (hiber_info->nranges >= nitems(hiber_info->ranges))
		return (1);
	hiber_info->ranges[hiber_info->nranges].base = MP_TRAMP_DATA;
	hiber_info->ranges[hiber_info->nranges].end =
	    hiber_info->ranges[hiber_info->nranges].base + PAGE_SIZE;
	hiber_info->image_size += PAGE_SIZE;
	hiber_info->nranges++;
#endif

	for (bmp = bios_memmap; bmp->type != BIOS_MAP_END; bmp++) {
		/* Skip non-NVS ranges (already processed) */
		if (bmp->type != BIOS_MAP_NVS)
			continue;
		if (hiber_info->nranges >= nitems(hiber_info->ranges))
			return (1);

		i = hiber_info->nranges;
		hiber_info->ranges[i].base = round_page(bmp->addr);
		hiber_info->ranges[i].end = trunc_page(bmp->addr + bmp->size);
		hiber_info->image_size += hiber_info->ranges[i].end -
		    hiber_info->ranges[i].base;
		hiber_info->nranges++;
	}

	hibernate_sort_ranges(hiber_info);

	return (0);
}

/*
 * Enter a mapping for va->pa in the resume pagetable, using
 * the specified size.
 *
 * size : 0 if a 4KB mapping is desired
 *        1 if a 2MB mapping is desired
 */
void
hibernate_enter_resume_mapping(vaddr_t va, paddr_t pa, int size)
{
	if (size)
		return hibernate_enter_resume_2m_pde(va, pa);
	else
		return hibernate_enter_resume_4k_pte(va, pa);
}

/*
 * Enter a 2MB PDE mapping for the supplied VA/PA into the resume-time pmap
 */
void
hibernate_enter_resume_2m_pde(vaddr_t va, paddr_t pa)
{
	pt_entry_t *pde, npde;

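	/*
	 * The resume page table lives in fixed low-memory pages
	 * (HIBERNATE_PML4T, HIBERNATE_PDPT_*, HIBERNATE_PD_*; see
	 * amd64/include/hibernate_var.h). They are identity mapped while
	 * the table is built (see hibernate_populate_resume_pt()), so
	 * entries can be written through their physical addresses.
	 */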
	if (va < NBPD_L4) {
		if (va < NBPD_L3) {
			/* First 512GB and 1GB are already mapped */
			pde = (pt_entry_t *)(HIBERNATE_PD_LOW +
			    (pl2_pi(va) * sizeof(pt_entry_t)));
			npde = (pa & PG_LGFRAME) |
			    PG_RW | PG_V | PG_M | PG_PS | PG_U;
			*pde = npde;
		} else {
			/* Map the 1GB containing region */
			pde = (pt_entry_t *)(HIBERNATE_PDPT_LOW +
			    (pl3_pi(va) * sizeof(pt_entry_t)));
			npde = (HIBERNATE_PD_LOW2) | PG_RW | PG_V;
			*pde = npde;

			/* Map 2MB page */
			pde = (pt_entry_t *)(HIBERNATE_PD_LOW2 +
			    (pl2_pi(va) * sizeof(pt_entry_t)));
			npde = (pa & PG_LGFRAME) |
			    PG_RW | PG_V | PG_M | PG_PS | PG_U;
			*pde = npde;
		}
	} else {
		/* First map the 512GB containing region */
		pde = (pt_entry_t *)(HIBERNATE_PML4T +
		    (pl4_pi(va) * sizeof(pt_entry_t)));
		npde = (HIBERNATE_PDPT_HI) | PG_RW | PG_V;
		*pde = npde;

		/* Map the 1GB containing region */
		pde = (pt_entry_t *)(HIBERNATE_PDPT_HI +
		    (pl3_pi(va) * sizeof(pt_entry_t)));
		npde = (HIBERNATE_PD_HI) | PG_RW | PG_V;
		*pde = npde;

		/* Map the 2MB page */
		pde = (pt_entry_t *)(HIBERNATE_PD_HI +
		    (pl2_pi(va) * sizeof(pt_entry_t)));
		npde = (pa & PG_LGFRAME) | PG_RW | PG_V | PG_PS;
		*pde = npde;
	}
}

/*
 * Enter a 4KB PTE mapping for the supplied VA/PA into the resume-time pmap.
 */
void
hibernate_enter_resume_4k_pte(vaddr_t va, paddr_t pa)
{
	pt_entry_t *pte, npte;

	/* Mappings entered here must be in the first 2MB VA */
	KASSERT(va < NBPD_L2);

	/* Map the page */
	pte = (pt_entry_t *)(HIBERNATE_PT_LOW +
	    (pl1_pi(va) * sizeof(pt_entry_t)));
	npte = (pa & PMAP_PA_MASK) | PG_RW | PG_V | PG_M | PG_U;
	*pte = npte;
}

/*
 * Create the resume-time page table. This table maps the image (pig) area,
 * the kernel text area, and various utility pages for use during resume,
 * since we cannot overwrite the resuming kernel's page table during inflate
 * and expect things to work properly.
 */
void
hibernate_populate_resume_pt(union hibernate_info *hib_info,
    paddr_t image_start, paddr_t image_end)
{
	int phys_page_number, i;
	paddr_t pa;
	vaddr_t kern_start_2m_va, kern_end_2m_va, page;
	vaddr_t piglet_start_va, piglet_end_va;
	pt_entry_t *pde, npde;

	/* Identity map MMU pages */
	pmap_kenter_pa(HIBERNATE_PML4T, HIBERNATE_PML4T, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PDPT_LOW, HIBERNATE_PDPT_LOW, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PDPT_HI, HIBERNATE_PDPT_HI, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PD_LOW, HIBERNATE_PD_LOW, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PD_LOW2, HIBERNATE_PD_LOW2, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PD_HI, HIBERNATE_PD_HI, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PT_LOW, HIBERNATE_PT_LOW, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PT_LOW2, HIBERNATE_PT_LOW2, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_PT_HI, HIBERNATE_PT_HI, PROT_MASK);

	/* Identity map 3 pages for stack */
	pmap_kenter_pa(HIBERNATE_STACK_PAGE, HIBERNATE_STACK_PAGE, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_STACK_PAGE - PAGE_SIZE,
	    HIBERNATE_STACK_PAGE - PAGE_SIZE, PROT_MASK);
	pmap_kenter_pa(HIBERNATE_STACK_PAGE - 2*PAGE_SIZE,
	    HIBERNATE_STACK_PAGE - 2*PAGE_SIZE, PROT_MASK);
	pmap_activate(curproc);

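	/* Start with empty page-table pages and a clean stack area. */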
	bzero((caddr_t)HIBERNATE_PML4T, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PDPT_LOW, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PDPT_HI, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PD_LOW, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PD_LOW2, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PD_HI, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PT_LOW, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PT_LOW2, PAGE_SIZE);
	bzero((caddr_t)HIBERNATE_PT_HI, PAGE_SIZE);
	bzero((caddr_t)(HIBERNATE_STACK_PAGE - 3*PAGE_SIZE), 3*PAGE_SIZE);

	/* First 512GB PML4E */
	pde = (pt_entry_t *)(HIBERNATE_PML4T +
	    (pl4_pi(0) * sizeof(pt_entry_t)));
	npde = (HIBERNATE_PDPT_LOW) | PG_RW | PG_V;
	*pde = npde;

	/* First 1GB PDPTE */
	pde = (pt_entry_t *)(HIBERNATE_PDPT_LOW +
	    (pl3_pi(0) * sizeof(pt_entry_t)));
	npde = (HIBERNATE_PD_LOW) | PG_RW | PG_V;
	*pde = npde;

	/* PD for first 2MB */
	pde = (pt_entry_t *)(HIBERNATE_PD_LOW +
	    (pl2_pi(0) * sizeof(pt_entry_t)));
	npde = (HIBERNATE_PT_LOW) | PG_RW | PG_V;
	*pde = npde;

	/*
	 * Identity map low physical pages.
	 * See arch/amd64/include/hibernate_var.h for page ranges used here.
	 */
	for (i = ACPI_TRAMPOLINE; i <= HIBERNATE_HIBALLOC_PAGE; i += PAGE_SIZE)
		hibernate_enter_resume_mapping(i, i, 0);

	/*
	 * Map current kernel VA range using 2MB pages
	 */
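	/*
	 * 'start' and 'end' are linker symbols bracketing the kernel
	 * image; both are rounded down to a 2MB boundary so whole large
	 * pages cover the kernel.
	 */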
	kern_start_2m_va = (vaddr_t)&start & ~(PAGE_MASK_L2);
	kern_end_2m_va = (vaddr_t)&end & ~(PAGE_MASK_L2);

	/* amd64 kernels load at 16MB phys (on the 8th 2MB page) */
	phys_page_number = 8;

	for (page = kern_start_2m_va; page <= kern_end_2m_va;
	    page += NBPD_L2, phys_page_number++) {
		pa = (paddr_t)(phys_page_number * NBPD_L2);
		hibernate_enter_resume_mapping(page, pa, 1);
	}

	/*
	 * Identity map the piglet using 2MB pages.
	 */
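	/*
	 * The MI code allocates the piglet HIBERNATE_CHUNK_SIZE-aligned,
	 * so it starts on a 2MB boundary and the division below is exact.
	 */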
	phys_page_number = hib_info->piglet_pa / NBPD_L2;

	/* VA == PA */
	piglet_start_va = hib_info->piglet_pa;
	piglet_end_va = piglet_start_va + HIBERNATE_CHUNK_SIZE * 4;

	for (page = piglet_start_va; page <= piglet_end_va;
	    page += NBPD_L2, phys_page_number++) {
		pa = (paddr_t)(phys_page_number * NBPD_L2);
		hibernate_enter_resume_mapping(page, pa, 1);
	}

	/* Unmap MMU pages (stack remains mapped) */
	pmap_kremove(HIBERNATE_PML4T, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PDPT_LOW, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PDPT_HI, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PD_LOW, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PD_LOW2, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PD_HI, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PT_LOW, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PT_LOW2, PAGE_SIZE);
	pmap_kremove(HIBERNATE_PT_HI, PAGE_SIZE);

	pmap_activate(curproc);
}

/*
 * During inflate, certain pages that contain our bookkeeping information
 * (e.g. the chunk table, scratch pages, retguard region) need to be
 * skipped over and not inflated into.
 *
 * Return values:
 * HIB_MOVE: if the physical page at dest should be moved to the retguard save
 *	region in the piglet
 * HIB_SKIP: if the physical page at dest should be skipped
 * 0: otherwise (no special treatment needed)
 */
int
hibernate_inflate_skip(union hibernate_info *hib_info, paddr_t dest)
{
	extern paddr_t retguard_start_phys, retguard_end_phys;

	if (dest >= hib_info->piglet_pa &&
	    dest <= (hib_info->piglet_pa + 4 * HIBERNATE_CHUNK_SIZE))
		return (HIB_SKIP);

	if (dest >= retguard_start_phys && dest <= retguard_end_phys)
		return (HIB_MOVE);

	return (0);
}

void
hibernate_enable_intr_machdep(void)
{
	intr_enable();
}

void
hibernate_disable_intr_machdep(void)
{
	intr_disable();
}

#ifdef MULTIPROCESSOR
/*
 * Quiesce CPUs in a multiprocessor machine before resuming. We need to do
 * this since the APs will be hatched (but waiting for CPUF_GO), and we don't
 * want the APs to be executing code and causing side effects during the
 * unpack operation.
 */
void
hibernate_quiesce_cpus(void)
{
	struct cpu_info *ci;
	u_long i;

	KASSERT(CPU_IS_PRIMARY(curcpu()));

	pmap_kenter_pa(ACPI_TRAMPOLINE, ACPI_TRAMPOLINE, PROT_READ | PROT_EXEC);
	pmap_kenter_pa(ACPI_TRAMP_DATA, ACPI_TRAMP_DATA,
	    PROT_READ | PROT_WRITE);

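	/*
	 * Turn off CET (IBT) before parking the APs; the AP parking and
	 * resume trampoline paths are not IBT-clean.
	 */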
	if (curcpu()->ci_feature_sefflags_edx & SEFF0EDX_IBT)
		lcr4(rcr4() & ~CR4_CET);

	for (i = 0; i < MAXCPUS; i++) {
		ci = cpu_info[i];
		if (ci == NULL)
			continue;
		if (ci->ci_idle_pcb == NULL)
			continue;
		if ((ci->ci_flags & CPUF_PRESENT) == 0)
			continue;
		if (ci->ci_flags & (CPUF_BSP | CPUF_SP | CPUF_PRIMARY))
			continue;
		atomic_setbits_int(&ci->ci_flags, CPUF_GO | CPUF_PARK);
	}

	/* Wait a bit for the APs to park themselves */
	delay(500000);

	pmap_kremove(ACPI_TRAMPOLINE, PAGE_SIZE);
	pmap_kremove(ACPI_TRAMP_DATA, PAGE_SIZE);
}
#endif /* MULTIPROCESSOR */