xref: /netbsd-src/sys/arch/amd64/stand/prekern/mm.c (revision dadf0eef45c0862a0008b5e5b75d17ad81495ef6)
/*	$NetBSD: mm.c,v 1.28 2021/05/04 21:09:16 khorben Exp $	*/

/*
 * Copyright (c) 2017-2020 The NetBSD Foundation, Inc. All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "prekern.h"

#define ELFROUND	64

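/*
 * Bytes used to fill the unused space around each mapped segment. Text is
 * padded with 0xCC (the x86 'int3' breakpoint opcode), so stray execution
 * into the padding traps; the other segment types are zero-filled.
 */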
static const uint8_t pads[4] = {
	[BTSEG_NONE] = 0x00,
	[BTSEG_TEXT] = 0xCC,
	[BTSEG_RODATA] = 0x00,
	[BTSEG_DATA] = 0x00
};

#define MM_PROT_READ	0x00
#define MM_PROT_WRITE	0x01
#define MM_PROT_EXECUTE	0x02

static const pt_entry_t protection_codes[3] = {
	[MM_PROT_READ] = PTE_NX,
	[MM_PROT_WRITE] = PTE_W | PTE_NX,
	[MM_PROT_EXECUTE] = 0,
	/* RWX does not exist */
};

struct bootspace bootspace;

extern paddr_t kernpa_start, kernpa_end;
vaddr_t iom_base;

paddr_t pa_avail = 0;
static const vaddr_t tmpva = (PREKERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);

void
mm_init(paddr_t first_pa)
{
	pa_avail = first_pa;
}

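/*
 * Map the physical page 'pa' at the virtual address 'va' with the given
 * protection. Fatal if a mapping is already present at 'va'.
 */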
static void
mm_enter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	if (PTE_BASE[pl1_i(va)] & PTE_P) {
		fatal("mm_enter_pa: mapping already present");
	}
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

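/*
 * Same as mm_enter_pa(), except that an existing mapping at 'va' is
 * silently overwritten.
 */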
static void
mm_reenter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

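/* Flush the TLB entry for 'va' on the local CPU. */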
static void
mm_flush_va(vaddr_t va)
{
	asm volatile("invlpg (%0)" ::"r" (va) : "memory");
}

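/*
 * Allocate 'npages' physically contiguous pages from the pa_avail bump
 * allocator, and zero them out through the temporary mapping at tmpva.
 */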
static paddr_t
mm_palloc(size_t npages)
{
	paddr_t pa;
	size_t i;

	/* Allocate the physical pages */
	pa = pa_avail;
	pa_avail += npages * PAGE_SIZE;

	/* Zero them out */
	for (i = 0; i < npages; i++) {
		mm_reenter_pa(pa + i * PAGE_SIZE, tmpva,
		    MM_PROT_READ|MM_PROT_WRITE);
		mm_flush_va(tmpva);
		memset((void *)tmpva, 0, PAGE_SIZE);
	}

	return pa;
}

static bool
mm_pte_is_valid(pt_entry_t pte)
{
	return ((pte & PTE_P) != 0);
}

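/*
 * Change the protection of an already-mapped, page-aligned VA range, by
 * re-entering each page with the new protection and flushing its TLB entry.
 */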
static void
mm_mprotect(vaddr_t startva, size_t size, pte_prot_t prot)
{
	size_t i, npages;
	vaddr_t va;
	paddr_t pa;

	ASSERT(size % PAGE_SIZE == 0);
	npages = size / PAGE_SIZE;

	for (i = 0; i < npages; i++) {
		va = startva + i * PAGE_SIZE;
		pa = (PTE_BASE[pl1_i(va)] & PTE_FRAME);
		mm_reenter_pa(pa, va, prot);
		mm_flush_va(va);
	}
}

void
mm_bootspace_mprotect(void)
{
	pte_prot_t prot;
	size_t i;

	/* Remap the kernel segments with proper permissions. */
	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_TEXT) {
			prot = MM_PROT_READ|MM_PROT_EXECUTE;
		} else if (bootspace.segs[i].type == BTSEG_RODATA) {
			prot = MM_PROT_READ;
		} else {
			continue;
		}
		mm_mprotect(bootspace.segs[i].va, bootspace.segs[i].sz, prot);
	}

	print_state(STATE_NORMAL, "Segments protection updated");
}

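/*
 * Number of page-tree entries of size 'pgsz' needed to cover the range
 * [startva, endva). For example, a 3MB range that is not 2MB-aligned can
 * span three NBPD_L2 (2MB) slots and therefore needs three L2 entries.
 */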
static size_t
mm_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
	size_t npages;

	npages = roundup((endva / PAGE_SIZE), (pgsz / PAGE_SIZE)) -
	    rounddown((startva / PAGE_SIZE), (pgsz / PAGE_SIZE));
	return (npages / (pgsz / PAGE_SIZE));
}

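/*
 * Build the L4/L3/L2 levels of the page tree covering [startva, endva),
 * allocating zeroed pages for the entries that are not present yet. The
 * leaf (L1) entries are filled in later, via mm_enter_pa().
 */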
static void
mm_map_tree(vaddr_t startva, vaddr_t endva)
{
	size_t i, nL4e, nL3e, nL2e;
	size_t L4e_idx, L3e_idx, L2e_idx;
	paddr_t pa;

	/* Build L4. */
	L4e_idx = pl4_i(startva);
	nL4e = mm_nentries_range(startva, endva, NBPD_L4);
	ASSERT(L4e_idx == 511);
	ASSERT(nL4e == 1);
	if (!mm_pte_is_valid(L4_BASE[L4e_idx])) {
		pa = mm_palloc(1);
		L4_BASE[L4e_idx] = pa | PTE_P | PTE_W;
	}

	/* Build L3. */
	L3e_idx = pl3_i(startva);
	nL3e = mm_nentries_range(startva, endva, NBPD_L3);
	for (i = 0; i < nL3e; i++) {
		if (mm_pte_is_valid(L3_BASE[L3e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L3_BASE[L3e_idx+i] = pa | PTE_P | PTE_W;
	}

	/* Build L2. */
	L2e_idx = pl2_i(startva);
	nL2e = mm_nentries_range(startva, endva, NBPD_L2);
	for (i = 0; i < nL2e; i++) {
		if (mm_pte_is_valid(L2_BASE[L2e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L2_BASE[L2e_idx+i] = pa | PTE_P | PTE_W;
	}
}

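/*
 * Pick a random, 'pagesz'-aligned VA of 'size' bytes inside the main KASLR
 * window, retrying until the range does not overlap any segment already
 * registered in bootspace, then build the page tree for it.
 */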
static vaddr_t
mm_randva_kregion(size_t size, size_t pagesz)
{
	vaddr_t sva, eva;
	vaddr_t randva;
	uint64_t rnd;
	size_t i;
	bool ok;

	while (1) {
		prng_get_rand(&rnd, sizeof(rnd));
		randva = rounddown(KASLR_WINDOW_BASE +
		    rnd % (KASLR_WINDOW_SIZE - size), pagesz);

		/* Detect collisions */
		ok = true;
		for (i = 0; i < BTSPACE_NSEGS; i++) {
			if (bootspace.segs[i].type == BTSEG_NONE) {
				continue;
			}
			sva = bootspace.segs[i].va;
			eva = sva + bootspace.segs[i].sz;

			if ((sva <= randva) && (randva < eva)) {
				ok = false;
				break;
			}
			if ((sva < randva + size) && (randva + size <= eva)) {
				ok = false;
				break;
			}
			if (randva < sva && eva < (randva + size)) {
				ok = false;
				break;
			}
		}
		if (ok) {
			break;
		}
	}

	mm_map_tree(randva, randva + size);

	return randva;
}

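/*
 * Return the highest physical address reached by the kernel segments
 * registered in bootspace (pa + sz).
 */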
static paddr_t
bootspace_get_kern_segs_end_pa(void)
{
	paddr_t pa, max = 0;
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			continue;
		}
		pa = bootspace.segs[i].pa + bootspace.segs[i].sz;
		if (pa > max)
			max = pa;
	}

	return max;
}

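/*
 * Record a segment in the first free slot of bootspace.segs[]. There are
 * only BTSPACE_NSEGS slots; running out of them is fatal.
 */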
static void
bootspace_addseg(int type, vaddr_t va, paddr_t pa, size_t sz)
{
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			bootspace.segs[i].type = type;
			bootspace.segs[i].va = va;
			bootspace.segs[i].pa = pa;
			bootspace.segs[i].sz = sz;
			return;
		}
	}

	fatal("bootspace_addseg: segments full");
}

static size_t
mm_shift_segment(vaddr_t va, size_t pagesz, size_t elfsz, size_t elfalign)
{
	size_t shiftsize, offset;
	uint64_t rnd;

	/*
	 * If possible, shift the segment in memory by a random offset. Once
	 * shifted, the segment still remains within the same page of size
	 * pagesz. Make sure to respect the ELF alignment constraint.
	 */

	if (elfalign == 0) {
		elfalign = ELFROUND;
	}

	ASSERT(pagesz >= elfalign);
	ASSERT(pagesz % elfalign == 0);
	shiftsize = roundup(elfsz, pagesz) - roundup(elfsz, elfalign);
	if (shiftsize == 0) {
		return 0;
	}

	prng_get_rand(&rnd, sizeof(rnd));
	offset = roundup(rnd % shiftsize, elfalign);
	ASSERT((va + offset) % elfalign == 0);

	memmove((void *)(va + offset), (void *)va, elfsz);

	return offset;
}

static void
mm_map_head(void)
{
	size_t i, npages, size;
	uint64_t rnd;
	vaddr_t randva;

	/*
	 * The HEAD window is 1GB below the main KASLR window. This is to
	 * ensure that head always comes first in virtual memory. The reason
	 * for that is that we use (headva + sh_offset), and sh_offset is
	 * unsigned.
	 */

	/*
	 * To get the size of the head, we look at the read-only mapping of
	 * the kernel we created in locore. We're identity mapped, so
	 * kernpa = kernva.
	 */
	size = elf_get_head_size((vaddr_t)kernpa_start);
	npages = size / PAGE_SIZE;

	/*
	 * Choose a random range of VAs in the HEAD window, and create the page
	 * tree for it.
	 */
	prng_get_rand(&rnd, sizeof(rnd));
	randva = rounddown(HEAD_WINDOW_BASE + rnd % (HEAD_WINDOW_SIZE - size),
	    PAGE_SIZE);
	mm_map_tree(randva, randva + size);

	/* Enter the area and build the ELF info */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(kernpa_start + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}
	elf_build_head(randva);

	/* Register the values in bootspace */
	bootspace.head.va = randva;
	bootspace.head.pa = kernpa_start;
	bootspace.head.sz = size;
}

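/*
 * Map one kernel ELF section at a random VA: create the page tree, enter the
 * physical pages, shift the section randomly inside its page-aligned window,
 * fill the unused space with the padding byte of the segment type, and
 * register the segment in bootspace. Returns the final VA of the section.
 */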
vaddr_t
mm_map_segment(int segtype, paddr_t pa, size_t elfsz, size_t elfalign)
{
	size_t i, npages, size, pagesz, offset;
	vaddr_t randva;
	char pad;

	if (elfsz <= PAGE_SIZE) {
		pagesz = NBPD_L1;
	} else {
		pagesz = NBPD_L2;
	}

	/* Create the page tree */
	size = roundup(elfsz, pagesz);
	randva = mm_randva_kregion(size, pagesz);

	/* Enter the segment */
	npages = size / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(pa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Shift the segment in memory */
	offset = mm_shift_segment(randva, pagesz, elfsz, elfalign);
	ASSERT(offset + elfsz <= size);

	/* Fill the padding */
	pad = pads[segtype];
	memset((void *)randva, pad, offset);
	memset((void *)(randva + offset + elfsz), pad, size - elfsz - offset);

	/* Register the bootspace information */
	bootspace_addseg(segtype, randva, pa, size);

	return (randva + offset);
}

static void
mm_map_boot(void)
{
	size_t i, npages, size;
	vaddr_t randva;
	paddr_t bootpa;

	/*
	 * The "boot" region is special: its page tree has a fixed size, but
	 * the number of pages entered is lower.
	 */

	/* Create the page tree, starting at a random VA */
	size = (NKL2_KIMG_ENTRIES + 1) * NBPD_L2;
	randva = mm_randva_kregion(size, PAGE_SIZE);

	/* The "boot" region begins right after the kernel segments */
	bootpa = bootspace_get_kern_segs_end_pa();

	/*
	 * The prekern has consumed some extra memory up until pa_avail;
	 * this covers REL/RELA/SYM/STR and EXTRA.
	 */
	size = (pa_avail - bootpa);
	npages = size / PAGE_SIZE;

	/* Enter the whole area linearly */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(bootpa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Fix up the ELF sections located in the "boot" region */
	elf_fixup_boot(randva, bootpa);

	/* Map the ISA I/O MEM right after EXTRA, in pure VA */
	iom_base = randva + npages * PAGE_SIZE;
	npages = IOM_SIZE / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(IOM_BEGIN + i * PAGE_SIZE,
		    iom_base + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Register the values in bootspace */
	bootspace.boot.va = randva;
	bootspace.boot.pa = bootpa;
	bootspace.boot.sz = (size_t)(iom_base + IOM_SIZE) -
	    (size_t)bootspace.boot.va;

	/* Initialize the values that are located in the "boot" region */
	extern uint64_t PDPpaddr;
	bootspace.spareva = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
	bootspace.pdir = bootspace.boot.va + (PDPpaddr - bootspace.boot.pa);
	bootspace.smodule = (vaddr_t)iom_base + IOM_SIZE;
	bootspace.emodule = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
}

/*
 * The bootloader has set up the following layout of physical memory:
 * +------------+--------------+------------+------------------------+-------+
 * | ELF HEADER | SECT HEADERS | KERN SECTS | REL/RELA/SYM/STR SECTS | EXTRA |
 * +------------+--------------+------------+------------------------+-------+
 * This was done in the loadfile_elf32.c:loadfile_dynamic() function.
 *
 * We abstract this layout into several "regions":
 * +---------------------------+------------+--------------------------------+
 * |         Head region       | Kern segs  |          Boot region           |
 * +---------------------------+------------+--------------------------------+
 *
 * There is a variable number of independent regions we create: one head,
 * several kernel segments, one boot. They are all mapped at random VAs.
 *
 * "Head" contains the ELF Header and ELF Section Headers, and we use them to
 * map the rest of the regions. Head must be placed *before* the other
 * regions, in both virtual memory and physical memory.
 *
 * The "Kernel Segments" contain the kernel SHT_NOBITS and SHT_PROGBITS
 * sections, in a 1:1 manner (one segment is associated with one section).
 * The segments are mapped at random VAs and referenced in bootspace.segs[].
 *
 * "Boot" contains miscellaneous information:
 *  - The ELF Rel/Rela/Sym/Str sections of the kernel
 *  - Some extra memory the prekern has consumed so far
 *  - The ISA I/O MEM, in pure VA
 *  - Eventually the module_map, in pure VA (the kernel uses the available VA
 *    at the end of "boot")
 * Boot is placed *after* the other regions in physical memory. In virtual
 * memory, however, there is no constraint, so its VA is randomly selected in
 * the main KASLR window.
 *
 * At the end of this function, the bootspace structure is fully constructed.
 */
void
mm_map_kernel(void)
{
	memset(&bootspace, 0, sizeof(bootspace));
	mm_map_head();
	print_state(STATE_NORMAL, "Head region mapped");
	elf_map_sections();
	print_state(STATE_NORMAL, "Segments mapped");
	mm_map_boot();
	print_state(STATE_NORMAL, "Boot region mapped");
}