/* $NetBSD: mm.c,v 1.28 2021/05/04 21:09:16 khorben Exp $ */

/*
 * Copyright (c) 2017-2020 The NetBSD Foundation, Inc. All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "prekern.h"

#define ELFROUND	64

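/*
 * Byte used to pad the unused parts of each segment type. 0xCC is the x86
 * 'int3' opcode, so any stray execution of text-segment padding traps
 * instead of falling through silently (presumably the reason for this
 * choice).
 */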
static const uint8_t pads[4] = {
        [BTSEG_NONE] = 0x00,
        [BTSEG_TEXT] = 0xCC,
        [BTSEG_RODATA] = 0x00,
        [BTSEG_DATA] = 0x00
};

#define MM_PROT_READ	0x00
#define MM_PROT_WRITE	0x01
#define MM_PROT_EXECUTE	0x02

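/*
 * PTE protection bits, indexed by an OR of the MM_PROT_* flags above:
 * index 0 is read-only, index 1 is MM_PROT_READ|MM_PROT_WRITE, index 2 is
 * MM_PROT_READ|MM_PROT_EXECUTE. Read access is always implied, and the
 * write+execute combination (index 3) is deliberately absent.
 */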
static const pt_entry_t protection_codes[3] = {
        [MM_PROT_READ] = PTE_NX,
        [MM_PROT_WRITE] = PTE_W | PTE_NX,
        [MM_PROT_EXECUTE] = 0,
        /* RWX does not exist */
};

struct bootspace bootspace;

extern paddr_t kernpa_start, kernpa_end;
vaddr_t iom_base;

paddr_t pa_avail = 0;
static const vaddr_t tmpva = (PREKERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);

void
mm_init(paddr_t first_pa)
{
        pa_avail = first_pa;
}

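/*
 * Enter a 4KB mapping of 'pa' at 'va' with the given protection. This
 * variant refuses to overwrite an existing mapping; mm_reenter_pa below
 * overwrites unconditionally.
 */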
static void
mm_enter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
        if (PTE_BASE[pl1_i(va)] & PTE_P) {
                fatal("mm_enter_pa: mapping already present");
        }
        PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

static void
mm_reenter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
        PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

static void
mm_flush_va(vaddr_t va)
{
        asm volatile("invlpg (%0)" ::"r" (va) : "memory");
}

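/*
 * Bump-allocate 'npages' contiguous physical pages starting at pa_avail,
 * and zero each of them through the tmpva scratch mapping before returning
 * the physical address of the first page.
 */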
static paddr_t
mm_palloc(size_t npages)
{
        paddr_t pa;
        size_t i;

        /* Allocate the physical pages */
        pa = pa_avail;
        pa_avail += npages * PAGE_SIZE;

        /* Zero them out */
        for (i = 0; i < npages; i++) {
                mm_reenter_pa(pa + i * PAGE_SIZE, tmpva,
                    MM_PROT_READ|MM_PROT_WRITE);
                mm_flush_va(tmpva);
                memset((void *)tmpva, 0, PAGE_SIZE);
        }

        return pa;
}

static bool
mm_pte_is_valid(pt_entry_t pte)
{
        return ((pte & PTE_P) != 0);
}

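/*
 * Change the protection of the already-mapped range [startva, startva+size),
 * re-entering each 4KB page with 'prot' and flushing its TLB entry.
 */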
static void
mm_mprotect(vaddr_t startva, size_t size, pte_prot_t prot)
{
        size_t i, npages;
        vaddr_t va;
        paddr_t pa;

        ASSERT(size % PAGE_SIZE == 0);
        npages = size / PAGE_SIZE;

        for (i = 0; i < npages; i++) {
                va = startva + i * PAGE_SIZE;
                pa = (PTE_BASE[pl1_i(va)] & PTE_FRAME);
                mm_reenter_pa(pa, va, prot);
                mm_flush_va(va);
        }
}

void
mm_bootspace_mprotect(void)
{
        pte_prot_t prot;
        size_t i;

        /* Remap the kernel segments with proper permissions. */
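        /*
         * Text becomes read+execute and rodata read-only; the other segment
         * types keep the read/write mapping they were entered with.
         */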
        for (i = 0; i < BTSPACE_NSEGS; i++) {
                if (bootspace.segs[i].type == BTSEG_TEXT) {
                        prot = MM_PROT_READ|MM_PROT_EXECUTE;
                } else if (bootspace.segs[i].type == BTSEG_RODATA) {
                        prot = MM_PROT_READ;
                } else {
                        continue;
                }
                mm_mprotect(bootspace.segs[i].va, bootspace.segs[i].sz, prot);
        }

        print_state(STATE_NORMAL, "Segments protection updated");
}

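/*
 * Number of pgsz-sized page-table entries needed to cover [startva, endva).
 * Worked example (hypothetical values): with pgsz = NBPD_L2 (2MB), startva =
 * 0x1ff000 and endva = 0x401000, startva rounds down to slot 0 and endva
 * rounds up to slot 3, so three L2 entries are required even though the
 * range itself is only slightly larger than 2MB.
 */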
static size_t
mm_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
        size_t npages;

        npages = roundup((endva / PAGE_SIZE), (pgsz / PAGE_SIZE)) -
            rounddown((startva / PAGE_SIZE), (pgsz / PAGE_SIZE));
        return (npages / (pgsz / PAGE_SIZE));
}

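/*
 * Allocate and link any missing L4/L3/L2 page directory entries needed to
 * map [startva, endva). The L1 entries themselves are filled later, by
 * mm_enter_pa().
 */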
static void
mm_map_tree(vaddr_t startva, vaddr_t endva)
{
        size_t i, nL4e, nL3e, nL2e;
        size_t L4e_idx, L3e_idx, L2e_idx;
        paddr_t pa;

        /* Build L4. */
        L4e_idx = pl4_i(startva);
        nL4e = mm_nentries_range(startva, endva, NBPD_L4);
        ASSERT(L4e_idx == 511);
        ASSERT(nL4e == 1);
        if (!mm_pte_is_valid(L4_BASE[L4e_idx])) {
                pa = mm_palloc(1);
                L4_BASE[L4e_idx] = pa | PTE_P | PTE_W;
        }

        /* Build L3. */
        L3e_idx = pl3_i(startva);
        nL3e = mm_nentries_range(startva, endva, NBPD_L3);
        for (i = 0; i < nL3e; i++) {
                if (mm_pte_is_valid(L3_BASE[L3e_idx+i])) {
                        continue;
                }
                pa = mm_palloc(1);
                L3_BASE[L3e_idx+i] = pa | PTE_P | PTE_W;
        }

        /* Build L2. */
        L2e_idx = pl2_i(startva);
        nL2e = mm_nentries_range(startva, endva, NBPD_L2);
        for (i = 0; i < nL2e; i++) {
                if (mm_pte_is_valid(L2_BASE[L2e_idx+i])) {
                        continue;
                }
                pa = mm_palloc(1);
                L2_BASE[L2e_idx+i] = pa | PTE_P | PTE_W;
        }
}

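/*
 * Pick a random, pagesz-aligned VA range of 'size' bytes inside the KASLR
 * window, retrying until the range does not overlap any segment already
 * registered in bootspace, then build the page tree for it.
 */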
static vaddr_t
mm_randva_kregion(size_t size, size_t pagesz)
{
        vaddr_t sva, eva;
        vaddr_t randva;
        uint64_t rnd;
        size_t i;
        bool ok;

        while (1) {
                prng_get_rand(&rnd, sizeof(rnd));
                randva = rounddown(KASLR_WINDOW_BASE +
                    rnd % (KASLR_WINDOW_SIZE - size), pagesz);

                /* Detect collisions */
                ok = true;
                for (i = 0; i < BTSPACE_NSEGS; i++) {
                        if (bootspace.segs[i].type == BTSEG_NONE) {
                                continue;
                        }
                        sva = bootspace.segs[i].va;
                        eva = sva + bootspace.segs[i].sz;

                        if ((sva <= randva) && (randva < eva)) {
                                ok = false;
                                break;
                        }
                        if ((sva < randva + size) && (randva + size <= eva)) {
                                ok = false;
                                break;
                        }
                        if (randva < sva && eva < (randva + size)) {
                                ok = false;
                                break;
                        }
                }
                if (ok) {
                        break;
                }
        }

        mm_map_tree(randva, randva + size);

        return randva;
}

static paddr_t
bootspace_get_kern_segs_end_pa(void)
{
        paddr_t pa, max = 0;
        size_t i;

        for (i = 0; i < BTSPACE_NSEGS; i++) {
                if (bootspace.segs[i].type == BTSEG_NONE) {
                        continue;
                }
                pa = bootspace.segs[i].pa + bootspace.segs[i].sz;
                if (pa > max)
                        max = pa;
        }

        return max;
}

static void
bootspace_addseg(int type, vaddr_t va, paddr_t pa, size_t sz)
{
        size_t i;

        for (i = 0; i < BTSPACE_NSEGS; i++) {
                if (bootspace.segs[i].type == BTSEG_NONE) {
                        bootspace.segs[i].type = type;
                        bootspace.segs[i].va = va;
                        bootspace.segs[i].pa = pa;
                        bootspace.segs[i].sz = sz;
                        return;
                }
        }

        fatal("bootspace_addseg: segments full");
}

static size_t
mm_shift_segment(vaddr_t va, size_t pagesz, size_t elfsz, size_t elfalign)
{
        size_t shiftsize, offset;
        uint64_t rnd;

        /*
         * If possible, shift the segment in memory using a random offset.
         * Once shifted, the segment still fits within the same pagesz-sized
         * page. Make sure to respect the ELF alignment constraint.
         */
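        /*
         * Worked example (hypothetical numbers): with pagesz = NBPD_L2 (2MB),
         * elfsz = 0x180000 and elfalign = 64, the slack available for
         * shifting is roundup(0x180000, 0x200000) - roundup(0x180000, 64) =
         * 0x80000 bytes; the random offset is then rounded up to a multiple
         * of 64 so the alignment constraint still holds after the memmove.
         */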

        if (elfalign == 0) {
                elfalign = ELFROUND;
        }

        ASSERT(pagesz >= elfalign);
        ASSERT(pagesz % elfalign == 0);
        shiftsize = roundup(elfsz, pagesz) - roundup(elfsz, elfalign);
        if (shiftsize == 0) {
                return 0;
        }

        prng_get_rand(&rnd, sizeof(rnd));
        offset = roundup(rnd % shiftsize, elfalign);
        ASSERT((va + offset) % elfalign == 0);

        memmove((void *)(va + offset), (void *)va, elfsz);

        return offset;
}

static void
mm_map_head(void)
{
        size_t i, npages, size;
        uint64_t rnd;
        vaddr_t randva;

        /*
         * The HEAD window is 1GB below the main KASLR window. This is to
         * ensure that the head always comes first in virtual memory. The
         * reason is that we use (headva + sh_offset), and sh_offset is
         * unsigned.
         */

        /*
         * To get the size of the head, we take a look at the read-only
         * mapping of the kernel we created in locore. We're identity mapped,
         * so kernpa = kernva.
         */
        size = elf_get_head_size((vaddr_t)kernpa_start);
        npages = size / PAGE_SIZE;

        /*
         * Choose a random range of VAs in the HEAD window, and create the
         * page tree for it.
         */
        prng_get_rand(&rnd, sizeof(rnd));
        randva = rounddown(HEAD_WINDOW_BASE + rnd % (HEAD_WINDOW_SIZE - size),
            PAGE_SIZE);
        mm_map_tree(randva, randva + size);

        /* Enter the area and build the ELF info */
        for (i = 0; i < npages; i++) {
                mm_enter_pa(kernpa_start + i * PAGE_SIZE,
                    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
        }
        elf_build_head(randva);

        /* Register the values in bootspace */
        bootspace.head.va = randva;
        bootspace.head.pa = kernpa_start;
        bootspace.head.sz = size;
}

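/*
 * Map the ELF segment of 'elfsz' bytes located at physical address 'pa' at a
 * random VA. The region is rounded up to 4KB (NBPD_L1) for small segments
 * and to 2MB (NBPD_L2) otherwise, randomly shifted within that slack, and
 * registered in bootspace. Returns the final (shifted) VA of the segment.
 */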
vaddr_t
mm_map_segment(int segtype, paddr_t pa, size_t elfsz, size_t elfalign)
{
        size_t i, npages, size, pagesz, offset;
        vaddr_t randva;
        char pad;

        if (elfsz <= PAGE_SIZE) {
                pagesz = NBPD_L1;
        } else {
                pagesz = NBPD_L2;
        }

        /* Create the page tree */
        size = roundup(elfsz, pagesz);
        randva = mm_randva_kregion(size, pagesz);

        /* Enter the segment */
        npages = size / PAGE_SIZE;
        for (i = 0; i < npages; i++) {
                mm_enter_pa(pa + i * PAGE_SIZE,
                    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
        }

        /* Shift the segment in memory */
        offset = mm_shift_segment(randva, pagesz, elfsz, elfalign);
        ASSERT(offset + elfsz <= size);

        /* Fill the paddings */
        pad = pads[segtype];
        memset((void *)randva, pad, offset);
        memset((void *)(randva + offset + elfsz), pad, size - elfsz - offset);

        /* Register the bootspace information */
        bootspace_addseg(segtype, randva, pa, size);

        return (randva + offset);
}

static void
mm_map_boot(void)
{
        size_t i, npages, size;
        vaddr_t randva;
        paddr_t bootpa;

        /*
         * The "boot" region is special: its page tree has a fixed size, but
         * the number of pages entered is lower.
         */

        /* Create the page tree, starting at a random VA */
        size = (NKL2_KIMG_ENTRIES + 1) * NBPD_L2;
        randva = mm_randva_kregion(size, PAGE_SIZE);

        /* The "boot" region begins right after the kernel segments */
        bootpa = bootspace_get_kern_segs_end_pa();

        /*
         * The prekern consumed some extra memory up until pa_avail; this
         * covers REL/RELA/SYM/STR and EXTRA.
         */
        size = (pa_avail - bootpa);
        npages = size / PAGE_SIZE;

        /* Enter the whole area linearly */
        for (i = 0; i < npages; i++) {
                mm_enter_pa(bootpa + i * PAGE_SIZE,
                    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
        }

        /* Fix up the ELF sections located in the "boot" region */
        elf_fixup_boot(randva, bootpa);

        /* Map the ISA I/O MEM right after EXTRA, in pure VA */
        iom_base = randva + npages * PAGE_SIZE;
        npages = IOM_SIZE / PAGE_SIZE;
        for (i = 0; i < npages; i++) {
                mm_enter_pa(IOM_BEGIN + i * PAGE_SIZE,
                    iom_base + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
        }

        /* Register the values in bootspace */
        bootspace.boot.va = randva;
        bootspace.boot.pa = bootpa;
        bootspace.boot.sz = (size_t)(iom_base + IOM_SIZE) -
            (size_t)bootspace.boot.va;

        /* Initialize the values that are located in the "boot" region */
        extern uint64_t PDPpaddr;
        bootspace.spareva = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
        bootspace.pdir = bootspace.boot.va + (PDPpaddr - bootspace.boot.pa);
        bootspace.smodule = (vaddr_t)iom_base + IOM_SIZE;
        bootspace.emodule = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
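
        /*
         * Note: [smodule, emodule) is the spare, purely virtual space left at
         * the end of the "boot" region; as described in the layout comment
         * below, the kernel can later use it for the module map.
         */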
}

/*
 * The bootloader has set up the following layout of physical memory:
 * +------------+--------------+------------+------------------------+-------+
 * | ELF HEADER | SECT HEADERS | KERN SECTS | REL/RELA/SYM/STR SECTS | EXTRA |
 * +------------+--------------+------------+------------------------+-------+
 * This was done in the loadfile_elf32.c:loadfile_dynamic() function.
 *
 * We abstract this layout into several "regions":
 * +---------------------------+------------+--------------------------------+
 * | Head region               | Kern segs  | Boot region                    |
 * +---------------------------+------------+--------------------------------+
 *
 * There is a variable number of independent regions we create: one head,
 * several kernel segments, one boot. They are all mapped at random VAs.
 *
 * "Head" contains the ELF Header and ELF Section Headers, and we use them to
 * map the rest of the regions. Head must be placed *before* the other
 * regions, in both virtual memory and physical memory.
 *
 * The "Kernel Segments" contain the kernel SHT_NOBITS and SHT_PROGBITS
 * sections, in a 1:1 manner (one segment is associated with one section).
 * The segments are mapped at random VAs and referenced in bootspace.segs[].
 *
 * "Boot" contains miscellaneous information:
 *  - The ELF Rel/Rela/Sym/Str sections of the kernel
 *  - Some extra memory the prekern has consumed so far
 *  - The ISA I/O MEM, in pure VA
 *  - Eventually the module_map, in pure VA (the kernel uses the available VA
 *    at the end of "boot")
 * Boot is placed *after* the other regions in physical memory. In virtual
 * memory however there is no constraint, so its VA is randomly selected in
 * the main KASLR window.
 *
 * At the end of this function, the bootspace structure is fully constructed.
 */
void
mm_map_kernel(void)
{
        memset(&bootspace, 0, sizeof(bootspace));
        mm_map_head();
        print_state(STATE_NORMAL, "Head region mapped");
        elf_map_sections();
        print_state(STATE_NORMAL, "Segments mapped");
        mm_map_boot();
        print_state(STATE_NORMAL, "Boot region mapped");
}