/*	$NetBSD: mm.c,v 1.28 2021/05/04 21:09:16 khorben Exp $	*/

/*
 * Copyright (c) 2017-2020 The NetBSD Foundation, Inc. All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include "prekern.h"

#define ELFROUND	64

static const uint8_t pads[4] = {
	[BTSEG_NONE] = 0x00,
	[BTSEG_TEXT] = 0xCC,
	[BTSEG_RODATA] = 0x00,
	[BTSEG_DATA] = 0x00
};

#define MM_PROT_READ	0x00
#define MM_PROT_WRITE	0x01
#define MM_PROT_EXECUTE	0x02

static const pt_entry_t protection_codes[3] = {
	[MM_PROT_READ] = PTE_NX,
	[MM_PROT_WRITE] = PTE_W | PTE_NX,
	[MM_PROT_EXECUTE] = 0,
	/* RWX does not exist */
};

struct bootspace bootspace;

extern paddr_t kernpa_start, kernpa_end;
vaddr_t iom_base;

paddr_t pa_avail = 0;
static const vaddr_t tmpva = (PREKERNBASE + NKL2_KIMG_ENTRIES * NBPD_L2);

void
mm_init(paddr_t first_pa)
{
	pa_avail = first_pa;
}

static void
mm_enter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	if (PTE_BASE[pl1_i(va)] & PTE_P) {
		fatal("mm_enter_pa: mapping already present");
	}
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

static void
mm_reenter_pa(paddr_t pa, vaddr_t va, pte_prot_t prot)
{
	PTE_BASE[pl1_i(va)] = pa | PTE_P | protection_codes[prot];
}

static void
mm_flush_va(vaddr_t va)
{
	asm volatile("invlpg (%0)" ::"r" (va) : "memory");
}

static paddr_t
mm_palloc(size_t npages)
{
	paddr_t pa;
	size_t i;

	/* Allocate the physical pages */
	pa = pa_avail;
	pa_avail += npages * PAGE_SIZE;

	/* Zero them out */
	for (i = 0; i < npages; i++) {
		mm_reenter_pa(pa + i * PAGE_SIZE, tmpva,
		    MM_PROT_READ|MM_PROT_WRITE);
		mm_flush_va(tmpva);
		memset((void *)tmpva, 0, PAGE_SIZE);
	}

	return pa;
}

static bool
mm_pte_is_valid(pt_entry_t pte)
{
	return ((pte & PTE_P) != 0);
}

static void
mm_mprotect(vaddr_t startva, size_t size, pte_prot_t prot)
{
	size_t i, npages;
	vaddr_t va;
	paddr_t pa;

	ASSERT(size % PAGE_SIZE == 0);
	npages = size / PAGE_SIZE;

	for (i = 0; i < npages; i++) {
		va = startva + i * PAGE_SIZE;
		pa = (PTE_BASE[pl1_i(va)] & PTE_FRAME);
		mm_reenter_pa(pa, va, prot);
		mm_flush_va(va);
	}
}

void
mm_bootspace_mprotect(void)
{
	pte_prot_t prot;
	size_t i;

	/* Remap the kernel segments with proper permissions. */
	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_TEXT) {
			prot = MM_PROT_READ|MM_PROT_EXECUTE;
		} else if (bootspace.segs[i].type == BTSEG_RODATA) {
			prot = MM_PROT_READ;
		} else {
			continue;
		}
		mm_mprotect(bootspace.segs[i].va, bootspace.segs[i].sz, prot);
	}

	print_state(STATE_NORMAL, "Segments protection updated");
}

static size_t
mm_nentries_range(vaddr_t startva, vaddr_t endva, size_t pgsz)
{
	size_t npages;

	npages = roundup((endva / PAGE_SIZE), (pgsz / PAGE_SIZE)) -
	    rounddown((startva / PAGE_SIZE), (pgsz / PAGE_SIZE));
	return (npages / (pgsz / PAGE_SIZE));
}
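
/*
 * For illustration (values assume amd64, where PAGE_SIZE is 4KB and
 * NBPD_L2 is 2MB): mm_nentries_range() returns how many page-directory
 * slots of size pgsz are needed to cover [startva, endva). A 5MB range
 * starting on a 2MB boundary needs three L2 slots (2MB + 2MB + 1MB):
 *
 *	startva = 0xffffffff80200000;
 *	endva = startva + 0x500000;
 *	n = mm_nentries_range(startva, endva, NBPD_L2);	// n == 3
 */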

static void
mm_map_tree(vaddr_t startva, vaddr_t endva)
{
	size_t i, nL4e, nL3e, nL2e;
	size_t L4e_idx, L3e_idx, L2e_idx;
	paddr_t pa;

	/* Build L4. */
	L4e_idx = pl4_i(startva);
	nL4e = mm_nentries_range(startva, endva, NBPD_L4);
	ASSERT(L4e_idx == 511);
	ASSERT(nL4e == 1);
	if (!mm_pte_is_valid(L4_BASE[L4e_idx])) {
		pa = mm_palloc(1);
		L4_BASE[L4e_idx] = pa | PTE_P | PTE_W;
	}

	/* Build L3. */
	L3e_idx = pl3_i(startva);
	nL3e = mm_nentries_range(startva, endva, NBPD_L3);
	for (i = 0; i < nL3e; i++) {
		if (mm_pte_is_valid(L3_BASE[L3e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L3_BASE[L3e_idx+i] = pa | PTE_P | PTE_W;
	}

	/* Build L2. */
	L2e_idx = pl2_i(startva);
	nL2e = mm_nentries_range(startva, endva, NBPD_L2);
	for (i = 0; i < nL2e; i++) {
		if (mm_pte_is_valid(L2_BASE[L2e_idx+i])) {
			continue;
		}
		pa = mm_palloc(1);
		L2_BASE[L2e_idx+i] = pa | PTE_P | PTE_W;
	}
}

static vaddr_t
mm_randva_kregion(size_t size, size_t pagesz)
{
	vaddr_t sva, eva;
	vaddr_t randva;
	uint64_t rnd;
	size_t i;
	bool ok;

	while (1) {
		prng_get_rand(&rnd, sizeof(rnd));
		randva = rounddown(KASLR_WINDOW_BASE +
		    rnd % (KASLR_WINDOW_SIZE - size), pagesz);

		/* Detect collisions */
		ok = true;
		for (i = 0; i < BTSPACE_NSEGS; i++) {
			if (bootspace.segs[i].type == BTSEG_NONE) {
				continue;
			}
			sva = bootspace.segs[i].va;
			eva = sva + bootspace.segs[i].sz;

			if ((sva <= randva) && (randva < eva)) {
				ok = false;
				break;
			}
			if ((sva < randva + size) && (randva + size <= eva)) {
				ok = false;
				break;
			}
			if (randva < sva && eva < (randva + size)) {
				ok = false;
				break;
			}
		}
		if (ok) {
			break;
		}
	}

	mm_map_tree(randva, randva + size);

	return randva;
}
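
/*
 * Note on the two helpers above: mm_map_tree() only allocates the L4, L3
 * and L2 levels of the page tree; the 4KB L1 entries are filled in later,
 * page by page, with mm_enter_pa(). mm_randva_kregion() keeps drawing
 * random VAs until the candidate range [randva, randva + size) does not
 * overlap any segment already registered in bootspace.segs[]: the three
 * tests reject a start falling inside a segment, an end falling inside a
 * segment, and a segment swallowed whole by the candidate range.
 */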

static paddr_t
bootspace_get_kern_segs_end_pa(void)
{
	paddr_t pa, max = 0;
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			continue;
		}
		pa = bootspace.segs[i].pa + bootspace.segs[i].sz;
		if (pa > max)
			max = pa;
	}

	return max;
}

static void
bootspace_addseg(int type, vaddr_t va, paddr_t pa, size_t sz)
{
	size_t i;

	for (i = 0; i < BTSPACE_NSEGS; i++) {
		if (bootspace.segs[i].type == BTSEG_NONE) {
			bootspace.segs[i].type = type;
			bootspace.segs[i].va = va;
			bootspace.segs[i].pa = pa;
			bootspace.segs[i].sz = sz;
			return;
		}
	}

	fatal("bootspace_addseg: segments full");
}

static size_t
mm_shift_segment(vaddr_t va, size_t pagesz, size_t elfsz, size_t elfalign)
{
	size_t shiftsize, offset;
	uint64_t rnd;

	/*
	 * If possible, shift the segment in memory using a random offset. Once
	 * shifted, the segment still fits in the same pagesz-sized page. Make
	 * sure to respect the ELF alignment constraint.
	 */

	if (elfalign == 0) {
		elfalign = ELFROUND;
	}

	ASSERT(pagesz >= elfalign);
	ASSERT(pagesz % elfalign == 0);
	shiftsize = roundup(elfsz, pagesz) - roundup(elfsz, elfalign);
	if (shiftsize == 0) {
		return 0;
	}

	prng_get_rand(&rnd, sizeof(rnd));
	offset = roundup(rnd % shiftsize, elfalign);
	ASSERT((va + offset) % elfalign == 0);

	memmove((void *)(va + offset), (void *)va, elfsz);

	return offset;
}
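
/*
 * Worked example for mm_shift_segment(), with a hypothetical segment of
 * elfsz = 10000 bytes and elfalign = 64, mapped with pagesz = NBPD_L2
 * (2MB on amd64):
 *
 *	shiftsize = roundup(10000, 0x200000) - roundup(10000, 64)
 *	          = 0x200000 - 10048 = 2087104
 *
 * A random value below shiftsize is rounded up to a multiple of 64 and the
 * segment is moved up by that many bytes, so it keeps its 64-byte ELF
 * alignment and still ends within the same 2MB page (offset + elfsz never
 * exceeds 2MB).
 */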

static void
mm_map_head(void)
{
	size_t i, npages, size;
	uint64_t rnd;
	vaddr_t randva;

	/*
	 * The HEAD window is 1GB below the main KASLR window. This is to
	 * ensure that the head always comes first in virtual memory. The
	 * reason for that is that we use (headva + sh_offset), and sh_offset
	 * is unsigned.
	 */

	/*
	 * To get the size of the head, we look at the read-only mapping of
	 * the kernel we created in locore. We're identity mapped, so
	 * kernpa = kernva.
	 */
	size = elf_get_head_size((vaddr_t)kernpa_start);
	npages = size / PAGE_SIZE;

	/*
	 * Choose a random range of VAs in the HEAD window, and create the
	 * page tree for it.
	 */
	prng_get_rand(&rnd, sizeof(rnd));
	randva = rounddown(HEAD_WINDOW_BASE + rnd % (HEAD_WINDOW_SIZE - size),
	    PAGE_SIZE);
	mm_map_tree(randva, randva + size);

	/* Enter the area and build the ELF info */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(kernpa_start + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}
	elf_build_head(randva);

	/* Register the values in bootspace */
	bootspace.head.va = randva;
	bootspace.head.pa = kernpa_start;
	bootspace.head.sz = size;
}

vaddr_t
mm_map_segment(int segtype, paddr_t pa, size_t elfsz, size_t elfalign)
{
	size_t i, npages, size, pagesz, offset;
	vaddr_t randva;
	char pad;

	if (elfsz <= PAGE_SIZE) {
		pagesz = NBPD_L1;
	} else {
		pagesz = NBPD_L2;
	}

	/* Create the page tree */
	size = roundup(elfsz, pagesz);
	randva = mm_randva_kregion(size, pagesz);

	/* Enter the segment */
	npages = size / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(pa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Shift the segment in memory */
	offset = mm_shift_segment(randva, pagesz, elfsz, elfalign);
	ASSERT(offset + elfsz <= size);

	/* Fill the padding before and after the shifted segment */
	pad = pads[segtype];
	memset((void *)randva, pad, offset);
	memset((void *)(randva + offset + elfsz), pad, size - elfsz - offset);

	/* Register the bootspace information */
	bootspace_addseg(segtype, randva, pa, size);

	return (randva + offset);
}

static void
mm_map_boot(void)
{
	size_t i, npages, size;
	vaddr_t randva;
	paddr_t bootpa;

	/*
	 * The "boot" region is special: its page tree has a fixed size, but
	 * the number of pages entered is lower.
	 */

	/* Create the page tree, starting at a random VA */
	size = (NKL2_KIMG_ENTRIES + 1) * NBPD_L2;
	randva = mm_randva_kregion(size, PAGE_SIZE);

	/* The "boot" region begins right after the kernel segments */
	bootpa = bootspace_get_kern_segs_end_pa();

	/*
	 * The prekern consumed some EXTRA memory up until pa_avail; this
	 * covers the REL/RELA/SYM/STR sections and EXTRA.
	 */
	size = (pa_avail - bootpa);
	npages = size / PAGE_SIZE;

	/* Enter the whole area linearly */
	for (i = 0; i < npages; i++) {
		mm_enter_pa(bootpa + i * PAGE_SIZE,
		    randva + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Fix up the ELF sections located in the "boot" region */
	elf_fixup_boot(randva, bootpa);

	/* Map the ISA I/O MEM right after EXTRA, in pure VA */
	iom_base = randva + npages * PAGE_SIZE;
	npages = IOM_SIZE / PAGE_SIZE;
	for (i = 0; i < npages; i++) {
		mm_enter_pa(IOM_BEGIN + i * PAGE_SIZE,
		    iom_base + i * PAGE_SIZE, MM_PROT_READ|MM_PROT_WRITE);
	}

	/* Register the values in bootspace */
	bootspace.boot.va = randva;
	bootspace.boot.pa = bootpa;
	bootspace.boot.sz = (size_t)(iom_base + IOM_SIZE) -
	    (size_t)bootspace.boot.va;

	/* Initialize the values that are located in the "boot" region */
	extern uint64_t PDPpaddr;
	bootspace.spareva = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
	bootspace.pdir = bootspace.boot.va + (PDPpaddr - bootspace.boot.pa);
	bootspace.smodule = (vaddr_t)iom_base + IOM_SIZE;
	bootspace.emodule = bootspace.boot.va + NKL2_KIMG_ENTRIES * NBPD_L2;
}
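
/*
 * Summary of the VA layout of the "boot" region built above (boundaries
 * taken from the assignments in mm_map_boot(); actual sizes depend on the
 * kernel image and on IOM_SIZE):
 *
 *	[boot.va, iom_base)		REL/RELA/SYM/STR + EXTRA, mapped from bootpa
 *	[iom_base, iom_base+IOM_SIZE)	ISA I/O MEM, mapped from IOM_BEGIN
 *	[smodule, emodule)		pure VA left for the kernel module_map
 *	spareva				spare VA at boot.va + NKL2_KIMG_ENTRIES * NBPD_L2,
 *					covered by the extra L2 slot of the page tree
 */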

/*
 * The bootloader has set up the following layout of physical memory:
 * +------------+--------------+------------+------------------------+-------+
 * | ELF HEADER | SECT HEADERS | KERN SECTS | REL/RELA/SYM/STR SECTS | EXTRA |
 * +------------+--------------+------------+------------------------+-------+
 * This was done in the loadfile_elf32.c:loadfile_dynamic() function.
 *
 * We abstract this layout into several "regions":
 * +---------------------------+------------+--------------------------------+
 * |        Head region        | Kern segs  |          Boot region           |
 * +---------------------------+------------+--------------------------------+
 *
 * There is a variable number of independent regions we create: one head,
 * several kernel segments, one boot. They are all mapped at random VAs.
 *
 * "Head" contains the ELF Header and ELF Section Headers, and we use them to
 * map the rest of the regions. Head must be placed *before* the other
 * regions, in both virtual memory and physical memory.
 *
 * The "Kernel Segments" contain the kernel SHT_NOBITS and SHT_PROGBITS
 * sections, in a 1:1 manner (one segment is associated with one section).
 * The segments are mapped at random VAs and referenced in bootspace.segs[].
 *
 * "Boot" contains miscellaneous information:
 *  - The ELF Rel/Rela/Sym/Str sections of the kernel
 *  - Some extra memory the prekern has consumed so far
 *  - The ISA I/O MEM, in pure VA
 *  - Eventually the module_map, in pure VA (the kernel uses the available VA
 *    at the end of "boot")
 * Boot is placed *after* the other regions in physical memory. In virtual
 * memory however there is no constraint, so its VA is randomly selected in
 * the main KASLR window.
 *
 * At the end of this function, the bootspace structure is fully constructed.
 */
void
mm_map_kernel(void)
{
	memset(&bootspace, 0, sizeof(bootspace));
	mm_map_head();
	print_state(STATE_NORMAL, "Head region mapped");
	elf_map_sections();
	print_state(STATE_NORMAL, "Segments mapped");
	mm_map_boot();
	print_state(STATE_NORMAL, "Boot region mapped");
}
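
/*
 * Expected usage of the entry points in this file, as a sketch (the actual
 * call sites live elsewhere in the prekern): mm_init() is called first with
 * the first free physical address; mm_map_kernel() then builds the head,
 * kernel segment and boot regions, with elf_map_sections() expected to call
 * back into mm_map_segment() for each kernel section; finally
 * mm_bootspace_mprotect() tightens the temporary read/write mappings of the
 * text and rodata segments to their final RX and R permissions.
 */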