/*	$NetBSD: x86_xpmap.c,v 1.90 2020/09/06 02:18:53 riastradh Exp $	*/

/*
 * Copyright (c) 2017 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 2006, 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright (c) 2004 Christian Limpach.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_xpmap.c,v 1.90 2020/09/06 02:18:53 riastradh Exp $");

#include "opt_xen.h"
#include "opt_ddb.h"
#include "ksyms.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/mutex.h>
#include <sys/cpu.h>
#include <sys/kernel.h>

#include <uvm/uvm.h>

#include <machine/gdt.h>

#include <xen/xenfunc.h>

#include <dev/isa/isareg.h>
#include <machine/isa_machdep.h>

#ifdef XENDEBUG
#define	__PRINTK(x) printk x
#else
#define	__PRINTK(x)
#endif

/* Xen requires the start_info struct to be page aligned */
union start_info_union start_info_union __aligned(PAGE_SIZE);

volatile shared_info_t *HYPERVISOR_shared_info __read_mostly;
unsigned long *xpmap_phys_to_machine_mapping __read_mostly;
kmutex_t pte_lock __cacheline_aligned;
vaddr_t xen_dummy_page;
pt_entry_t xpmap_pg_nx __read_mostly;

#define XPQUEUE_SIZE 2048
static mmu_update_t xpq_queue_array[MAXCPUS][XPQUEUE_SIZE];

void xen_failsafe_handler(void);

extern struct xenstore_domain_interface *xenstore_interface; /* XXX */

static void xen_bt_set_readonly(vaddr_t);
static void xen_bootstrap_tables(vaddr_t, vaddr_t, size_t, size_t, bool);

vaddr_t xen_locore(void);

/*
 * kcpuset internally uses an array of uint32_t while xen uses an array of
 * u_long. As we're little-endian we can cast one to the other.
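 * On LP64 the two exported uint32_t words overlay the low and high halves of
 * the single u_long word; on 32-bit kernels both types are 32 bits wide.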
 */
typedef union {
#ifdef _LP64
	uint32_t xcpum_km[2];
#else
	uint32_t xcpum_km[1];
#endif
	u_long xcpum_xm;
} xcpumask_t;

void
xen_failsafe_handler(void)
{

	panic("xen_failsafe_handler called!\n");
}

void
xen_set_ldt(vaddr_t base, uint32_t entries)
{
	vaddr_t va;
	vaddr_t end;
	pt_entry_t *ptp;
	int s;

#ifdef __x86_64__
	end = base + (entries << 3);
#else
	end = base + entries * sizeof(union descriptor);
#endif

	for (va = base; va < end; va += PAGE_SIZE) {
		KASSERT(va >= VM_MIN_KERNEL_ADDRESS);
		ptp = kvtopte(va);
		pmap_pte_clearbits(ptp, PTE_W);
	}
	s = splvm();
	xpq_queue_set_ldt(base, entries);
	splx(s);
}

void
xpq_flush_queue(void)
{
	mmu_update_t *xpq_queue;
	int done = 0, ret;
	size_t xpq_idx;

	KASSERT(curcpu()->ci_ilevel >= IPL_VM || cold);

	xpq_idx = curcpu()->ci_xpq_idx;
	xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];

retry:
	ret = HYPERVISOR_mmu_update(xpq_queue, xpq_idx, &done, DOMID_SELF);

	if (ret < 0 && xpq_idx != 0) {
		printf("xpq_flush_queue: %zu entries (%d successful) on "
		    "cpu%d (%ld)\n",
		    xpq_idx, done, curcpu()->ci_index, curcpu()->ci_cpuid);

		if (done != 0) {
			xpq_queue += done;
			xpq_idx -= done;
			done = 0;
			goto retry;
		}

		panic("HYPERVISOR_mmu_update failed, ret: %d\n", ret);
	}
	curcpu()->ci_xpq_idx = 0;
}

static inline void
xpq_increment_idx(void)
{
	KASSERT(curcpu()->ci_ilevel >= IPL_VM || cold);
	if (__predict_false(++curcpu()->ci_xpq_idx == XPQUEUE_SIZE))
		xpq_flush_queue();
}

void
xpq_queue_machphys_update(paddr_t ma, paddr_t pa)
{
	mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
	size_t xpq_idx = curcpu()->ci_xpq_idx;

	xpq_queue[xpq_idx].ptr = ma | MMU_MACHPHYS_UPDATE;
	xpq_queue[xpq_idx].val = pa >> PAGE_SHIFT;
	xpq_increment_idx();
}

void
xpq_queue_pte_update(paddr_t ptr, pt_entry_t val)
{
	mmu_update_t *xpq_queue = xpq_queue_array[curcpu()->ci_cpuid];
	size_t xpq_idx = curcpu()->ci_xpq_idx;

	xpq_queue[xpq_idx].ptr = ptr | MMU_NORMAL_PT_UPDATE;
	xpq_queue[xpq_idx].val = val;
	xpq_increment_idx();
}

void
xpq_queue_pt_switch(paddr_t pa)
{
	struct mmuext_op op;

	xpq_flush_queue();

	op.cmd = MMUEXT_NEW_BASEPTR;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xpq_queue_pin_table(paddr_t pa, int lvl)
{
	struct mmuext_op op;

	xpq_flush_queue();

	op.cmd = lvl;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xpq_queue_unpin_table(paddr_t pa)
{
	struct mmuext_op op;

	xpq_flush_queue();

	op.cmd = MMUEXT_UNPIN_TABLE;
	op.arg1.mfn = pa >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xpq_queue_set_ldt(vaddr_t va, uint32_t entries)
{
	struct mmuext_op op;

	xpq_flush_queue();

	KASSERT(va == (va & ~PAGE_MASK));
	op.cmd = MMUEXT_SET_LDT;
	op.arg1.linear_addr = va;
	op.arg2.nr_ents = entries;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xpq_queue_tlb_flush(void)
{
	struct mmuext_op op;
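	/* Apply any queued PTE updates before issuing the local TLB flush. */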

	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_LOCAL;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xpq_flush_cache(void)
{
	int s = splvm();

	xpq_flush_queue();

	asm("wbinvd":::"memory");
	splx(s);
}

void
xpq_queue_invlpg(vaddr_t va)
{
	struct mmuext_op op;

	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_LOCAL;
	op.arg1.linear_addr = (va & ~PAGE_MASK);
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xen_mcast_invlpg(vaddr_t va, kcpuset_t *kc)
{
	xcpumask_t xcpumask;
	mmuext_op_t op;

	kcpuset_export_u32(kc, &xcpumask.xcpum_km[0], sizeof(xcpumask));

	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_MULTI;
	op.arg1.linear_addr = va;
	set_xen_guest_handle(op.arg2.vcpumask, &xcpumask.xcpum_xm);

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xen_bcast_invlpg(vaddr_t va)
{
	mmuext_op_t op;

	xpq_flush_queue();

	op.cmd = MMUEXT_INVLPG_ALL;
	op.arg1.linear_addr = va;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

/* This is a synchronous call. */
void
xen_mcast_tlbflush(kcpuset_t *kc)
{
	xcpumask_t xcpumask;
	mmuext_op_t op;

	kcpuset_export_u32(kc, &xcpumask.xcpum_km[0], sizeof(xcpumask));

	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_MULTI;
	set_xen_guest_handle(op.arg2.vcpumask, &xcpumask.xcpum_xm);

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

/* This is a synchronous call. */
void
xen_bcast_tlbflush(void)
{
	mmuext_op_t op;

	xpq_flush_queue();

	op.cmd = MMUEXT_TLB_FLUSH_ALL;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xen_copy_page(paddr_t srcpa, paddr_t dstpa)
{
	mmuext_op_t op;

	op.cmd = MMUEXT_COPY_PAGE;
	op.arg1.mfn = xpmap_ptom(dstpa) >> PAGE_SHIFT;
	op.arg2.src_mfn = xpmap_ptom(srcpa) >> PAGE_SHIFT;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

void
xen_pagezero(paddr_t pa)
{
	mmuext_op_t op;

	op.cmd = MMUEXT_CLEAR_PAGE;
	op.arg1.mfn = xpmap_ptom(pa) >> PAGE_SHIFT;

	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic(__func__);
}

int
xpq_update_foreign(paddr_t ptr, pt_entry_t val, int dom, u_int flags)
{
	mmu_update_t op;
	int ok;
	int err;

	xpq_flush_queue();

	op.ptr = ptr;
	if (flags & PMAP_MD_XEN_NOTR)
		op.ptr |= MMU_PT_UPDATE_NO_TRANSLATE;
	op.val = val;
	/*
	 * Here we return the negative Xen error number to pmap_enter_ma.
	 * Only calls from privcmd.c should end up here, and privcmd.c can
	 * deal with it.
	 */
	if ((err = HYPERVISOR_mmu_update(&op, 1, &ok, dom)) < 0) {
		return err;
	}
	return 0;
}

#if L2_SLOT_KERNBASE > 0
#define TABLE_L2_ENTRIES (2 * (NKL2_KIMG_ENTRIES + 1))
#else
#define TABLE_L2_ENTRIES (NKL2_KIMG_ENTRIES + 1)
#endif

#ifdef __x86_64__
#define PDIRSZ	PTP_LEVELS
#else
/*
 * For PAE, we need an L3 page, a single contiguous L2 "superpage" of 4 pages
 * (all of them mapped by the L3 page), and a shadow page for L3[3].
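 * That is 1 + 4 + 1 = 6 directory pages in total.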
 */
#define PDIRSZ	(1 + 4 + 1)
#endif

/*
 * Xen locore: get rid of the Xen bootstrap tables. Build and switch to new page
 * tables.
 *
 * Virtual address space of the kernel when leaving this function:
 * +--------------+------------------+-------------+------------+---------------
 * | KERNEL IMAGE | BOOTSTRAP TABLES | PROC0 UAREA | DUMMY PAGE | HYPER. SHARED
 * +--------------+------------------+-------------+------------+---------------
 *
 * ------+-----------------+-------------+
 *  INFO | EARLY ZERO PAGE | ISA I/O MEM |
 * ------+-----------------+-------------+
 *
 * DUMMY PAGE is either a PGD for amd64 or a GDT for i386.
 *
 * (HYPER. SHARED INFO + EARLY ZERO PAGE + ISA I/O MEM) have no physical
 * addresses preallocated.
 */
vaddr_t
xen_locore(void)
{
	size_t nL2, oldcount, mapsize;
	vaddr_t our_tables, xen_tables;
	u_int descs[4];

	xen_init_features();

	xpmap_phys_to_machine_mapping =
	    (unsigned long *)xen_start_info.mfn_list;

	/* Set the NX/XD bit, if available. descs[3] = %edx. */
	x86_cpuid(0x80000001, descs);
	xpmap_pg_nx = (descs[3] & CPUID_NOX) ? PTE_NX : 0;

	/* Space after the Xen bootstrap tables should be free. */
	xen_tables = xen_start_info.pt_base;
	our_tables = xen_tables + (xen_start_info.nr_pt_frames * PAGE_SIZE);

	/*
	 * Calculate how much space we need. First, everything mapped before
	 * the Xen bootstrap tables.
	 */
	mapsize = xen_tables - KERNTEXTOFF;

	/* After the tables we'll have:
	 *  - UAREA
	 *  - dummy user PGD (x86_64)
	 *  - HYPERVISOR_shared_info
	 *  - early_zerop
	 *  - ISA I/O mem (if needed)
	 */
	mapsize += UPAGES * PAGE_SIZE;
#ifdef __x86_64__
	mapsize += PAGE_SIZE;
#endif
	mapsize += PAGE_SIZE;
	mapsize += PAGE_SIZE;
#ifdef DOM0OPS
	if (xendomain_is_dom0()) {
		mapsize += IOM_SIZE;
	}
#endif

	/*
	 * At this point, mapsize doesn't include the table size.
	 */
#ifdef __x86_64__
	nL2 = TABLE_L2_ENTRIES;
#else
	nL2 = (mapsize + (NBPD_L2 - 1)) >> L2_SHIFT;
#endif

	/*
	 * Now compute how many L2 pages we need exactly. This is useful only
	 * on i386, since the initial count for amd64 is already enough.
	 */
	while (KERNTEXTOFF + mapsize + (nL2 + PDIRSZ) * PAGE_SIZE >
	    KERNBASE + (nL2 << L2_SHIFT)) {
		nL2++;
	}

#ifdef i386
	/*
	 * One more L2 page: we'll allocate several pages after kva_start
	 * in pmap_bootstrap() before pmap_growkernel(), which have not been
	 * counted here. It's not a big issue to allocate one more L2 as
	 * pmap_growkernel() will be called anyway.
	 */
	nL2++;
	nkptp[1] = nL2;
#endif

	/*
	 * Install the bootstrap pages. We may need more L2 pages than the
	 * final table will have, as the bootstrap tables are installed after
	 * the final tables.
	 */
	oldcount = nL2;

bootstrap_again:

	/*
	 * The Xen space we'll reclaim may not be enough for our new page
	 * tables; move the bootstrap tables out of the way if necessary.
	 */
	if (our_tables < xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE))
		our_tables = xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE);

	/*
	 * Make sure the number of L2 pages we have is enough to map everything
	 * from KERNBASE to the bootstrap tables themselves.
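	 * Each counted table is one PTE page, i.e. one L2 slot covering
	 * 1 << L2_SHIFT bytes of VA; hence the (oldcount << L2_SHIFT) bound
	 * in the comparison below.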
	 */
	if (our_tables + ((oldcount + PDIRSZ) * PAGE_SIZE) >
	    KERNBASE + (oldcount << L2_SHIFT)) {
		oldcount++;
		goto bootstrap_again;
	}

	/* Create temporary tables */
	xen_bootstrap_tables(xen_tables, our_tables,
	    xen_start_info.nr_pt_frames, oldcount, false);

	/* Create final tables */
	xen_bootstrap_tables(our_tables, xen_tables,
	    oldcount + PDIRSZ, nL2, true);

	/* Zero out PROC0 UAREA and DUMMY PAGE. */
	memset((void *)(xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE)), 0,
	    (UPAGES + 1) * PAGE_SIZE);

	/* Finally, flush TLB. */
	xpq_queue_tlb_flush();

	return (xen_tables + ((nL2 + PDIRSZ) * PAGE_SIZE));
}

/*
 * Build a new table and switch to it.
 * old_count is # of old tables (including L4, L3 and L2).
 * new_count is # of new tables (PTE only).
 * We assume the areas don't overlap.
 */
static void
xen_bootstrap_tables(vaddr_t old_pgd, vaddr_t new_pgd, size_t old_count,
    size_t new_count, bool final)
{
	pd_entry_t *L4cpu, *L4, *L3, *L2, *pte;
	paddr_t addr;
	vaddr_t page, avail, map_end;
	int i;
	extern char __rodata_start;
	extern char __data_start;
	extern char __kernel_end;
	extern char *early_zerop; /* from pmap.c */
#ifdef i386
	extern union descriptor tmpgdt[];
#endif

	/*
	 * Layout of RW area after the kernel image:
	 *     xencons_interface (if present)
	 *     xenstore_interface (if present)
	 *     table pages (new_count + PDIRSZ entries)
	 * Extra mappings (only when final is true):
	 *     UAREA
	 *     dummy user PGD (x86_64 only) / GDT page (i386 only)
	 *     HYPERVISOR_shared_info
	 *     early_zerop
	 *     ISA I/O mem (if needed)
	 */
	map_end = new_pgd + ((new_count + PDIRSZ) * PAGE_SIZE);
	if (final) {
		map_end += UPAGES * PAGE_SIZE;
		xen_dummy_page = (vaddr_t)map_end;
		map_end += PAGE_SIZE;
		HYPERVISOR_shared_info = (shared_info_t *)map_end;
		map_end += PAGE_SIZE;
		early_zerop = (char *)map_end;
		map_end += PAGE_SIZE;
	}

	/*
	 * We always set atdevbase, as it's used by init386 to find the first
	 * available VA. map_end is updated only if we are dom0, so
	 * atdevbase -> atdevbase + IOM_SIZE will be mapped only in
	 * this case.
	 */
	if (final) {
		atdevbase = map_end;
#ifdef DOM0OPS
		if (xendomain_is_dom0()) {
			/* ISA I/O mem */
			map_end += IOM_SIZE;
		}
#endif
	}

	__PRINTK(("xen_bootstrap_tables map_end 0x%lx\n", map_end));
	__PRINTK(("console %#lx ", xen_start_info.console_mfn));
	__PRINTK(("xenstore %#" PRIx32 "\n", xen_start_info.store_mfn));

	avail = new_pgd;

	/*
	 * Create our page tables.
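	 * (amd64: a per-cpu L4, pmap_kernel()'s L4, one L3 and one L2;
	 * i386 PAE: one L3 and five contiguous L2 pages. The L1 (PTE)
	 * pages are created in the loop further below.)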
	 */

#ifdef __x86_64__
	/* per-cpu L4 */
	L4cpu = (pd_entry_t *)avail;
	memset(L4cpu, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/* pmap_kernel L4 */
	L4 = (pd_entry_t *)avail;
	memset(L4, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/* L3 */
	L3 = (pd_entry_t *)avail;
	memset(L3, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/* link L4->L3 */
	addr = ((u_long)L3) - KERNBASE;
	L4cpu[pl4_pi(KERNTEXTOFF)] = xpmap_ptom_masked(addr) | PTE_P | PTE_W;
	L4[pl4_pi(KERNTEXTOFF)] = xpmap_ptom_masked(addr) | PTE_P | PTE_W;

	/* L2 */
	L2 = (pd_entry_t *)avail;
	memset(L2, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/* link L3->L2 */
	addr = ((u_long)L2) - KERNBASE;
	L3[pl3_pi(KERNTEXTOFF)] = xpmap_ptom_masked(addr) | PTE_P | PTE_W;
#else
	/* no L4 on i386PAE */
	__USE(L4cpu);
	__USE(L4);

	/* L3 */
	L3 = (pd_entry_t *)avail;
	memset(L3, 0, PAGE_SIZE);
	avail += PAGE_SIZE;

	/*
	 * Our PAE-style level 2, 5 contiguous pages (4 L2 + 1 shadow).
	 *                  +-----------------+----------------+---------+
	 * Physical layout: | 3 * USERLAND L2 | L2 KERN SHADOW | L2 KERN |
	 *                  +-----------------+----------------+---------+
	 * However, we enter L3[3] into L2 KERN, and not L2 KERN SHADOW.
	 * This way, L2[L2_SLOT_KERN] always points to the shadow.
	 */
	L2 = (pd_entry_t *)avail;
	memset(L2, 0, PAGE_SIZE * 5);
	avail += PAGE_SIZE * 5;

	/*
	 * Link L2 pages in L3, with a special case for L2 KERN. Xen doesn't
	 * want RW permissions in L3 entries, it'll add them itself.
	 */
	addr = ((u_long)L2) - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		L3[i] = xpmap_ptom_masked(addr) | PTE_P;
	}
	addr += PAGE_SIZE;
	L3[3] = xpmap_ptom_masked(addr) | PTE_P;
#endif

	/* Level 1 */
	page = KERNTEXTOFF;
	for (i = 0; i < new_count; i ++) {
		vaddr_t cur_page = page;

		pte = (pd_entry_t *)avail;
		memset(pte, 0, PAGE_SIZE);
		avail += PAGE_SIZE;

		while (pl2_pi(page) == pl2_pi(cur_page)) {
			if (page >= map_end) {
				/* not mapped at all */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
			}
			pte[pl1_pi(page)] = xpmap_ptom_masked(page - KERNBASE);
			if (page == (vaddr_t)HYPERVISOR_shared_info) {
				pte[pl1_pi(page)] = xen_start_info.shared_info;
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.console.domU.mfn) {
				xencons_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.console_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
			}
			if ((xpmap_ptom_masked(page - KERNBASE) >> PAGE_SHIFT)
			    == xen_start_info.store_mfn) {
				xenstore_interface = (void *)page;
				pte[pl1_pi(page)] = xen_start_info.store_mfn;
				pte[pl1_pi(page)] <<= PAGE_SHIFT;
			}
#ifdef DOM0OPS
			if (page >= (vaddr_t)atdevbase &&
			    page < (vaddr_t)atdevbase + IOM_SIZE) {
				pte[pl1_pi(page)] =
				    IOM_BEGIN + (page - (vaddr_t)atdevbase);
				pte[pl1_pi(page)] |= xpmap_pg_nx;
			}
#endif

			pte[pl1_pi(page)] |= PTE_P;
			if (page < (vaddr_t)&__rodata_start) {
				/* Map the kernel text RX. Nothing to do. */
			} else if (page >= (vaddr_t)&__rodata_start &&
			    page < (vaddr_t)&__data_start) {
				/* Map the kernel rodata R. */
				pte[pl1_pi(page)] |= xpmap_pg_nx;
			} else if (page >= old_pgd &&
			    page < old_pgd + (old_count * PAGE_SIZE)) {
				/* Map the old page tables R. */
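				/*
				 * They must stay read-only while still pinned
				 * by Xen; they are made RW again after the
				 * unpin at the end of this function.
				 */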
				pte[pl1_pi(page)] |= xpmap_pg_nx;
			} else if (page >= new_pgd &&
			    page < new_pgd + ((new_count + PDIRSZ) * PAGE_SIZE)) {
				/* Map the new page tables R. */
				pte[pl1_pi(page)] |= xpmap_pg_nx;
#ifdef i386
			} else if (page == (vaddr_t)tmpgdt) {
				/*
				 * Map bootstrap gdt R/O. Later, we will re-add
				 * this page to uvm after making it writable.
				 */
				pte[pl1_pi(page)] = 0;
				page += PAGE_SIZE;
				continue;
#endif
			} else if (page >= (vaddr_t)&__data_start &&
			    page < (vaddr_t)&__kernel_end) {
				/* Map the kernel data+bss RW. */
				pte[pl1_pi(page)] |= PTE_W | xpmap_pg_nx;
			} else {
				/* Map the page RW. */
				pte[pl1_pi(page)] |= PTE_W | xpmap_pg_nx;
			}

			page += PAGE_SIZE;
		}

		addr = ((u_long)pte) - KERNBASE;
		L2[pl2_pi(cur_page)] = xpmap_ptom_masked(addr) | PTE_W | PTE_P;

		/* Mark readonly */
		xen_bt_set_readonly((vaddr_t)pte);
	}

	/* Install recursive page tables mapping */
#ifdef __x86_64__
	/* Recursive entry in pmap_kernel(). */
	L4[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)L4 - KERNBASE)
	    | PTE_P | xpmap_pg_nx;
	/* Recursive entry in higher-level per-cpu PD. */
	L4cpu[PDIR_SLOT_PTE] = xpmap_ptom_masked((paddr_t)L4cpu - KERNBASE)
	    | PTE_P | xpmap_pg_nx;

	/* Mark tables RO */
	xen_bt_set_readonly((vaddr_t)L2);
#else
	/* Copy L2 KERN into L2 KERN SHADOW, and reference the latter in cpu0. */
	memcpy(&L2[L2_SLOT_KERN + NPDPG], &L2[L2_SLOT_KERN], PAGE_SIZE);
	cpu_info_primary.ci_kpm_pdir = &L2[L2_SLOT_KERN + NPDPG];
	cpu_info_primary.ci_kpm_pdirpa =
	    (vaddr_t)cpu_info_primary.ci_kpm_pdir - KERNBASE;

	/*
	 * We don't enter a recursive entry from the L3 PD. Instead, we enter
	 * the first 4 L2 pages, which includes the kernel's L2 shadow. But we
	 * have to enter the shadow after switching %cr3, or Xen will refcount
	 * some PTEs with the wrong type.
	 */
	addr = (u_long)L2 - KERNBASE;
	for (i = 0; i < 3; i++, addr += PAGE_SIZE) {
		L2[PDIR_SLOT_PTE + i] = xpmap_ptom_masked(addr) | PTE_P |
		    xpmap_pg_nx;
	}

	/* Mark tables RO, and pin L2 KERN SHADOW. */
	addr = (u_long)L2 - KERNBASE;
	for (i = 0; i < 5; i++, addr += PAGE_SIZE) {
		xen_bt_set_readonly(((vaddr_t)L2) + PAGE_SIZE * i);
	}
	if (final) {
		addr = (u_long)L2 - KERNBASE + 3 * PAGE_SIZE;
		xpq_queue_pin_l2_table(xpmap_ptom_masked(addr));
	}
#endif

	xen_bt_set_readonly((vaddr_t)L3);
#ifdef __x86_64__
	xen_bt_set_readonly((vaddr_t)L4cpu);
#endif

	/* Pin the PGD */
#ifdef __x86_64__
	xpq_queue_pin_l4_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#else
	xpq_queue_pin_l3_table(xpmap_ptom_masked(new_pgd - KERNBASE));
#endif

	/* Save phys. addr of PDP, for libkvm. */
#ifdef __x86_64__
	PDPpaddr = (u_long)L4 - KERNBASE;
#else
	PDPpaddr = (u_long)L2 - KERNBASE; /* PDP is the L2 with PAE */
#endif

	/* Switch to new tables */
	xpq_queue_pt_switch(xpmap_ptom_masked(new_pgd - KERNBASE));

	if (final) {
#ifdef __x86_64__
		/* Save the address of the real per-cpu L4 page. */
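		/*
		 * This is the L4 that %cr3 now points at: new_pgd is the
		 * first page of the new area, i.e. L4cpu. pmap_kernel()'s
		 * own L4 was recorded in PDPpaddr above.
		 */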
		cpu_info_primary.ci_kpm_pdir = L4cpu;
		cpu_info_primary.ci_kpm_pdirpa = ((paddr_t)L4cpu - KERNBASE);
#else
		/* Save the address of the L3 page */
		cpu_info_primary.ci_pae_l3_pdir = L3;
		cpu_info_primary.ci_pae_l3_pdirpa = (new_pgd - KERNBASE);

		/* Now enter the kernel's PTE mappings */
		addr = (u_long)L2 - KERNBASE + PAGE_SIZE * 3;
		xpq_queue_pte_update(
		    xpmap_ptom(((vaddr_t)&L2[PDIR_SLOT_PTE + 3]) - KERNBASE),
		    xpmap_ptom_masked(addr) | PTE_P);
		xpq_flush_queue();
#endif
	}

	/*
	 * Now we can safely reclaim the space taken by the old tables.
	 */

	/* Unpin old PGD */
	xpq_queue_unpin_table(xpmap_ptom_masked(old_pgd - KERNBASE));

	/* Mark old tables RW */
	page = old_pgd;
	addr = xpmap_mtop((paddr_t)L2[pl2_pi(page)] & PTE_4KFRAME);
	pte = (pd_entry_t *)((u_long)addr + KERNBASE);
	pte += pl1_pi(page);
	while (page < old_pgd + (old_count * PAGE_SIZE) && page < map_end) {
		addr = xpmap_ptom(((u_long)pte) - KERNBASE);
		xpq_queue_pte_update(addr, *pte | PTE_W);
		page += PAGE_SIZE;
		/*
		 * Our PTEs are contiguous so it's safe to just "++" here.
		 */
		pte++;
	}
	xpq_flush_queue();
}

/*
 * Mark a page read-only, assuming vaddr = paddr + KERNBASE.
 */
static void
xen_bt_set_readonly(vaddr_t page)
{
	pt_entry_t entry;

	entry = xpmap_ptom_masked(page - KERNBASE);
	entry |= PTE_P | xpmap_pg_nx;

	HYPERVISOR_update_va_mapping(page, entry, UVMF_INVLPG);
}

#ifdef __x86_64__
void
xen_set_user_pgd(paddr_t page)
{
	struct mmuext_op op;

	int s = splvm();
	xpq_flush_queue();
	splx(s);
	op.cmd = MMUEXT_NEW_USER_BASEPTR;
	op.arg1.mfn = xpmap_ptom_masked(page) >> PAGE_SHIFT;
	if (HYPERVISOR_mmuext_op(&op, 1, NULL, DOMID_SELF) < 0)
		panic("xen_set_user_pgd: failed to install new user page"
		    " directory %#" PRIxPADDR, page);
}
#endif /* __x86_64__ */