/*	$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $	*/

/*
 * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $");

#include "opt_svs.h"
#include "opt_user_ldt.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/xcall.h>
#include <sys/reboot.h>

#include <x86/cputypes.h>

#include <machine/cpuvar.h>
#include <machine/frameasm.h>
#include <machine/gdt.h>
#include <machine/pmap_private.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

/*
 * Separate Virtual Space
 *
 * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
 * switch to a user pmap, the lower half of updirpa is populated with the
 * entries containing the userland pages.
 *
 * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * We use a special per-cpu page that we call UTLS, for User Thread Local
 * Storage. Each CPU has one UTLS page. This page has two VAs:
 *
 *  o When the user page tables are loaded in CR3, the VA to access this
 *    page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
 *    _constant_ across CPUs, but in the user page tables this VA points to
 *    the physical page of the UTLS that is _local_ to the CPU.
 *
 *  o When the kernel page tables are loaded in CR3, the VA to access this
 *    page is ci->ci_svs_utls.
 *
 * +----------------------------------------------------------------------+
 * | CPU0 Local Data (Physical Page)                                      |
 * |    +------------------+                              +-------------+ |
 * |    | User Page Tables |  SVS_UTLS -----------------> | cpu0's UTLS | |
 * |    +------------------+                              +-------------+ |
 * +-------------------------------------------------------------^--------+
 *                                                               |
 *                                                               +----------+
 *                                                                          |
 * +----------------------------------------------------------------------+ |
 * | CPU1 Local Data (Physical Page)                                      | |
 * |    +------------------+                              +-------------+ | |
 * |    | User Page Tables |  SVS_UTLS -----------------> | cpu1's UTLS | | |
 * |    +------------------+                              +-------------+ | |
 * +-------------------------------------------------------------^--------+ |
 *                                                               |          |
 * +------------------+                   /----------------------+          |
 * | Kern Page Tables | ci->ci_svs_utls                                     |
 * +------------------+                   \---------------------------------+
 *
 * The goal of the UTLS page is to provide an area where we can store whatever
 * we want, in a way that is accessible whether the kernel or the user page
 * tables are loaded in CR3.
 *
 * We store three 64-bit values in the UTLS page (the layout is sketched
 * after this list):
 *
 *  o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
 *    page tables.
 *
 *  o UTLS_SCRATCH: a dummy place where we temporarily store a value during
 *    the syscall entry procedure.
 *
 *  o UTLS_RSP0: the value we must put in RSP in order to have a stack where
 *    we can push the register states. This is used only during the syscall
 *    entry procedure, because there the CPU does not automatically switch
 *    RSP (it does not use the TSS.rsp0 mechanism described below).
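 *
 * For reference, the resulting layout of the page is simply (this mirrors
 * the struct svs_utls definition further below; the UTLS_* names above are
 * the matching offsets used by the assembly entry code):
 *
 *	struct svs_utls {
 *		paddr_t kpdirpa;	// UTLS_KPDIRPA
 *		uint64_t scratch;	// UTLS_SCRATCH
 *		vaddr_t rsp0;		// UTLS_RSP0
 *	};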
 *
 * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
 *
 * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
 * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
 * the stack of the new LWP. Then the execution continues. At some point, the
 * user LWP we context-switched to will perform a syscall or will receive an
 * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
 * stack. The kernel then pushes the register states on this stack, and
 * executes in kernel mode normally.
 *
 * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
 * when an interrupt is received while we were in kernel mode, the CPU does not
 * read TSS.rsp0. Instead, it just uses the current stack.
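 *
 * In pseudo-C, the whole non-SVS mechanism therefore boils down to a single
 * per-cpu store at context-switch time (names are schematic here; the real
 * update is performed in the context-switch path):
 *
 *	tss->tss_rsp0 = pcb->pcb_rsp0;	// per-cpu TSS <- per-lwp stack top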
 *
 * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
 * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
 * _not_ have associated physical addresses. They are only two VAs.
 *
 * The first page is unmapped and acts as a redzone. The second page is
 * dynamically kentered into the highest page of the real per-lwp kernel stack;
 * but pay close attention, it is kentered _only_ in the user page tables.
 * That is to say, the VA of this second page is mapped when the user page
 * tables are loaded, but not mapped when the kernel page tables are loaded.
 *
 * During a context switch, svs_lwp_switch() gets called first. This function
 * does the kenter job described above, not in the kernel page tables (that
 * are currently loaded), but in the user page tables (that are not loaded).
 *
 *        VIRTUAL ADDRESSES                        PHYSICAL ADDRESSES
 *
 * +-----------------------------+
 * | KERNEL PAGE TABLES          |
 * |  +-------------------+      |                 +-------------------+
 * |  | pcb_rsp0 (page 0) | ---------------------> | pcb_rsp0 (page 0) |
 * |  +-------------------+      |                 +-------------------+
 * |  | pcb_rsp0 (page 1) | ---------------------> | pcb_rsp0 (page 1) |
 * |  +-------------------+      |                 +-------------------+
 * |  | pcb_rsp0 (page 2) | ---------------------> | pcb_rsp0 (page 2) |
 * |  +-------------------+      |                 +-------------------+
 * |  | pcb_rsp0 (page 3) | ---------------------> | pcb_rsp0 (page 3) |
 * |  +-------------------+      |             +-> +-------------------+
 * +-----------------------------+             |
 *                                             |
 * +---------------------------------------+   |
 * | USER PAGE TABLES                      |   |
 * |  +----------------------------------+ |   |
 * |  | pcpuarea->ent[cid].rsp0 (page 0) | |   |
 * |  +----------------------------------+ |   |
 * |  | pcpuarea->ent[cid].rsp0 (page 1) | ----+
 * |  +----------------------------------+ |
 * +---------------------------------------+
 *
 * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
 * in TSS.rsp0. Later, when returning to userland on the lwp we context-
 * switched to, we will load the user page tables and execute in userland
 * normally.
 *
 * Next time an interrupt or syscall is received, the CPU will automatically
 * use TSS.rsp0 as a stack. Here it is executing with the user page tables
 * loaded, and therefore TSS.rsp0 is _mapped_.
 *
 * As part of the kernel entry procedure, we now switch CR3 to load the kernel
 * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
 *
 * Remember that only one page of the stack was mapped, and only in the user
 * page tables. We just switched to the kernel page tables, so we must update
 * RSP to be the real per-lwp kernel stack (pcb_rsp0). And we do so, without
 * touching the stack (since it is now unmapped, touching it would fault).
 *
 * After we updated RSP, we can continue execution exactly as in the non-SVS
 * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
 * we updated RSP to a totally different VA, this VA points to the same
 * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
 * still here even with the new RSP.
 *
 * Thanks to this double-kenter optimization, we don't need to copy the
 * trapframe during each user<->kernel transition.
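 *
 * In code, the "kenter only in the user page tables" step mentioned above is
 * a single PTE write; a simplified sketch of what svs_lwp_switch() below
 * does:
 *
 *	va = rounddown(pcb->pcb_rsp0, PAGE_SIZE);
 *	*ci->ci_svs_rsp0_pte = L1_BASE[pl1_i(va)];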
 *
 * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
 *    allowed; see the sketch after this list.
 *
 *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
 *    in their CR3 register. It must *not* be replaced by pm_cpus.
 *
 *  o When a context switch on the current CPU is made from a user LWP
 *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
 *    pm_kernel_cpus still contains the current CPU. This implies that the
 *    remote CPUs that execute other threads of the user process we just
 *    left will keep synchronizing us against their changes.
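 *
 * In practice, the locked update of a user slot always follows the same
 * pattern (this is the core of svs_pmap_sync() below):
 *
 *	mutex_enter(&ci->ci_svs_mtx);
 *	ci->ci_svs_updir[index] = pmap->pm_pdir[index];
 *	mutex_exit(&ci->ci_svs_mtx);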
 *
 * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
 *
 *  o PTE Space
 *  o Direct Map
 *  o Remote PCPU Areas
 *  o Kernel Heap
 *  o Kernel Image
 *
 * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Ordered from highest priority to lowest:
 *
 *  o The NMI stack is not double-entered. Therefore if we ever receive an NMI
 *    and leave it, the content of the stack will be visible to userland (via
 *    Meltdown). Normally we never leave NMIs, unless a privileged user
 *    launched PMCs. That's unlikely to happen: our PMC support is pretty
 *    minimal, and privileged-only.
 *
 *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
 *    makes sense on GENERIC_KASLR kernels.
 */

/* -------------------------------------------------------------------------- */

/* SVS_ENTER. */
extern uint8_t svs_enter, svs_enter_end;
static const struct x86_hotpatch_source hp_svs_enter_source = {
	.saddr = &svs_enter,
	.eaddr = &svs_enter_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_desc = {
	.name = HP_NAME_SVS_ENTER,
	.nsrc = 1,
	.srcs = { &hp_svs_enter_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_desc);

/* SVS_ENTER_ALT. */
extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
static const struct x86_hotpatch_source hp_svs_enter_altstack_source = {
	.saddr = &svs_enter_altstack,
	.eaddr = &svs_enter_altstack_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_altstack_desc = {
	.name = HP_NAME_SVS_ENTER_ALT,
	.nsrc = 1,
	.srcs = { &hp_svs_enter_altstack_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_altstack_desc);

/* SVS_ENTER_NMI. */
extern uint8_t svs_enter_nmi, svs_enter_nmi_end;
static const struct x86_hotpatch_source hp_svs_enter_nmi_source = {
	.saddr = &svs_enter_nmi,
	.eaddr = &svs_enter_nmi_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_nmi_desc = {
	.name = HP_NAME_SVS_ENTER_NMI,
	.nsrc = 1,
	.srcs = { &hp_svs_enter_nmi_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_nmi_desc);

/* SVS_LEAVE. */
extern uint8_t svs_leave, svs_leave_end;
static const struct x86_hotpatch_source hp_svs_leave_source = {
	.saddr = &svs_leave,
	.eaddr = &svs_leave_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_desc = {
	.name = HP_NAME_SVS_LEAVE,
	.nsrc = 1,
	.srcs = { &hp_svs_leave_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_desc);

/* SVS_LEAVE_ALT. */
extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
static const struct x86_hotpatch_source hp_svs_leave_altstack_source = {
	.saddr = &svs_leave_altstack,
	.eaddr = &svs_leave_altstack_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_altstack_desc = {
	.name = HP_NAME_SVS_LEAVE_ALT,
	.nsrc = 1,
	.srcs = { &hp_svs_leave_altstack_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_altstack_desc);

/* SVS_LEAVE_NMI. */
extern uint8_t svs_leave_nmi, svs_leave_nmi_end;
static const struct x86_hotpatch_source hp_svs_leave_nmi_source = {
	.saddr = &svs_leave_nmi,
	.eaddr = &svs_leave_nmi_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_nmi_desc = {
	.name = HP_NAME_SVS_LEAVE_NMI,
	.nsrc = 1,
	.srcs = { &hp_svs_leave_nmi_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_nmi_desc);

/* -------------------------------------------------------------------------- */

bool svs_enabled __read_mostly = false;
bool svs_pcid __read_mostly = false;

static uint64_t svs_pcid_kcr3 __read_mostly;
static uint64_t svs_pcid_ucr3 __read_mostly;

struct svs_utls {
	paddr_t kpdirpa;
	uint64_t scratch;
	vaddr_t rsp0;
};

static pd_entry_t *
svs_tree_add(struct cpu_info *ci, vaddr_t va)
{
	extern const vaddr_t ptp_masks[];
	extern const int ptp_shifts[];
	pd_entry_t *dstpde;
	struct vm_page *pg;
	size_t i, pidx;
	paddr_t pa;

	dstpde = ci->ci_svs_updir;

	for (i = PTP_LEVELS; i > 1; i--) {
		pidx = pl_pi(va, i);

		if (!pmap_valid_entry(dstpde[pidx])) {
			pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
			if (pg == NULL)
				panic("%s: failed to allocate PA for CPU %d\n",
				    __func__, cpu_index(ci));
			pa = VM_PAGE_TO_PHYS(pg);

			dstpde[pidx] = PTE_P | PTE_W | pa;
		}

		pa = (paddr_t)(dstpde[pidx] & PTE_FRAME);
		dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
	}

	return dstpde;
}

static void
svs_page_add(struct cpu_info *ci, vaddr_t va, bool global)
{
	pd_entry_t *srcpde, *dstpde, pde;
	size_t idx, pidx;
	paddr_t pa;

	/* Create levels L4, L3 and L2. */
	dstpde = svs_tree_add(ci, va);

	pidx = pl1_pi(va);

	/*
	 * If 'va' is in a large page, we need to compute its physical
	 * address manually.
	 */
	idx = pl2_i(va);
	srcpde = L2_BASE;
	if (!pmap_valid_entry(srcpde[idx])) {
		panic("%s: L2 page not mapped", __func__);
	}
	if (srcpde[idx] & PTE_PS) {
		KASSERT(!global);
		pa = srcpde[idx] & PTE_2MFRAME;
		pa += (paddr_t)(va % NBPD_L2);
		pde = (srcpde[idx] & ~(PTE_PS|PTE_2MFRAME)) | pa;

		if (pmap_valid_entry(dstpde[pidx])) {
			panic("%s: L1 page already mapped", __func__);
		}
		dstpde[pidx] = pde;
		return;
	}

	/*
	 * Normal page, just copy the PDE.
	 */
	idx = pl1_i(va);
	srcpde = L1_BASE;
	if (!pmap_valid_entry(srcpde[idx])) {
		panic("%s: L1 page not mapped", __func__);
	}
	if (pmap_valid_entry(dstpde[pidx])) {
		panic("%s: L1 page already mapped", __func__);
	}
	dstpde[pidx] = srcpde[idx];

	/*
	 * If we want a global translation, mark both the src and dst with
	 * PTE_G.
	 */
	if (global) {
		srcpde[idx] |= PTE_G;
		dstpde[pidx] |= PTE_G;
		tlbflushg();
	}
}

static void
svs_rsp0_init(struct cpu_info *ci)
{
	const cpuid_t cid = cpu_index(ci);
	vaddr_t va, rsp0;
	pd_entry_t *pd;
	size_t pidx;

	rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;

	/* The first page is a redzone. */
	va = rsp0 + PAGE_SIZE;

	/* Create levels L4, L3 and L2. */
	pd = svs_tree_add(ci, va);

	/* Get the info for L1. */
	pidx = pl1_i(va % NBPD_L2);
	if (pmap_valid_entry(pd[pidx])) {
		panic("%s: rsp0 page already mapped", __func__);
	}

	ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
	ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + PAGE_SIZE - 16;
	ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
	ci->ci_svs_krsp0 = 0;
}

static void
svs_utls_init(struct cpu_info *ci)
{
	const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
	struct svs_utls *utls;
	struct vm_page *pg;
	pd_entry_t *pd;
	size_t pidx;
	paddr_t pa;
	vaddr_t va;

	/* Create levels L4, L3 and L2 of the UTLS page. */
	pd = svs_tree_add(ci, utlsva);

	/* Allocate L1. */
	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
	if (pg == NULL)
		panic("%s: failed to allocate PA for CPU %d\n", __func__,
		    cpu_index(ci));
	pa = VM_PAGE_TO_PHYS(pg);

	/* Enter L1. */
	if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
		panic("%s: local page already mapped", __func__);
	}
	pidx = pl1_pi(utlsva);
	if (pmap_valid_entry(pd[pidx])) {
		panic("%s: L1 page already mapped", __func__);
	}
	pd[pidx] = PTE_P | PTE_W | pmap_pg_nx | pa;

	/*
	 * Now, allocate a VA in the kernel map, that points to the UTLS
	 * page. After that, the UTLS page will be accessible in kernel
	 * mode via ci_svs_utls.
	 */
	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
	if (va == 0) {
		panic("%s: unable to allocate VA\n", __func__);
	}
	pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());

	ci->ci_svs_utls = va;

	/* Initialize the constant fields of the UTLS page */
	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->rsp0 = ci->ci_svs_rsp0;
}

static void
svs_pcid_init(struct cpu_info *ci)
{
	if (!svs_pcid) {
		return;
	}

	svs_pcid_ucr3 = __SHIFTIN(PMAP_PCID_USER, CR3_PCID) | CR3_NO_TLB_FLUSH;
	svs_pcid_kcr3 = __SHIFTIN(PMAP_PCID_KERN, CR3_PCID) | CR3_NO_TLB_FLUSH;

	ci->ci_svs_updirpa |= svs_pcid_ucr3;
}

static void
svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size, bool global)
{
	size_t i, n;

	KASSERT(size % PAGE_SIZE == 0);
	n = size / PAGE_SIZE;
	for (i = 0; i < n; i++) {
		svs_page_add(ci, va + i * PAGE_SIZE, global);
	}
}

void
cpu_svs_init(struct cpu_info *ci)
{
	extern char __text_user_start;
	extern char __text_user_end;
	extern vaddr_t idt_vaddr;
	const cpuid_t cid = cpu_index(ci);
	struct vm_page *pg;

	KASSERT(ci != NULL);

	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
	if (pg == NULL)
		panic("%s: failed to allocate L4 PA for CPU %d\n",
		    __func__, cpu_index(ci));
	ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);

	ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
	if (ci->ci_svs_updir == NULL)
		panic("%s: failed to allocate L4 VA for CPU %d\n",
		    __func__, cpu_index(ci));

	pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
	    VM_PROT_READ | VM_PROT_WRITE, 0);

	pmap_update(pmap_kernel());

	mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);

	if (cid == cpu_index(&cpu_info_primary) || !idt_vec_is_pcpu())
		svs_page_add(ci, idt_vaddr, true);
	svs_page_add(ci, (vaddr_t)&pcpuarea->ldt, true);
	svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
	    offsetof(struct pcpu_entry, rsp0), true);
	svs_range_add(ci, (vaddr_t)&__text_user_start,
	    (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start, false);

	svs_rsp0_init(ci);
	svs_utls_init(ci);
	svs_pcid_init(ci);

#ifdef USER_LDT
	mutex_enter(&cpu_lock);
	ci->ci_svs_ldt_sel = ldt_alloc(&pcpuarea->ent[cid].ldt,
	    MAX_USERLDT_SIZE);
	mutex_exit(&cpu_lock);
#endif
}

void
svs_pmap_sync(struct pmap *pmap, int index)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	cpuid_t cid;

	KASSERT(pmap != NULL);
	KASSERT(pmap != pmap_kernel());
	KASSERT(pmap_is_user(pmap));
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
	KASSERT(index < PDIR_SLOT_USERLIM);

	ci = curcpu();
	cid = cpu_index(ci);

	mutex_enter(&ci->ci_svs_mtx);
	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
	ci->ci_svs_updir[index] = pmap->pm_pdir[index];
	mutex_exit(&ci->ci_svs_mtx);

	if (!kcpuset_isotherset(pmap->pm_kernel_cpus, cid)) {
		return;
	}

	for (CPU_INFO_FOREACH(cii, ci)) {
		cid = cpu_index(ci);

		if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
			continue;
		}

		/* take the lock and check again */
		mutex_enter(&ci->ci_svs_mtx);
		if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
			ci->ci_svs_updir[index] = pmap->pm_pdir[index];
		}
		mutex_exit(&ci->ci_svs_mtx);
	}
}

void
svs_ldt_sync(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	void *ldt;
	int sel;

	KASSERT(kpreempt_disabled());

	/*
	 * Another LWP could concurrently modify the LDT via x86_set_ldt1().
	 * The LWP will wait for pmap_ldt_sync() to finish before destroying
	 * the outdated LDT.
	 *
	 * We have preemption disabled here, so it is guaranteed that even
	 * if the LDT we are syncing is the outdated one, it is still valid.
	 *
	 * pmap_ldt_sync() will execute later once we have preemption enabled,
	 * and will install the new LDT.
	 */
	sel = atomic_load_relaxed(&pmap->pm_ldt_sel);
	if (__predict_false(sel != GSYSSEL(GLDT_SEL, SEL_KPL))) {
		ldt = atomic_load_relaxed(&pmap->pm_ldt);
		memcpy(&pcpuarea->ent[cpu_index(ci)].ldt, ldt,
		    MAX_USERLDT_SIZE);
		sel = ci->ci_svs_ldt_sel;
	}

	lldt(sel);
}

void
svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
	struct cpu_info *ci = curcpu();
	struct svs_utls *utls;
	struct pcb *pcb;
	pt_entry_t *pte;
	uintptr_t rsp0;
	vaddr_t va;

	if (newlwp->l_flag & LW_SYSTEM) {
		return;
	}

#ifdef DIAGNOSTIC
	if (!(oldlwp->l_flag & LW_SYSTEM)) {
		pcb = lwp_getpcb(oldlwp);
		rsp0 = pcb->pcb_rsp0;
		va = rounddown(rsp0, PAGE_SIZE);
		KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
		pte = ci->ci_svs_rsp0_pte;
		KASSERT(*pte == L1_BASE[pl1_i(va)]);
	}
#endif

	pcb = lwp_getpcb(newlwp);
	rsp0 = pcb->pcb_rsp0;
	va = rounddown(rsp0, PAGE_SIZE);

	/* Update the kernel rsp0 in cpu_info */
	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
	    (ci->ci_svs_ursp0 % PAGE_SIZE));

	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->scratch = 0;

	/*
	 * Enter the user rsp0. If we're using PCID we must flush the user VA,
	 * if we aren't it will be flushed during the next CR3 reload.
	 */
	pte = ci->ci_svs_rsp0_pte;
	*pte = L1_BASE[pl1_i(va)];
	if (svs_pcid) {
		invpcid(INVPCID_ADDRESS, PMAP_PCID_USER, ci->ci_svs_rsp0);
	}
}

/*
 * We may come here with the pmap unlocked. If a remote CPU is updating the
 * user slots at the same time, it's not a problem: the remote CPU will call
 * svs_pmap_sync afterwards, and our updirpa will be synchronized properly.
 */
void
svs_pdir_switch(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	struct svs_utls *utls;

	KASSERT(kpreempt_disabled());
	KASSERT(pmap != pmap_kernel());
	KASSERT(pmap_is_user(pmap));

	/* Update the info in the UTLS page */
	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->kpdirpa = pmap_pdirpa(pmap, 0) | svs_pcid_kcr3;

	/* Copy user slots. */
	mutex_enter(&ci->ci_svs_mtx);
	svs_quad_copy(ci->ci_svs_updir, pmap->pm_pdir, PDIR_SLOT_USERLIM);
	mutex_exit(&ci->ci_svs_mtx);

	if (svs_pcid) {
		invpcid(INVPCID_CONTEXT, PMAP_PCID_USER, 0);
	}
}

static void
svs_enable(void)
{
	svs_enabled = true;

	x86_hotpatch(HP_NAME_SVS_ENTER, 0);
	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, 0);
	x86_hotpatch(HP_NAME_SVS_ENTER_NMI, 0);

	x86_hotpatch(HP_NAME_SVS_LEAVE, 0);
	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, 0);
	x86_hotpatch(HP_NAME_SVS_LEAVE_NMI, 0);
}

void
svs_init(void)
{
	uint64_t msr;

	if (cpu_vendor != CPUVENDOR_INTEL) {
		return;
	}
	if (boothowto & RB_MD3) {
		return;
	}
	if (cpu_info_primary.ci_feat_val[7] & CPUID_SEF_ARCH_CAP) {
		msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
		if (msr & IA32_ARCH_RDCL_NO) {
			/*
			 * The processor indicates it is not vulnerable to the
			 * Rogue Data Cache Load (Meltdown) flaw.
			 */
			return;
		}
	}

	if ((cpu_info_primary.ci_feat_val[1] & CPUID2_PCID) &&
	    (cpu_info_primary.ci_feat_val[5] & CPUID_SEF_INVPCID)) {
		svs_pcid = true;
		lcr4(rcr4() | CR4_PCIDE);
	}

	svs_enable();
}