/*	$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $	*/

/*
 * Copyright (c) 2018-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Maxime Villard.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: svs.c,v 1.42 2022/09/24 11:05:18 riastradh Exp $");

#include "opt_svs.h"
#include "opt_user_ldt.h"

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/cpu.h>
#include <sys/kauth.h>
#include <sys/sysctl.h>
#include <sys/xcall.h>
#include <sys/reboot.h>

#include <x86/cputypes.h>

#include <machine/cpuvar.h>
#include <machine/frameasm.h>
#include <machine/gdt.h>
#include <machine/pmap_private.h>

#include <uvm/uvm.h>
#include <uvm/uvm_page.h>

/*
 * Separate Virtual Space
 *
 * A per-cpu L4 page is maintained in ci_svs_updirpa. During each context
 * switch to a user pmap, the lower half of updirpa is populated with the
 * entries containing the userland pages.
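 *
 * As an illustration, that population step boils down to the following
 * sketch (it only condenses svs_pdir_switch() further below, under the
 * per-CPU ci_svs_mtx lock; nothing new is introduced here):
 *
 *	mutex_enter(&ci->ci_svs_mtx);
 *	svs_quad_copy(ci->ci_svs_updir, pmap->pm_pdir, PDIR_SLOT_USERLIM);
 *	mutex_exit(&ci->ci_svs_mtx);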
 *
 * ~~~~~~~~~~ The UTLS Page ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * We use a special per-cpu page that we call UTLS, for User Thread Local
 * Storage. Each CPU has one UTLS page. This page has two VAs:
 *
 *  o When the user page tables are loaded in CR3, the VA to access this
 *    page is &pcpuarea->utls, defined as SVS_UTLS in assembly. This VA is
 *    _constant_ across CPUs, but in the user page tables this VA points to
 *    the physical page of the UTLS that is _local_ to the CPU.
 *
 *  o When the kernel page tables are loaded in CR3, the VA to access this
 *    page is ci->ci_svs_utls.
 *
 * +----------------------------------------------------------------------+
 * | CPU0 Local Data                                      (Physical Page) |
 * | +------------------+                                 +-------------+ |
 * | | User Page Tables | SVS_UTLS ---------------------> | cpu0's UTLS | |
 * | +------------------+                                 +-------------+ |
 * +-------------------------------------------------------------^--------+
 *                                                               |
 *                                                               +----------+
 *                                                                          |
 * +----------------------------------------------------------------------+ |
 * | CPU1 Local Data                                      (Physical Page) | |
 * | +------------------+                                 +-------------+ | |
 * | | User Page Tables | SVS_UTLS ---------------------> | cpu1's UTLS | | |
 * | +------------------+                                 +-------------+ | |
 * +-------------------------------------------------------------^--------+ |
 *                                                               |          |
 *   +------------------+                 /----------------------+          |
 *   | Kern Page Tables | ci->ci_svs_utls                                   |
 *   +------------------+                 \---------------------------------+
 *
 * The goal of the UTLS page is to provide an area where we can store whatever
 * we want, in a way that is accessible both when the Kernel and when the
 * User page tables are loaded in CR3.
 *
 * We store in the UTLS page three 64-bit values:
 *
 *  o UTLS_KPDIRPA: the value we must put in CR3 in order to load the kernel
 *    page tables.
 *
 *  o UTLS_SCRATCH: a dummy place where we temporarily store a value during
 *    the syscall entry procedure.
 *
 *  o UTLS_RSP0: the value we must put in RSP in order to have a stack where
 *    we can push the register states. This is used only during the syscall
 *    entry procedure, because there the CPU does not automatically switch
 *    RSP (it does not use the TSS.rsp0 mechanism described below).
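 *
 * For illustration, these three values simply mirror the fields of the
 * struct svs_utls defined further below; the numeric offsets here are an
 * assumption derived from that structure (three 8-byte fields on amd64):
 *
 *	UTLS_KPDIRPA = 0	offsetof(struct svs_utls, kpdirpa)
 *	UTLS_SCRATCH = 8	offsetof(struct svs_utls, scratch)
 *	UTLS_RSP0    = 16	offsetof(struct svs_utls, rsp0)
 *
 * so the assembly entry code can fetch the kernel CR3 value with something
 * like "movq SVS_UTLS+UTLS_KPDIRPA,%rax".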
 *
 * ~~~~~~~~~~ The Stack Switching Mechanism Without SVS ~~~~~~~~~~~~~~~~~~~~~~
 *
 * The kernel stack is per-lwp (pcb_rsp0). When doing a context switch between
 * two user LWPs, the kernel updates TSS.rsp0 (which is per-cpu) to point to
 * the stack of the new LWP. Then the execution continues. At some point, the
 * user LWP we context-switched to will perform a syscall or will receive an
 * interrupt. There, the CPU will automatically read TSS.rsp0 and use it as a
 * stack. The kernel then pushes the register states on this stack, and
 * executes in kernel mode normally.
 *
 * TSS.rsp0 is used by the CPU only during ring3->ring0 transitions. Therefore,
 * when an interrupt is received while we were in kernel mode, the CPU does not
 * read TSS.rsp0. Instead, it just uses the current stack.
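 *
 * In other words, the non-SVS switch path only needs an update along the
 * lines of the following sketch (the TSS field names here are an
 * assumption; the real update is done by the context switch code):
 *
 *	pcb = lwp_getpcb(newlwp);
 *	ci->ci_tss->tss.tss_rsp0 = pcb->pcb_rsp0;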
 *
 * ~~~~~~~~~~ The Stack Switching Mechanism With SVS ~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * In the pcpu_area structure, pointed to by the "pcpuarea" variable, each CPU
 * has a two-page rsp0 entry (pcpuarea->ent[cid].rsp0). These two pages do
 * _not_ have associated physical addresses. They are only two VAs.
 *
 * The first page is unmapped and acts as a redzone. The second page is
 * dynamically kentered into the highest page of the real per-lwp kernel stack;
 * but pay close attention, it is kentered _only_ in the user page tables.
 * That is to say, the VA of this second page is mapped when the user page
 * tables are loaded, but not mapped when the kernel page tables are loaded.
 *
 * During a context switch, svs_lwp_switch() gets called first. This function
 * does the kenter job described above, not in the kernel page tables (that
 * are currently loaded), but in the user page tables (that are not loaded).
 *
 *           VIRTUAL ADDRESSES                     PHYSICAL ADDRESSES
 *
 * +-----------------------------+
 * |      KERNEL PAGE TABLES     |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 0) | ------------------> | pcb_rsp0 (page 0) |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 1) | ------------------> | pcb_rsp0 (page 1) |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 2) | ------------------> | pcb_rsp0 (page 2) |
 * |    +-------------------+    |                +-------------------+
 * |    | pcb_rsp0 (page 3) | ------------------> | pcb_rsp0 (page 3) |
 * |    +-------------------+    |            +-> +-------------------+
 * +-----------------------------+            |
 *                                            |
 * +---------------------------------------+  |
 * |           USER PAGE TABLES            |  |
 * | +----------------------------------+  |  |
 * | | pcpuarea->ent[cid].rsp0 (page 0) |  |  |
 * | +----------------------------------+  |  |
 * | | pcpuarea->ent[cid].rsp0 (page 1) | ----+
 * | +----------------------------------+  |
 * +---------------------------------------+
 *
 * After svs_lwp_switch() gets called, we set pcpuarea->ent[cid].rsp0 (page 1)
 * in TSS.rsp0. Later, when returning to userland on the lwp we context-
 * switched to, we will load the user page tables and execute in userland
 * normally.
 *
 * Next time an interrupt or syscall is received, the CPU will automatically
 * use TSS.rsp0 as a stack. Here it is executing with the user page tables
 * loaded, and therefore TSS.rsp0 is _mapped_.
 *
 * As part of the kernel entry procedure, we now switch CR3 to load the kernel
 * page tables. Here, we are still using the stack pointer we set in TSS.rsp0.
 * Remember that only one page of the stack was mapped, and only in the user
 * page tables. We just switched to the kernel page tables, so we must update
 * RSP to point to the real per-lwp kernel stack (pcb_rsp0). And we do so
 * without touching the stack (it is now unmapped; touching it would fault).
 *
 * After we updated RSP, we can continue execution exactly as in the non-SVS
 * case. We don't need to copy the values the CPU pushed on TSS.rsp0: even if
 * we updated RSP to a totally different VA, this VA points to the same
 * physical page as TSS.rsp0. So in the end, the values the CPU pushed are
 * still here even with the new RSP.
 *
 * Thanks to this double-kenter optimization, we don't need to copy the
 * trapframe during each user<->kernel transition.
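 *
 * Put differently, the SVS kernel entry reduces to a sketch like the
 * following (SVS_UTLS is the assembly-side name of the user-PT alias VA
 * described above; the authoritative sequence is the hotpatched svs_enter
 * code registered further below):
 *
 *	utls = (struct svs_utls *)SVS_UTLS;	(still readable: user PTs)
 *	lcr3(utls->kpdirpa);			(kernel page tables loaded)
 *	rsp = ci->ci_svs_krsp0;			(real stack, same page offset;
 *						 done without touching the
 *						 now-unmapped old stack)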
 *
 * ~~~~~~~~~~ Notes On Locking And Synchronization ~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 *  o Touching ci_svs_updir without holding ci_svs_mtx first is *not*
 *    allowed (see the sketch after this list).
 *
 *  o pm_kernel_cpus contains the set of CPUs that have the pmap loaded
 *    in their CR3 register. It must *not* be replaced by pm_cpus.
 *
 *  o When a context switch on the current CPU is made from a user LWP
 *    towards a kernel LWP, CR3 is not updated. Therefore, the pmap's
 *    pm_kernel_cpus still contains the current CPU. This implies that the
 *    remote CPUs that execute other threads of the user process we just
 *    left will keep synchronizing us against their changes.
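 *
 *    For example, a locked update of one slot of ci_svs_updir looks like
 *    the following sketch (this only condenses svs_pmap_sync() below,
 *    nothing new is introduced):
 *
 *	mutex_enter(&ci->ci_svs_mtx);
 *	if (kcpuset_isset(pmap->pm_kernel_cpus, cpu_index(ci)))
 *		ci->ci_svs_updir[index] = pmap->pm_pdir[index];
 *	mutex_exit(&ci->ci_svs_mtx);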
 *
 * ~~~~~~~~~~ List Of Areas That Are Removed From Userland ~~~~~~~~~~~~~~~~~~~
 *
 *  o PTE Space
 *  o Direct Map
 *  o Remote PCPU Areas
 *  o Kernel Heap
 *  o Kernel Image
 *
 * ~~~~~~~~~~ Todo List ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 *
 * Ordered from highest priority to lowest:
 *
 *  o The NMI stack is not double-entered. Therefore, if we ever receive an NMI
 *    and leave it, the content of the stack will be visible to userland (via
 *    Meltdown). Normally we never leave NMIs, unless a privileged user
 *    launched PMCs. That's unlikely to happen: our PMC support is pretty
 *    minimal, and privileged-only.
 *
 *  o Narrow down the entry points: hide the 'jmp handler' instructions. This
 *    makes sense on GENERIC_KASLR kernels.
 */

/* -------------------------------------------------------------------------- */

/* SVS_ENTER. */
extern uint8_t svs_enter, svs_enter_end;
static const struct x86_hotpatch_source hp_svs_enter_source = {
	.saddr = &svs_enter,
	.eaddr = &svs_enter_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_desc = {
	.name = HP_NAME_SVS_ENTER,
	.nsrc = 1,
	.srcs = { &hp_svs_enter_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_desc);

/* SVS_ENTER_ALT. */
extern uint8_t svs_enter_altstack, svs_enter_altstack_end;
static const struct x86_hotpatch_source hp_svs_enter_altstack_source = {
	.saddr = &svs_enter_altstack,
	.eaddr = &svs_enter_altstack_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_altstack_desc = {
	.name = HP_NAME_SVS_ENTER_ALT,
	.nsrc = 1,
	.srcs = { &hp_svs_enter_altstack_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_altstack_desc);

/* SVS_ENTER_NMI. */
extern uint8_t svs_enter_nmi, svs_enter_nmi_end;
static const struct x86_hotpatch_source hp_svs_enter_nmi_source = {
	.saddr = &svs_enter_nmi,
	.eaddr = &svs_enter_nmi_end
};
static const struct x86_hotpatch_descriptor hp_svs_enter_nmi_desc = {
	.name = HP_NAME_SVS_ENTER_NMI,
	.nsrc = 1,
	.srcs = { &hp_svs_enter_nmi_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_enter_nmi_desc);

/* SVS_LEAVE. */
extern uint8_t svs_leave, svs_leave_end;
static const struct x86_hotpatch_source hp_svs_leave_source = {
	.saddr = &svs_leave,
	.eaddr = &svs_leave_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_desc = {
	.name = HP_NAME_SVS_LEAVE,
	.nsrc = 1,
	.srcs = { &hp_svs_leave_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_desc);

/* SVS_LEAVE_ALT. */
extern uint8_t svs_leave_altstack, svs_leave_altstack_end;
static const struct x86_hotpatch_source hp_svs_leave_altstack_source = {
	.saddr = &svs_leave_altstack,
	.eaddr = &svs_leave_altstack_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_altstack_desc = {
	.name = HP_NAME_SVS_LEAVE_ALT,
	.nsrc = 1,
	.srcs = { &hp_svs_leave_altstack_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_altstack_desc);

/* SVS_LEAVE_NMI. */
extern uint8_t svs_leave_nmi, svs_leave_nmi_end;
static const struct x86_hotpatch_source hp_svs_leave_nmi_source = {
	.saddr = &svs_leave_nmi,
	.eaddr = &svs_leave_nmi_end
};
static const struct x86_hotpatch_descriptor hp_svs_leave_nmi_desc = {
	.name = HP_NAME_SVS_LEAVE_NMI,
	.nsrc = 1,
	.srcs = { &hp_svs_leave_nmi_source }
};
__link_set_add_rodata(x86_hotpatch_descriptors, hp_svs_leave_nmi_desc);

/* -------------------------------------------------------------------------- */

bool svs_enabled __read_mostly = false;
bool svs_pcid __read_mostly = false;

static uint64_t svs_pcid_kcr3 __read_mostly;
static uint64_t svs_pcid_ucr3 __read_mostly;

struct svs_utls {
	paddr_t kpdirpa;
	uint64_t scratch;
	vaddr_t rsp0;
};

static pd_entry_t *
svs_tree_add(struct cpu_info *ci, vaddr_t va)
{
	extern const vaddr_t ptp_masks[];
	extern const int ptp_shifts[];
	pd_entry_t *dstpde;
	struct vm_page *pg;
	size_t i, pidx;
	paddr_t pa;

	dstpde = ci->ci_svs_updir;

	for (i = PTP_LEVELS; i > 1; i--) {
		pidx = pl_pi(va, i);

		if (!pmap_valid_entry(dstpde[pidx])) {
			pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
			if (pg == NULL)
				panic("%s: failed to allocate PA for CPU %d\n",
					__func__, cpu_index(ci));
			pa = VM_PAGE_TO_PHYS(pg);

			dstpde[pidx] = PTE_P | PTE_W | pa;
		}

		pa = (paddr_t)(dstpde[pidx] & PTE_FRAME);
		dstpde = (pd_entry_t *)PMAP_DIRECT_MAP(pa);
	}

	return dstpde;
}
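
/*
 * Example: once svs_tree_add() has built the L4/L3/L2 levels for 'va',
 * mirroring a normal 4KB kernel page is a single L1 copy. This is only a
 * condensed sketch of what svs_page_add() below does; nothing new.
 *
 *	pd_entry_t *pd = svs_tree_add(ci, va);
 *	pd[pl1_pi(va)] = L1_BASE[pl1_i(va)];
 */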

static void
svs_page_add(struct cpu_info *ci, vaddr_t va, bool global)
{
	pd_entry_t *srcpde, *dstpde, pde;
	size_t idx, pidx;
	paddr_t pa;

	/* Create levels L4, L3 and L2. */
	dstpde = svs_tree_add(ci, va);

	pidx = pl1_pi(va);

	/*
	 * If 'va' is in a large page, we need to compute its physical
	 * address manually.
	 */
	idx = pl2_i(va);
	srcpde = L2_BASE;
	if (!pmap_valid_entry(srcpde[idx])) {
		panic("%s: L2 page not mapped", __func__);
	}
	if (srcpde[idx] & PTE_PS) {
		KASSERT(!global);
		pa = srcpde[idx] & PTE_2MFRAME;
		pa += (paddr_t)(va % NBPD_L2);
		pde = (srcpde[idx] & ~(PTE_PS|PTE_2MFRAME)) | pa;

		if (pmap_valid_entry(dstpde[pidx])) {
			panic("%s: L1 page already mapped", __func__);
		}
		dstpde[pidx] = pde;
		return;
	}

	/*
	 * Normal page, just copy the PDE.
	 */
	idx = pl1_i(va);
	srcpde = L1_BASE;
	if (!pmap_valid_entry(srcpde[idx])) {
		panic("%s: L1 page not mapped", __func__);
	}
	if (pmap_valid_entry(dstpde[pidx])) {
		panic("%s: L1 page already mapped", __func__);
	}
	dstpde[pidx] = srcpde[idx];

	/*
	 * If we want a global translation, mark both the src and dst with
	 * PTE_G.
	 */
	if (global) {
		srcpde[idx] |= PTE_G;
		dstpde[pidx] |= PTE_G;
		tlbflushg();
	}
}

static void
svs_rsp0_init(struct cpu_info *ci)
{
	const cpuid_t cid = cpu_index(ci);
	vaddr_t va, rsp0;
	pd_entry_t *pd;
	size_t pidx;

	rsp0 = (vaddr_t)&pcpuarea->ent[cid].rsp0;

	/* The first page is a redzone. */
	va = rsp0 + PAGE_SIZE;

	/* Create levels L4, L3 and L2. */
	pd = svs_tree_add(ci, va);

	/* Get the info for L1. */
	pidx = pl1_i(va % NBPD_L2);
	if (pmap_valid_entry(pd[pidx])) {
		panic("%s: rsp0 page already mapped", __func__);
	}

	ci->ci_svs_rsp0_pte = (pt_entry_t *)&pd[pidx];
	ci->ci_svs_rsp0 = rsp0 + PAGE_SIZE + sizeof(struct trapframe);
	ci->ci_svs_ursp0 = ci->ci_svs_rsp0 - sizeof(struct trapframe);
	ci->ci_svs_krsp0 = 0;
}

static void
svs_utls_init(struct cpu_info *ci)
{
	const vaddr_t utlsva = (vaddr_t)&pcpuarea->utls;
	struct svs_utls *utls;
	struct vm_page *pg;
	pd_entry_t *pd;
	size_t pidx;
	paddr_t pa;
	vaddr_t va;

	/* Create levels L4, L3 and L2 of the UTLS page. */
	pd = svs_tree_add(ci, utlsva);

	/* Allocate L1. */
	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
	if (pg == NULL)
		panic("%s: failed to allocate PA for CPU %d\n", __func__,
		    cpu_index(ci));
	pa = VM_PAGE_TO_PHYS(pg);

	/* Enter L1. */
	if (pmap_valid_entry(L1_BASE[pl1_i(utlsva)])) {
		panic("%s: local page already mapped", __func__);
	}
	pidx = pl1_pi(utlsva);
	if (pmap_valid_entry(pd[pidx])) {
		panic("%s: L1 page already mapped", __func__);
	}
	pd[pidx] = PTE_P | PTE_W | pmap_pg_nx | pa;

	/*
	 * Now, allocate a VA in the kernel map that points to the UTLS
	 * page. After that, the UTLS page will be accessible in kernel
	 * mode via ci_svs_utls.
	 */
	va = uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
	    UVM_KMF_VAONLY|UVM_KMF_NOWAIT);
	if (va == 0) {
		panic("%s: unable to allocate VA\n", __func__);
	}
	pmap_kenter_pa(va, pa, VM_PROT_READ|VM_PROT_WRITE, 0);
	pmap_update(pmap_kernel());

	ci->ci_svs_utls = va;

	/* Initialize the constant fields of the UTLS page */
	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->rsp0 = ci->ci_svs_rsp0;
}

static void
svs_pcid_init(struct cpu_info *ci)
{
	if (!svs_pcid) {
		return;
	}

	svs_pcid_ucr3 = __SHIFTIN(PMAP_PCID_USER, CR3_PCID) | CR3_NO_TLB_FLUSH;
	svs_pcid_kcr3 = __SHIFTIN(PMAP_PCID_KERN, CR3_PCID) | CR3_NO_TLB_FLUSH;

	ci->ci_svs_updirpa |= svs_pcid_ucr3;
}

static void
svs_range_add(struct cpu_info *ci, vaddr_t va, size_t size, bool global)
{
	size_t i, n;

	KASSERT(size % PAGE_SIZE == 0);
	n = size / PAGE_SIZE;
	for (i = 0; i < n; i++) {
		svs_page_add(ci, va + i * PAGE_SIZE, global);
	}
}

void
cpu_svs_init(struct cpu_info *ci)
{
	extern char __text_user_start;
	extern char __text_user_end;
	extern vaddr_t idt_vaddr;
	const cpuid_t cid = cpu_index(ci);
	struct vm_page *pg;

	KASSERT(ci != NULL);

	pg = uvm_pagealloc(NULL, 0, NULL, UVM_PGA_ZERO);
	if (pg == NULL)
		panic("%s: failed to allocate L4 PA for CPU %d\n",
			__func__, cpu_index(ci));
	ci->ci_svs_updirpa = VM_PAGE_TO_PHYS(pg);

	ci->ci_svs_updir = (pt_entry_t *)uvm_km_alloc(kernel_map, PAGE_SIZE, 0,
		UVM_KMF_VAONLY | UVM_KMF_NOWAIT);
	if (ci->ci_svs_updir == NULL)
		panic("%s: failed to allocate L4 VA for CPU %d\n",
			__func__, cpu_index(ci));

	pmap_kenter_pa((vaddr_t)ci->ci_svs_updir, ci->ci_svs_updirpa,
		VM_PROT_READ | VM_PROT_WRITE, 0);

	pmap_update(pmap_kernel());

	mutex_init(&ci->ci_svs_mtx, MUTEX_DEFAULT, IPL_VM);

	if (cid == cpu_index(&cpu_info_primary) || !idt_vec_is_pcpu())
		svs_page_add(ci, idt_vaddr, true);
	svs_page_add(ci, (vaddr_t)&pcpuarea->ldt, true);
	svs_range_add(ci, (vaddr_t)&pcpuarea->ent[cid],
	    offsetof(struct pcpu_entry, rsp0), true);
	svs_range_add(ci, (vaddr_t)&__text_user_start,
	    (vaddr_t)&__text_user_end - (vaddr_t)&__text_user_start, false);

	svs_rsp0_init(ci);
	svs_utls_init(ci);
	svs_pcid_init(ci);

#ifdef USER_LDT
	mutex_enter(&cpu_lock);
	ci->ci_svs_ldt_sel = ldt_alloc(&pcpuarea->ent[cid].ldt,
	    MAX_USERLDT_SIZE);
	mutex_exit(&cpu_lock);
#endif
}

void
svs_pmap_sync(struct pmap *pmap, int index)
{
	CPU_INFO_ITERATOR cii;
	struct cpu_info *ci;
	cpuid_t cid;

	KASSERT(pmap != NULL);
	KASSERT(pmap != pmap_kernel());
	KASSERT(pmap_is_user(pmap));
	KASSERT(mutex_owned(&pmap->pm_lock));
	KASSERT(kpreempt_disabled());
	KASSERT(index < PDIR_SLOT_USERLIM);

	ci = curcpu();
	cid = cpu_index(ci);

	mutex_enter(&ci->ci_svs_mtx);
	KASSERT(kcpuset_isset(pmap->pm_kernel_cpus, cid));
	ci->ci_svs_updir[index] = pmap->pm_pdir[index];
	mutex_exit(&ci->ci_svs_mtx);

	if (!kcpuset_isotherset(pmap->pm_kernel_cpus, cid)) {
		return;
	}

	for (CPU_INFO_FOREACH(cii, ci)) {
		cid = cpu_index(ci);

		if (!kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
			continue;
		}

		/* take the lock and check again */
		mutex_enter(&ci->ci_svs_mtx);
		if (kcpuset_isset(pmap->pm_kernel_cpus, cid)) {
			ci->ci_svs_updir[index] = pmap->pm_pdir[index];
		}
		mutex_exit(&ci->ci_svs_mtx);
	}
}

void
svs_ldt_sync(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	void *ldt;
	int sel;

	KASSERT(kpreempt_disabled());

	/*
	 * Another LWP could concurrently modify the LDT via x86_set_ldt1().
	 * The LWP will wait for pmap_ldt_sync() to finish before destroying
	 * the outdated LDT.
	 *
	 * We have preemption disabled here, so it is guaranteed that even
	 * if the LDT we are syncing is the outdated one, it is still valid.
	 *
	 * pmap_ldt_sync() will execute later once we have preemption enabled,
	 * and will install the new LDT.
	 */
	sel = atomic_load_relaxed(&pmap->pm_ldt_sel);
	if (__predict_false(sel != GSYSSEL(GLDT_SEL, SEL_KPL))) {
		ldt = atomic_load_relaxed(&pmap->pm_ldt);
		memcpy(&pcpuarea->ent[cpu_index(ci)].ldt, ldt,
		    MAX_USERLDT_SIZE);
		sel = ci->ci_svs_ldt_sel;
	}

	lldt(sel);
}

void
svs_lwp_switch(struct lwp *oldlwp, struct lwp *newlwp)
{
	struct cpu_info *ci = curcpu();
	struct svs_utls *utls;
	struct pcb *pcb;
	pt_entry_t *pte;
	uintptr_t rsp0;
	vaddr_t va;

	if (newlwp->l_flag & LW_SYSTEM) {
		return;
	}

#ifdef DIAGNOSTIC
	if (!(oldlwp->l_flag & LW_SYSTEM)) {
		pcb = lwp_getpcb(oldlwp);
		rsp0 = pcb->pcb_rsp0;
		va = rounddown(rsp0, PAGE_SIZE);
		KASSERT(ci->ci_svs_krsp0 == rsp0 - sizeof(struct trapframe));
		pte = ci->ci_svs_rsp0_pte;
		KASSERT(*pte == L1_BASE[pl1_i(va)]);
	}
#endif

	pcb = lwp_getpcb(newlwp);
	rsp0 = pcb->pcb_rsp0;
	va = rounddown(rsp0, PAGE_SIZE);

	/* Update the kernel rsp0 in cpu_info */
	ci->ci_svs_krsp0 = rsp0 - sizeof(struct trapframe);
	KASSERT((ci->ci_svs_krsp0 % PAGE_SIZE) ==
	    (ci->ci_svs_ursp0 % PAGE_SIZE));

	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->scratch = 0;

	/*
	 * Enter the user rsp0. If we're using PCID we must flush the user VA;
	 * if we aren't, it will be flushed during the next CR3 reload.
	 */
	pte = ci->ci_svs_rsp0_pte;
	*pte = L1_BASE[pl1_i(va)];
	if (svs_pcid) {
		invpcid(INVPCID_ADDRESS, PMAP_PCID_USER, ci->ci_svs_rsp0);
	}
}

/*
 * We may come here with the pmap unlocked.  If a remote CPU is updating the
 * pmap's user slots at the same time, it's not a problem: the remote CPU
 * will call svs_pmap_sync afterwards, and our updirpa will be synchronized
 * properly.
 */
void
svs_pdir_switch(struct pmap *pmap)
{
	struct cpu_info *ci = curcpu();
	struct svs_utls *utls;

	KASSERT(kpreempt_disabled());
	KASSERT(pmap != pmap_kernel());
	KASSERT(pmap_is_user(pmap));

	/* Update the info in the UTLS page */
	utls = (struct svs_utls *)ci->ci_svs_utls;
	utls->kpdirpa = pmap_pdirpa(pmap, 0) | svs_pcid_kcr3;

	/* Copy user slots. */
	mutex_enter(&ci->ci_svs_mtx);
	svs_quad_copy(ci->ci_svs_updir, pmap->pm_pdir, PDIR_SLOT_USERLIM);
	mutex_exit(&ci->ci_svs_mtx);

	if (svs_pcid) {
		invpcid(INVPCID_CONTEXT, PMAP_PCID_USER, 0);
	}
}

static void
svs_enable(void)
{
	svs_enabled = true;

	x86_hotpatch(HP_NAME_SVS_ENTER, 0);
	x86_hotpatch(HP_NAME_SVS_ENTER_ALT, 0);
	x86_hotpatch(HP_NAME_SVS_ENTER_NMI, 0);

	x86_hotpatch(HP_NAME_SVS_LEAVE, 0);
	x86_hotpatch(HP_NAME_SVS_LEAVE_ALT, 0);
	x86_hotpatch(HP_NAME_SVS_LEAVE_NMI, 0);
}

void
svs_init(void)
{
	uint64_t msr;

	if (cpu_vendor != CPUVENDOR_INTEL) {
		return;
	}
	if (boothowto & RB_MD3) {
		return;
	}
	if (cpu_info_primary.ci_feat_val[7] & CPUID_SEF_ARCH_CAP) {
		msr = rdmsr(MSR_IA32_ARCH_CAPABILITIES);
		if (msr & IA32_ARCH_RDCL_NO) {
			/*
			 * The processor indicates it is not vulnerable to the
			 * Rogue Data Cache Load (Meltdown) flaw.
			 */
			return;
		}
	}

	if ((cpu_info_primary.ci_feat_val[1] & CPUID2_PCID) &&
	    (cpu_info_primary.ci_feat_val[5] & CPUID_SEF_INVPCID)) {
		svs_pcid = true;
		lcr4(rcr4() | CR4_PCIDE);
	}

	svs_enable();
}
763