/*	$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $	*/

/*-
 * Copyright (c) 2008-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * x86 pmap(9) module: TLB shootdowns.
 *
 * TLB shootdowns are hard interrupts that operate outside the SPL framework.
 * They do not need to be blocked, provided that the pmap module gets the
 * order of events correct. The calls are made by poking the LAPIC directly.
 * The interrupt handler is short and does one of the following: invalidate
 * a set of pages, all user TLB entries or the entire TLB.
 */
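
/*
 * In outline: callers queue virtual addresses with pmap_tlb_shootdown(),
 * which accumulates them in the local CPU's packet. pmap_tlb_shootnow()
 * then copies the packet onto the initiator's stack, publishes it via the
 * global pointer, notifies the remote CPUs via pmap_tlb_processpacket()
 * (LAPIC IPIs natively, multicast hypercalls on XENPV), performs any local
 * invalidation, and waits for the packet to be marked done (by the last
 * remote CPU out of pmap_tlb_intr(), or synchronously on XENPV).
 */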

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <uvm/uvm.h>

#include <machine/cpuvar.h>
#include <machine/pmap_private.h>

#ifdef XENPV
#include <xen/xenpmap.h>
#endif /* XENPV */
#include <x86/i82489reg.h>
#include <x86/i82489var.h>

/*
 * TLB shootdown packet. Each CPU has a copy of this packet, where we build
 * sets of TLB shootdowns. If shootdowns need to occur on remote CPUs, the
 * packet is copied into a shared mailbox kept on the initiator's kernel
 * stack. Once the copy is made, no further updates to the mailbox are made
 * until the request is completed. This keeps the cache line in the shared
 * state, and bus traffic to a minimum.
 *
 * In order to make maximal use of the available space, control fields are
 * overlaid into the lower 12 bits of the first 4 virtual addresses. This
 * is very ugly, but it counts.
 *
 * On i386 the packet is 64 bytes in size. On amd64 it's 128 bytes. This
 * is sized in concert with UBC_WINSIZE, otherwise excessive shootdown
 * interrupts could be issued.
 */

#define	TP_MAXVA	16		/* for individual mappings */
#define	TP_ALLVA	PAGE_MASK	/* special: shoot all mappings */

typedef struct {
	uintptr_t	tp_store[TP_MAXVA];
} pmap_tlb_packet_t;

#define	TP_COUNT	0
#define	TP_USERPMAP	1
#define	TP_GLOBAL	2
#define	TP_DONE		3

#define	TP_GET_COUNT(tp)	((tp)->tp_store[TP_COUNT] & PAGE_MASK)
#define	TP_GET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] & 1)
#define	TP_GET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] & 1)
#define	TP_GET_DONE(tp)		(atomic_load_relaxed(&(tp)->tp_store[TP_DONE]) & 1)
#define	TP_GET_VA(tp, i)	((tp)->tp_store[(i)] & ~PAGE_MASK)

#define	TP_INC_COUNT(tp)	((tp)->tp_store[TP_COUNT]++)
#define	TP_SET_ALLVA(tp)	((tp)->tp_store[TP_COUNT] |= TP_ALLVA)
#define	TP_SET_VA(tp, c, va)	((tp)->tp_store[(c)] |= ((va) & ~PAGE_MASK))

#define	TP_SET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] |= 1)
#define	TP_SET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] |= 1)
#define	TP_SET_DONE(tp) \
do { \
	uintptr_t v = atomic_load_relaxed(&(tp)->tp_store[TP_DONE]); \
	atomic_store_relaxed(&(tp)->tp_store[TP_DONE], v | 1); \
} while (/* CONSTCOND */ 0);

#define	TP_CLEAR(tp)	memset(__UNVOLATILE(tp), 0, sizeof(*(tp)));
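
/*
 * For illustration (the addresses are made up): after two page-aligned
 * user VAs, va0 and va1, have been queued against a user pmap, the packet
 * holds:
 *
 *	tp_store[TP_COUNT]    == va0 | 2	VA #0, count of 2
 *	tp_store[TP_USERPMAP] == va1 | 1	VA #1, user pmap flag set
 *	tp_store[TP_GLOBAL]   == 0		VA #2 unused, no global PTEs
 *	tp_store[TP_DONE]     == 0		VA #3 unused, not yet done
 *
 * A full flush is requested by setting the count field to TP_ALLVA.
 */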

/*
 * TLB shootdown state.
 */
static volatile pmap_tlb_packet_t *volatile pmap_tlb_packet __cacheline_aligned;
static volatile u_int pmap_tlb_pendcount __cacheline_aligned;
static struct evcnt pmap_tlb_evcnt __cacheline_aligned;

/*
 * TLB shootdown statistics.
 */
#ifdef TLBSTATS
static struct evcnt tlbstat_local[TLBSHOOT__MAX];
static struct evcnt tlbstat_remote[TLBSHOOT__MAX];
static struct evcnt tlbstat_kernel[TLBSHOOT__MAX];
static struct evcnt tlbstat_single_req;
static struct evcnt tlbstat_single_issue;
static const char * tlbstat_name[ ] = {
	"REMOVE_ALL",
	"KENTER",
	"KREMOVE",
	"FREE_PTP",
	"REMOVE_PTE",
	"SYNC_PV",
	"WRITE_PROTECT",
	"ENTER",
	"NVMM",
	"BUS_DMA",
	"BUS_SPACE",
};
#endif

void
pmap_tlb_init(void)
{

	evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
	    NULL, "TLB", "shootdown");

#ifdef TLBSTATS
	int i;

	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot local", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot remote", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot kernel", tlbstat_name[i]);
	}
	evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "requests");
	evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "issues");
#endif
}

void
pmap_tlb_cpu_init(struct cpu_info *ci)
{
	pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	memset(tp, 0, sizeof(pmap_tlb_packet_t));
	kcpuset_create(&ci->ci_tlb_cpuset, true);
}

static inline void
pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why)
{
#ifdef TLBSTATS
	const cpuid_t cid = cpu_index(curcpu());
	bool local = false, remote = false;

	if (va != (vaddr_t)-1LL) {
		atomic_inc_64(&tlbstat_single_req.ev_count);
	}
	if (pm == pmap_kernel()) {
		atomic_inc_64(&tlbstat_kernel[why].ev_count);
		return;
	}

	if (va >= VM_MAXUSER_ADDRESS) {
		remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid);
		local = kcpuset_isset(pm->pm_kernel_cpus, cid);
	}
	remote |= kcpuset_isotherset(pm->pm_cpus, cid);
	local |= kcpuset_isset(pm->pm_cpus, cid);

	if (local) {
		atomic_inc_64(&tlbstat_local[why].ev_count);
	}
	if (remote) {
		atomic_inc_64(&tlbstat_remote[why].ev_count);
	}
#endif
}

static inline void
pmap_tlb_invalidate(volatile pmap_tlb_packet_t *tp)
{
	int i = TP_GET_COUNT(tp);

	/* Find out what we need to invalidate. */
	if (i == TP_ALLVA) {
		if (TP_GET_GLOBAL(tp) != 0) {
			/* Invalidating all TLB entries. */
			tlbflushg();
		} else {
			/* Invalidating non-global TLB entries only. */
			tlbflush();
		}
	} else {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			pmap_update_pg(TP_GET_VA(tp, i));
		} while (i > 0);
	}
}

/*
 * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'.
 * Passing a 'va' of (vaddr_t)-1 requests a full flush rather than a
 * single-page invalidation.
 */
void
pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why)
{
	pmap_tlb_packet_t *tp;
	struct cpu_info *ci;
	uint8_t count;
	int s;

#ifndef XENPV
	KASSERT((pte & PTE_G) == 0 || pm == pmap_kernel());
#endif

	if (__predict_false(pm->pm_tlb_flush != NULL)) {
		(*pm->pm_tlb_flush)(pm);
		return;
	}

	if ((pte & PTE_PS) != 0) {
		va &= PTE_LGFRAME;
	}

	/*
	 * Add the shootdown operation to our pending set.
	 */
	s = splvm();
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	/* Whole address flush will be needed if PTE_G is set. */
	if ((pte & PTE_G) != 0) {
		TP_SET_GLOBAL(tp);
	}
	count = TP_GET_COUNT(tp);

	if (count < TP_MAXVA && va != (vaddr_t)-1LL) {
		/* Flush a single page. */
		TP_SET_VA(tp, count, va);
		TP_INC_COUNT(tp);
	} else {
		/* Flush everything - may already be set. */
		TP_SET_ALLVA(tp);
	}

	if (pm != pmap_kernel()) {
		kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_cpus);
		if (va >= VM_MAXUSER_ADDRESS) {
			kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_kernel_cpus);
		}
		TP_SET_USERPMAP(tp);
	} else {
		kcpuset_copy(ci->ci_tlb_cpuset, kcpuset_running);
	}
	pmap_tlbstat_count(pm, va, why);
	splx(s);
}

#ifdef XENPV

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int i = TP_GET_COUNT(tp);

	if (i != TP_ALLVA) {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			xen_mcast_invlpg(TP_GET_VA(tp, i), target);
		} while (i > 0);
	} else {
		xen_mcast_tlbflush(target);
	}

	/* Remote CPUs have been synchronously flushed. */
	pmap_tlb_pendcount = 0;
	pmap_tlb_packet = NULL;
	TP_SET_DONE(tp);
#endif /* MULTIPROCESSOR */
}

#else

static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int err = 0;

	if (!kcpuset_match(target, kcpuset_attached)) {
		const struct cpu_info * const self = curcpu();
		CPU_INFO_ITERATOR cii;
		struct cpu_info *lci;

		for (CPU_INFO_FOREACH(cii, lci)) {
			const cpuid_t lcid = cpu_index(lci);

			if (__predict_false(lci == self) ||
			    !kcpuset_isset(target, lcid)) {
				continue;
			}
			err |= x86_ipi(LAPIC_TLB_VECTOR,
			    lci->ci_cpuid, LAPIC_DLMODE_FIXED);
		}
	} else {
		err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
		    LAPIC_DLMODE_FIXED);
	}
	KASSERT(err == 0);
#endif /* MULTIPROCESSOR */
}

#endif /* XENPV */

/*
 * pmap_tlb_shootnow: process pending TLB shootdowns queued on current CPU.
 *
 * => Must be called with preemption disabled.
 */
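/*
 * An illustrative caller sequence (a sketch, not taken from this file;
 * TLBSHOOT_REMOVE_PTE is assumed to be the tlbwhy_t code behind the
 * "REMOVE_PTE" statistic above):
 *
 *	kpreempt_disable();
 *	pmap_tlb_shootdown(pm, va, opte, TLBSHOOT_REMOVE_PTE);
 *	...
 *	pmap_tlb_shootnow();
 *	kpreempt_enable();
 */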
void
pmap_tlb_shootnow(void)
{
	volatile pmap_tlb_packet_t *tp, *ts;
	volatile uint8_t stackbuf[sizeof(*tp) + COHERENCY_UNIT];
	struct cpu_info *ci;
	kcpuset_t *target;
	u_int local, rcpucount;
	cpuid_t cid;
	int s;

	KASSERT(kpreempt_disabled());

	/* Pre-check first. */
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;
	if (TP_GET_COUNT(tp) == 0) {
		return;
	}

	/* An interrupt may have flushed our updates, so check again. */
	s = splvm();
	if (TP_GET_COUNT(tp) == 0) {
		splx(s);
		return;
	}

	cid = cpu_index(ci);
	target = ci->ci_tlb_cpuset;
	local = kcpuset_isset(target, cid) ? 1 : 0;
	rcpucount = kcpuset_countset(target) - local;

	/*
	 * Fast path for local shootdowns only. Do the shootdowns, and
	 * clear out the buffer for the next user.
	 */
	if (rcpucount == 0) {
		pmap_tlb_invalidate(tp);
		kcpuset_zero(ci->ci_tlb_cpuset);
		TP_CLEAR(tp);
		splx(s);
		return;
	}

	/*
	 * Copy the packet into the stack buffer, and gain ownership of the
	 * global pointer. We must keep interrupts blocked once we own the
	 * pointer and until the IPIs are triggered, or we could deadlock
	 * against an interrupt on the current CPU trying the same.
	 */
	KASSERT(rcpucount < ncpu);
	ts = (void *)roundup2((uintptr_t)stackbuf, COHERENCY_UNIT);
	*ts = *tp;
	KASSERT(TP_GET_DONE(ts) == 0);
	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
	    __UNVOLATILE(ts)) != NULL) {
		KASSERT(atomic_load_relaxed(&pmap_tlb_packet) != ts);
		/*
		 * Don't bother with exponential backoff, as the pointer
		 * is in a dedicated cache line and only updated twice per
		 * IPI (in contrast to the pending counter). The cache
		 * line will spend most of its time in the SHARED state.
		 */
		splx(s);
		do {
			x86_pause();
		} while (atomic_load_relaxed(&pmap_tlb_packet) != NULL);
		s = splvm();

		/*
		 * An interrupt might have done the shootdowns for
		 * us while we spun.
		 */
		if (TP_GET_COUNT(tp) == 0) {
			splx(s);
			return;
		}
	}

	/*
	 * Ownership of the global pointer provides serialization of the
	 * update to the count and the event counter. With those values
	 * updated, start shootdowns on remote CPUs.
	 */
	pmap_tlb_pendcount = rcpucount;
	pmap_tlb_evcnt.ev_count++;
	pmap_tlb_processpacket(ts, target);

	/*
	 * Clear out the local CPU's buffer for the next user. Once done,
	 * we can drop the IPL.
	 */
#ifdef TLBSTATS
	if (TP_GET_COUNT(tp) != TP_ALLVA) {
		atomic_add_64(&tlbstat_single_issue.ev_count,
		    TP_GET_COUNT(tp));
	}
#endif
	kcpuset_zero(ci->ci_tlb_cpuset);
	TP_CLEAR(tp);
	splx(s);

	/*
	 * Shootdowns on remote CPUs are now in flight. In the meantime,
	 * perform local shootdown if needed, using our copy of the packet.
	 */
	if (local) {
		pmap_tlb_invalidate(ts);
	}

	/*
	 * Wait for the updates to be processed by remote CPUs. Poll the
	 * flag in the packet in order to limit bus traffic (only the last
	 * CPU out will update it and only we are reading it). No memory
	 * barrier required due to prior stores - yay x86.
	 */
	while (TP_GET_DONE(ts) == 0) {
		x86_pause();
	}
}

/*
 * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries.
 *
 * Called from IPI only. We are outside the SPL framework, with interrupts
 * disabled on the CPU: be careful.
 *
 * TLB flush and the interrupt that brought us here are serializing
 * operations (they defeat speculative execution). Any speculative load
 * producing a TLB fill between receipt of the interrupt and the TLB flush
 * will load "current" PTEs. None of the mappings relied on by this ISR for
 * its execution will be changing. So it's safe to acknowledge the request
 * and allow the initiator to proceed before performing the flush.
 */
void
pmap_tlb_intr(void)
{
	pmap_tlb_packet_t copy;
	volatile pmap_tlb_packet_t *source;
	struct cpu_info *ci;

	/* Make a private copy of the packet. */
	source = pmap_tlb_packet;
	copy = *source;

	/*
	 * If we are the last CPU out, clear the active pointer and mark the
	 * packet as done. Both can be done without using an atomic, and
	 * the one atomic we do use serves as our memory barrier.
	 *
	 * It's important to clear the active pointer before setting
	 * TP_DONE, to ensure a remote CPU does not exit & re-enter
	 * pmap_tlb_shootnow() only to find its current pointer still
	 * seemingly active.
	 */
	if (atomic_dec_uint_nv(&pmap_tlb_pendcount) == 0) {
		atomic_store_relaxed(&pmap_tlb_packet, NULL);
		__insn_barrier();
		TP_SET_DONE(source);
	}
	pmap_tlb_invalidate(&copy);

	/*
	 * Check the current TLB state. If we don't want further flushes
	 * for this pmap, then take the CPU out of the pmap's set. The
	 * order of updates to the set and TLB state must closely align with
	 * the pmap code, as we can interrupt code running in the pmap
	 * module.
	 */
	ci = curcpu();
	if (ci->ci_tlbstate == TLBSTATE_LAZY && TP_GET_USERPMAP(&copy) != 0) {
		kcpuset_atomic_clear(ci->ci_pmap->pm_cpus, cpu_index(ci));
		ci->ci_tlbstate = TLBSTATE_STALE;
	}
}