/*	$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $	*/

/*-
 * Copyright (c) 2008-2020 The NetBSD Foundation, Inc.
 * All rights reserved.
 *
 * This code is derived from software contributed to The NetBSD Foundation
 * by Andrew Doran and Mindaugas Rasiukevicius.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * x86 pmap(9) module: TLB shootdowns.
 *
 * TLB shootdowns are hard interrupts that operate outside the SPL framework.
 * They do not need to be blocked, provided that the pmap module gets the
 * order of events correct.  The calls are made by poking the LAPIC directly.
 * The interrupt handler is short and does one of the following: invalidate
 * a set of pages, all user TLB entries or the entire TLB.
 */

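/*
 * Typical use, as a sketch of the calling pattern rather than a contract:
 * the pmap queues pages with pmap_tlb_shootdown() and later, with
 * preemption disabled, drains the pending set with pmap_tlb_shootnow();
 * remote CPUs handle the resulting IPI in pmap_tlb_intr().
 */
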
#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: x86_tlb.c,v 1.21 2023/12/08 21:46:02 andvar Exp $");

#include <sys/param.h>
#include <sys/kernel.h>

#include <sys/systm.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <uvm/uvm.h>

#include <machine/cpuvar.h>
#include <machine/pmap_private.h>

#ifdef XENPV
#include <xen/xenpmap.h>
#endif /* XENPV */
#include <x86/i82489reg.h>
#include <x86/i82489var.h>

/*
 * TLB shootdown packet.  Each CPU has a copy of this packet, where we build
 * sets of TLB shootdowns.  If shootdowns need to occur on remote CPUs, the
 * packet is copied into a shared mailbox kept on the initiator's kernel
 * stack.  Once the copy is made, no further updates to the mailbox are made
 * until the request is completed.  This keeps the cache line in the shared
 * state, and bus traffic to a minimum.
 *
 * In order to make maximal use of the available space, control fields are
 * overlaid into the lower 12 bits of the first 4 virtual addresses.  This
 * is very ugly, but it counts.
 *
 * On i386 the packet is 64 bytes in size.  On amd64 it's 128 bytes.  This
 * is sized in concert with UBC_WINSIZE, otherwise excessive shootdown
 * interrupts could be issued.
 */

#define	TP_MAXVA	16		/* for individual mappings */
#define	TP_ALLVA	PAGE_MASK	/* special: shoot all mappings */

typedef struct {
	uintptr_t		tp_store[TP_MAXVA];
} pmap_tlb_packet_t;

#define	TP_COUNT	0
#define	TP_USERPMAP	1
#define	TP_GLOBAL	2
#define	TP_DONE		3

#define	TP_GET_COUNT(tp)	((tp)->tp_store[TP_COUNT] & PAGE_MASK)
#define	TP_GET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] & 1)
#define	TP_GET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] & 1)
#define	TP_GET_DONE(tp)		(atomic_load_relaxed(&(tp)->tp_store[TP_DONE]) & 1)
#define	TP_GET_VA(tp, i)	((tp)->tp_store[(i)] & ~PAGE_MASK)

#define	TP_INC_COUNT(tp)	((tp)->tp_store[TP_COUNT]++)
#define	TP_SET_ALLVA(tp)	((tp)->tp_store[TP_COUNT] |= TP_ALLVA)
#define	TP_SET_VA(tp, c, va)	((tp)->tp_store[(c)] |= ((va) & ~PAGE_MASK))

#define	TP_SET_USERPMAP(tp)	((tp)->tp_store[TP_USERPMAP] |= 1)
#define	TP_SET_GLOBAL(tp)	((tp)->tp_store[TP_GLOBAL] |= 1)
#define	TP_SET_DONE(tp)							     \
	do {								     \
		uintptr_t v = atomic_load_relaxed(&(tp)->tp_store[TP_DONE]); \
		atomic_store_relaxed(&(tp)->tp_store[TP_DONE], v | 1);	     \
	} while (/* CONSTCOND */ 0);

#define	TP_CLEAR(tp)		memset(__UNVOLATILE(tp), 0, sizeof(*(tp)));
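
/*
 * For illustration, with 4KB pages (PAGE_MASK == 0xfff) each tp_store[]
 * slot holds a page-aligned VA in its upper bits, while the low 12 bits
 * of the first four slots double as control fields: slot 0 carries the
 * VA count (or TP_ALLVA), and slots 1-3 carry one-bit flags.
 */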

/*
 * TLB shootdown state.
 */
static volatile pmap_tlb_packet_t *volatile pmap_tlb_packet __cacheline_aligned;
static volatile u_int		pmap_tlb_pendcount	__cacheline_aligned;
static struct evcnt		pmap_tlb_evcnt		__cacheline_aligned;

/*
 * TLB shootdown statistics.
 */
#ifdef TLBSTATS
static struct evcnt		tlbstat_local[TLBSHOOT__MAX];
static struct evcnt		tlbstat_remote[TLBSHOOT__MAX];
static struct evcnt		tlbstat_kernel[TLBSHOOT__MAX];
static struct evcnt		tlbstat_single_req;
static struct evcnt		tlbstat_single_issue;
static const char *		tlbstat_name[ ] = {
	"REMOVE_ALL",
	"KENTER",
	"KREMOVE",
	"FREE_PTP",
	"REMOVE_PTE",
	"SYNC_PV",
	"WRITE_PROTECT",
	"ENTER",
	"NVMM",
	"BUS_DMA",
	"BUS_SPACE",
};
#endif

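/*
 * pmap_tlb_init: attach the TLB shootdown event counter and, when built
 * with TLBSTATS, the per-reason statistics counters.
 */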
void
pmap_tlb_init(void)
{

	evcnt_attach_dynamic(&pmap_tlb_evcnt, EVCNT_TYPE_INTR,
	    NULL, "TLB", "shootdown");

#ifdef TLBSTATS
	int i;

	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_local[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot local", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_remote[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot remote", tlbstat_name[i]);
	}
	for (i = 0; i < TLBSHOOT__MAX; i++) {
		evcnt_attach_dynamic(&tlbstat_kernel[i], EVCNT_TYPE_MISC,
		    NULL, "tlbshoot kernel", tlbstat_name[i]);
	}
	evcnt_attach_dynamic(&tlbstat_single_req, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "requests");
	evcnt_attach_dynamic(&tlbstat_single_issue, EVCNT_TYPE_MISC,
	    NULL, "tlbshoot single page", "issues");
#endif
}

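/*
 * pmap_tlb_cpu_init: prepare a CPU for TLB shootdowns by zeroing its
 * private shootdown packet and creating its set of target CPUs.
 */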
void
pmap_tlb_cpu_init(struct cpu_info *ci)
{
	pmap_tlb_packet_t *tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	memset(tp, 0, sizeof(pmap_tlb_packet_t));
	kcpuset_create(&ci->ci_tlb_cpuset, true);
}

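/*
 * pmap_tlbstat_count: with TLBSTATS, record whether the shootdown was
 * needed locally and/or remotely for the given pmap and reason.
 */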
static inline void
pmap_tlbstat_count(struct pmap *pm, vaddr_t va, tlbwhy_t why)
{
#ifdef TLBSTATS
	const cpuid_t cid = cpu_index(curcpu());
	bool local = false, remote = false;

	if (va != (vaddr_t)-1LL) {
		atomic_inc_64(&tlbstat_single_req.ev_count);
	}
	if (pm == pmap_kernel()) {
		atomic_inc_64(&tlbstat_kernel[why].ev_count);
		return;
	}

	if (va >= VM_MAXUSER_ADDRESS) {
		remote = kcpuset_isotherset(pm->pm_kernel_cpus, cid);
		local = kcpuset_isset(pm->pm_kernel_cpus, cid);
	}
	remote |= kcpuset_isotherset(pm->pm_cpus, cid);
	local |= kcpuset_isset(pm->pm_cpus, cid);

	if (local) {
		atomic_inc_64(&tlbstat_local[why].ev_count);
	}
	if (remote) {
		atomic_inc_64(&tlbstat_remote[why].ev_count);
	}
#endif
}

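/*
 * pmap_tlb_invalidate: process one shootdown packet on the current CPU,
 * flushing either the listed pages or the whole (possibly global) TLB.
 */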
static inline void
pmap_tlb_invalidate(volatile pmap_tlb_packet_t *tp)
{
	int i = TP_GET_COUNT(tp);

	/* Find out what we need to invalidate. */
	if (i == TP_ALLVA) {
		if (TP_GET_GLOBAL(tp) != 0) {
			/* Invalidating all TLB entries. */
			tlbflushg();
		} else {
			/* Invalidating non-global TLB entries only. */
			tlbflush();
		}
	} else {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			pmap_update_pg(TP_GET_VA(tp, i));
		} while (i > 0);
	}
}

/*
 * pmap_tlb_shootdown: invalidate a page on all CPUs using pmap 'pm'.
 */
void
pmap_tlb_shootdown(struct pmap *pm, vaddr_t va, pt_entry_t pte, tlbwhy_t why)
{
	pmap_tlb_packet_t *tp;
	struct cpu_info *ci;
	uint8_t count;
	int s;

#ifndef XENPV
	KASSERT((pte & PTE_G) == 0 || pm == pmap_kernel());
#endif

	if (__predict_false(pm->pm_tlb_flush != NULL)) {
		(*pm->pm_tlb_flush)(pm);
		return;
	}

	if ((pte & PTE_PS) != 0) {
		va &= PTE_LGFRAME;
	}

	/*
	 * Add the shootdown operation to our pending set.
	 */
	s = splvm();
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;

	/* Whole address flush will be needed if PTE_G is set. */
	if ((pte & PTE_G) != 0) {
		TP_SET_GLOBAL(tp);
	}
	count = TP_GET_COUNT(tp);

	if (count < TP_MAXVA && va != (vaddr_t)-1LL) {
		/* Flush a single page. */
		TP_SET_VA(tp, count, va);
		TP_INC_COUNT(tp);
	} else {
		/* Flush everything - may already be set. */
		TP_SET_ALLVA(tp);
	}

	if (pm != pmap_kernel()) {
		kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_cpus);
		if (va >= VM_MAXUSER_ADDRESS) {
			kcpuset_merge(ci->ci_tlb_cpuset, pm->pm_kernel_cpus);
		}
		TP_SET_USERPMAP(tp);
	} else {
		kcpuset_copy(ci->ci_tlb_cpuset, kcpuset_running);
	}
	pmap_tlbstat_count(pm, va, why);
	splx(s);
}

#ifdef XENPV

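/*
 * pmap_tlb_processpacket: hand the shootdowns in the packet to the remote
 * CPUs in 'target'.  The Xen multicast operations flush the remote CPUs
 * synchronously, so the request is marked complete immediately.
 */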
static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int i = TP_GET_COUNT(tp);

	if (i != TP_ALLVA) {
		/* Invalidating a single page or a range of pages. */
		KASSERT(i != 0);
		do {
			--i;
			xen_mcast_invlpg(TP_GET_VA(tp, i), target);
		} while (i > 0);
	} else {
		xen_mcast_tlbflush(target);
	}

	/* Remote CPUs have been synchronously flushed. */
	pmap_tlb_pendcount = 0;
	pmap_tlb_packet = NULL;
	TP_SET_DONE(tp);
#endif /* MULTIPROCESSOR */
}

#else

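/*
 * pmap_tlb_processpacket: hand the shootdowns in the packet to the remote
 * CPUs in 'target' by sending each one a TLB IPI (or a single broadcast
 * IPI to all other CPUs when every attached CPU is targeted).
 */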
static inline void
pmap_tlb_processpacket(volatile pmap_tlb_packet_t *tp, kcpuset_t *target)
{
#ifdef MULTIPROCESSOR
	int err = 0;

	if (!kcpuset_match(target, kcpuset_attached)) {
		const struct cpu_info * const self = curcpu();
		CPU_INFO_ITERATOR cii;
		struct cpu_info *lci;

		for (CPU_INFO_FOREACH(cii, lci)) {
			const cpuid_t lcid = cpu_index(lci);

			if (__predict_false(lci == self) ||
			    !kcpuset_isset(target, lcid)) {
				continue;
			}
			err |= x86_ipi(LAPIC_TLB_VECTOR,
			    lci->ci_cpuid, LAPIC_DLMODE_FIXED);
		}
	} else {
		err = x86_ipi(LAPIC_TLB_VECTOR, LAPIC_DEST_ALLEXCL,
		    LAPIC_DLMODE_FIXED);
	}
	KASSERT(err == 0);
#endif /* MULTIPROCESSOR */
}

#endif /* XENPV */

/*
 * pmap_tlb_shootnow: process pending TLB shootdowns queued on current CPU.
 *
 * => Must be called with preemption disabled.
 */
void
pmap_tlb_shootnow(void)
{
	volatile pmap_tlb_packet_t *tp, *ts;
	volatile uint8_t stackbuf[sizeof(*tp) + COHERENCY_UNIT];
	struct cpu_info *ci;
	kcpuset_t *target;
	u_int local, rcpucount;
	cpuid_t cid;
	int s;

	KASSERT(kpreempt_disabled());

	/* Pre-check first. */
	ci = curcpu();
	tp = (pmap_tlb_packet_t *)ci->ci_pmap_data;
	if (TP_GET_COUNT(tp) == 0) {
		return;
	}

	/* An interrupt may have flushed our updates, so check again. */
	s = splvm();
	if (TP_GET_COUNT(tp) == 0) {
		splx(s);
		return;
	}

	cid = cpu_index(ci);
	target = ci->ci_tlb_cpuset;
	local = kcpuset_isset(target, cid) ? 1 : 0;
	rcpucount = kcpuset_countset(target) - local;

	/*
	 * Fast path for local shootdowns only.  Do the shootdowns, and
	 * clear out the buffer for the next user.
	 */
	if (rcpucount == 0) {
		pmap_tlb_invalidate(tp);
		kcpuset_zero(ci->ci_tlb_cpuset);
		TP_CLEAR(tp);
		splx(s);
		return;
	}

	/*
	 * Copy the packet into the stack buffer, and gain ownership of the
	 * global pointer.  We must keep interrupts blocked once we own the
	 * pointer and until the IPIs are triggered, or we could deadlock
	 * against an interrupt on the current CPU trying the same.
	 */
	KASSERT(rcpucount < ncpu);
	ts = (void *)roundup2((uintptr_t)stackbuf, COHERENCY_UNIT);
	*ts = *tp;
	KASSERT(TP_GET_DONE(ts) == 0);
	while (atomic_cas_ptr(&pmap_tlb_packet, NULL,
	    __UNVOLATILE(ts)) != NULL) {
		KASSERT(atomic_load_relaxed(&pmap_tlb_packet) != ts);
		/*
		 * Don't bother with exponential backoff, as the pointer
		 * is in a dedicated cache line and only updated twice per
		 * IPI (in contrast to the pending counter).  The cache
		 * line will spend most of its time in the SHARED state.
		 */
		splx(s);
		do {
			x86_pause();
		} while (atomic_load_relaxed(&pmap_tlb_packet) != NULL);
		s = splvm();

		/*
		 * An interrupt might have done the shootdowns for
		 * us while we spun.
		 */
		if (TP_GET_COUNT(tp) == 0) {
			splx(s);
			return;
		}
	}

	/*
	 * Ownership of the global pointer provides serialization of the
	 * update to the count and the event counter.  With those values
	 * updated, start shootdowns on remote CPUs.
	 */
	pmap_tlb_pendcount = rcpucount;
	pmap_tlb_evcnt.ev_count++;
	pmap_tlb_processpacket(ts, target);

	/*
	 * Clear out the local CPU's buffer for the next user.  Once done,
	 * we can drop the IPL.
	 */
#ifdef TLBSTATS
	if (TP_GET_COUNT(tp) != TP_ALLVA) {
		atomic_add_64(&tlbstat_single_issue.ev_count,
		    TP_GET_COUNT(tp));
	}
#endif
	kcpuset_zero(ci->ci_tlb_cpuset);
	TP_CLEAR(tp);
	splx(s);

	/*
	 * Shootdowns on remote CPUs are now in flight.  In the meantime,
	 * perform local shootdown if needed, using our copy of the packet.
	 */
	if (local) {
		pmap_tlb_invalidate(ts);
	}

	/*
	 * Wait for the updates to be processed by remote CPUs.  Poll the
	 * flag in the packet in order to limit bus traffic (only the last
	 * CPU out will update it and only we are reading it).  No memory
	 * barrier required due to prior stores - yay x86.
	 */
	while (TP_GET_DONE(ts) == 0) {
		x86_pause();
	}
}

/*
 * pmap_tlb_intr: pmap shootdown interrupt handler to invalidate TLB entries.
 *
 * Called from IPI only.  We are outside the SPL framework, with interrupts
 * disabled on the CPU: be careful.
 *
 * TLB flush and the interrupt that brought us here are serializing
 * operations (they defeat speculative execution).  Any speculative load
 * producing a TLB fill between receipt of the interrupt and the TLB flush
 * will load "current" PTEs.  None of the mappings relied on by this ISR for
 * its execution will be changing.  So it's safe to acknowledge the request
 * and allow the initiator to proceed before performing the flush.
 */
void
pmap_tlb_intr(void)
{
	pmap_tlb_packet_t copy;
	volatile pmap_tlb_packet_t *source;
	struct cpu_info *ci;

	/* Make a private copy of the packet. */
	source = pmap_tlb_packet;
	copy = *source;

	/*
	 * If we are the last CPU out, clear the active pointer and mark the
	 * packet as done.  Both can be done without using an atomic, and
	 * the one atomic we do use serves as our memory barrier.
	 *
	 * It's important to clear the active pointer before setting
	 * TP_DONE, to ensure a remote CPU does not exit & re-enter
	 * pmap_tlb_shootnow() only to find its current pointer still
	 * seemingly active.
	 */
	if (atomic_dec_uint_nv(&pmap_tlb_pendcount) == 0) {
		atomic_store_relaxed(&pmap_tlb_packet, NULL);
		__insn_barrier();
		TP_SET_DONE(source);
	}
	pmap_tlb_invalidate(&copy);

	/*
	 * Check the current TLB state.  If we don't want further flushes
	 * for this pmap, then take the CPU out of the pmap's set.  The
	 * order of updates to the set and TLB state must closely align with
	 * the pmap code, as we can interrupt code running in the pmap
	 * module.
	 */
	ci = curcpu();
	if (ci->ci_tlbstate == TLBSTATE_LAZY && TP_GET_USERPMAP(&copy) != 0) {
		kcpuset_atomic_clear(ci->ci_pmap->pm_cpus, cpu_index(ci));
		ci->ci_tlbstate = TLBSTATE_STALE;
	}
}
534