/*	$NetBSD: xen_pmap.c,v 1.8 2011/11/08 17:16:52 cherry Exp $	*/

/*
 * Copyright (c) 2007 Manuel Bouyer.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 *
 */

/*
 * Copyright (c) 2006 Mathieu Ropert <mro@adviseo.fr>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */

/*
 * Copyright (c) 1997 Charles D. Cranor and Washington University.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR
 * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
 * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
 * IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
 * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
 * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
 * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
 * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
 * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
 */

/*
 * Copyright 2001 (c) Wasabi Systems, Inc.
 * All rights reserved.
 *
 * Written by Frank van der Linden for Wasabi Systems, Inc.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *      This product includes software developed for the NetBSD Project by
 *      Wasabi Systems, Inc.
 * 4. The name of Wasabi Systems, Inc. may not be used to endorse
 *    or promote products derived from this software without specific prior
 *    written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY WASABI SYSTEMS, INC. ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
 * PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL WASABI SYSTEMS, INC
 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
 * POSSIBILITY OF SUCH DAMAGE.
 */

#include <sys/cdefs.h>
__KERNEL_RCSID(0, "$NetBSD: xen_pmap.c,v 1.8 2011/11/08 17:16:52 cherry Exp $");

#include "opt_user_ldt.h"
#include "opt_lockdebug.h"
#include "opt_multiprocessor.h"
#include "opt_xen.h"
#if !defined(__x86_64__)
#include "opt_kstack_dr0.h"
#endif /* !defined(__x86_64__) */

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/proc.h>
#include <sys/pool.h>
#include <sys/kernel.h>
#include <sys/atomic.h>
#include <sys/cpu.h>
#include <sys/intr.h>
#include <sys/xcall.h>

#include <uvm/uvm.h>

#include <dev/isa/isareg.h>

#include <machine/specialreg.h>
#include <machine/gdt.h>
#include <machine/isa_machdep.h>
#include <machine/cpuvar.h>

#include <x86/pmap.h>
#include <x86/pmap_pv.h>

#include <x86/i82489reg.h>
#include <x86/i82489var.h>

#ifdef XEN
#include <xen/xen3-public/xen.h>
#include <xen/hypervisor.h>
#endif

#define COUNT(x)	/* nothing */

static pd_entry_t * const alternate_pdes[] = APDES_INITIALIZER;
extern pd_entry_t * const normal_pdes[];

extern paddr_t pmap_pa_start; /* PA of first physical page for this domain */
extern paddr_t pmap_pa_end;   /* PA of last physical page for this domain */

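/*
 * pmap_apte_flush: flush the alternate PTE (APTE) mappings of the given
 * pmap from every CPU that is currently using it.
 */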
void
pmap_apte_flush(struct pmap *pmap)
{

	KASSERT(kpreempt_disabled());

	/*
	 * Flush the APTE mapping from all other CPUs that
	 * are using the pmap we are using (whose APTE space
	 * is the one we've just modified).
	 *
	 * XXXthorpej -- find a way to defer the IPI.
	 */
	pmap_tlb_shootdown(pmap, (vaddr_t)-1LL, 0, TLBSHOOT_APTE);
	pmap_tlb_shootnow();
}

/*
 * Unmap the content of APDP PDEs
 */
void
pmap_unmap_apdp(void)
{
	int i;

	for (i = 0; i < PDP_SIZE; i++) {
		pmap_pte_set(APDP_PDE+i, 0);
#if defined (PAE)
		/*
		 * For PAE, there are two places where alternative recursive
		 * mappings could be found with Xen:
		 * - in the L2 shadow pages
		 * - the "real" L2 kernel page (pmap_kl2pd), which is unique
		 * and static.
		 * We first clear the APDP for the current pmap. As the L2
		 * kernel page is unique, we only need to do it once for
		 * all pmaps.
		 */
		pmap_pte_set(APDP_PDE_SHADOW+i, 0);
#endif
	}
}

/*
 * pmap_map_ptes: map a pmap's PTEs into KVM and lock them in
 *
 * => we lock enough pmaps to keep things locked in
 * => must be undone with pmap_unmap_ptes before returning
 */

void
pmap_map_ptes(struct pmap *pmap, struct pmap **pmap2,
	      pd_entry_t **ptepp, pd_entry_t * const **pdeppp)
{
	pd_entry_t opde, npde;
	struct pmap *ourpmap;
	struct cpu_info *ci;
	struct lwp *l;
	bool iscurrent;
	uint64_t ncsw;
	int s;

	/* the kernel's pmap is always accessible */
	if (pmap == pmap_kernel()) {
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		return;
	}
	KASSERT(kpreempt_disabled());

 retry:
	l = curlwp;
	ncsw = l->l_ncsw;
	ourpmap = NULL;
	ci = curcpu();
#if defined(__x86_64__)
	/*
	 * curmap can only be pmap_kernel so at this point
	 * pmap_is_curpmap is always false
	 */
	iscurrent = 0;
	ourpmap = pmap_kernel();
#else /* __x86_64__ */
	if (ci->ci_want_pmapload &&
	    vm_map_pmap(&l->l_proc->p_vmspace->vm_map) == pmap) {
		pmap_load();
		if (l->l_ncsw != ncsw)
			goto retry;
	}
	iscurrent = pmap_is_curpmap(pmap);
	/* if curpmap then we are always mapped */
	if (iscurrent) {
		mutex_enter(pmap->pm_lock);
		*pmap2 = NULL;
		*ptepp = PTE_BASE;
		*pdeppp = normal_pdes;
		goto out;
	}
	ourpmap = ci->ci_pmap;
#endif /* __x86_64__ */

	/* need to lock both curpmap and pmap: use ordered locking */
	pmap_reference(ourpmap);
	if ((uintptr_t) pmap < (uintptr_t) ourpmap) {
		mutex_enter(pmap->pm_lock);
		mutex_enter(ourpmap->pm_lock);
	} else {
		mutex_enter(ourpmap->pm_lock);
		mutex_enter(pmap->pm_lock);
	}

	if (l->l_ncsw != ncsw)
		goto unlock_and_retry;

	/* need to load a new alternate pt space into curpmap? */
	COUNT(apdp_pde_map);
	opde = *APDP_PDE;
	if (!pmap_valid_entry(opde) ||
	    pmap_pte2pa(opde) != pmap_pdirpa(pmap, 0)) {
		int i;
		s = splvm();
		/* Make recursive entry usable in user PGD */
		for (i = 0; i < PDP_SIZE; i++) {
			npde = pmap_pa2pte(
			    pmap_pdirpa(pmap, i * NPDPG)) | PG_k | PG_V;
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pmap, PDIR_SLOT_PTE + i)),
			    npde);
			xpq_queue_pte_update(xpmap_ptetomach(&APDP_PDE[i]),
			    npde);
#ifdef PAE
			/* update shadow entry too */
			xpq_queue_pte_update(
			    xpmap_ptetomach(&APDP_PDE_SHADOW[i]), npde);
#endif /* PAE */
			xpq_queue_invlpg(
			    (vaddr_t)&pmap->pm_pdir[PDIR_SLOT_PTE + i]);
		}
		if (pmap_valid_entry(opde))
			pmap_apte_flush(ourpmap);
		splx(s);
	}
	*pmap2 = ourpmap;
	*ptepp = APTE_BASE;
	*pdeppp = alternate_pdes;
	KASSERT(l->l_ncsw == ncsw);
#if !defined(__x86_64__)
 out:
#endif
	/*
	 * might have blocked, need to retry?
	 */
	if (l->l_ncsw != ncsw) {
 unlock_and_retry:
		if (ourpmap != NULL) {
			mutex_exit(ourpmap->pm_lock);
			pmap_destroy(ourpmap);
		}
		mutex_exit(pmap->pm_lock);
		goto retry;
	}
}

/*
 * pmap_unmap_ptes: unlock the PTE mapping of "pmap"
 */

void
pmap_unmap_ptes(struct pmap *pmap, struct pmap *pmap2)
{

	if (pmap == pmap_kernel()) {
		return;
	}
	KASSERT(kpreempt_disabled());
	if (pmap2 == NULL) {
		mutex_exit(pmap->pm_lock);
	} else {
#if defined(__x86_64__)
		KASSERT(pmap2 == pmap_kernel());
#else
		KASSERT(curcpu()->ci_pmap == pmap2);
#endif
#if defined(MULTIPROCESSOR)
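		/*
		 * Tear down the alternate PTE space and make sure the
		 * other CPUs drop any stale APTE translations.
		 */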
		pmap_unmap_apdp();
		pmap_pte_flush();
		pmap_apte_flush(pmap2);
#endif /* MULTIPROCESSOR */
		COUNT(apdp_pde_unmap);
		mutex_exit(pmap->pm_lock);
		mutex_exit(pmap2->pm_lock);
		pmap_destroy(pmap2);
	}
}

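/*
 * pmap_enter: enter a mapping for a physical address.
 *
 * This is a thin wrapper around pmap_enter_ma() that first translates
 * the physical address into a machine address; addresses outside this
 * domain's physical range are passed through unchanged (see the XXX
 * hack below).
 */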
int
pmap_enter(struct pmap *pmap, vaddr_t va, paddr_t pa, vm_prot_t prot, u_int flags)
{
	paddr_t ma;

	if (__predict_false(pa < pmap_pa_start || pmap_pa_end <= pa)) {
		ma = pa; /* XXX hack */
	} else {
		ma = xpmap_ptom(pa);
	}

	return pmap_enter_ma(pmap, va, ma, pa, prot, flags, DOMID_SELF);
}

/*
 * pmap_kenter_ma: enter a kernel mapping without R/M (pv_entry) tracking
 *
 * => no need to lock anything, assume va is already allocated
 * => should be faster than normal pmap enter function
 * => we expect a MACHINE address
 */

void
pmap_kenter_ma(vaddr_t va, paddr_t ma, vm_prot_t prot, u_int flags)
{
	pt_entry_t *pte, opte, npte;

	if (va < VM_MIN_KERNEL_ADDRESS)
		pte = vtopte(va);
	else
		pte = kvtopte(va);

	npte = ma | ((prot & VM_PROT_WRITE) ? PG_RW : PG_RO) |
	     PG_V | PG_k;
	if (flags & PMAP_NOCACHE)
		npte |= PG_N;

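	/* Mark the mapping non-executable if the CPU supports NX/XD. */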
	if ((cpu_feature[2] & CPUID_NOX) && !(prot & VM_PROT_EXECUTE))
		npte |= PG_NX;

	opte = pmap_pte_testset(pte, npte); /* zap! */

	if (pmap_valid_entry(opte)) {
#if defined(MULTIPROCESSOR)
		kpreempt_disable();
		pmap_tlb_shootdown(pmap_kernel(), va, opte, TLBSHOOT_KENTER);
		kpreempt_enable();
#else
		/* Don't bother deferring in the single CPU case. */
		pmap_update_pg(va);
#endif
	}
}

/*
 * pmap_extract_ma: extract a MA for the given VA
 */

bool
pmap_extract_ma(struct pmap *pmap, vaddr_t va, paddr_t *pap)
{
	pt_entry_t *ptes, pte;
	pd_entry_t pde;
	pd_entry_t * const *pdes;
	struct pmap *pmap2;

	kpreempt_disable();
	pmap_map_ptes(pmap, &pmap2, &ptes, &pdes);
	if (!pmap_pdes_valid(va, pdes, &pde)) {
		pmap_unmap_ptes(pmap, pmap2);
		kpreempt_enable();
		return false;
	}

	pte = ptes[pl1_i(va)];
	pmap_unmap_ptes(pmap, pmap2);
	kpreempt_enable();

	if (__predict_true((pte & PG_V) != 0)) {
		if (pap != NULL)
			*pap = (pte & PG_FRAME) | (va & (NBPD_L1 - 1));
		return true;
	}

	return false;
}

/*
 * Flush all APDP entries found in pmaps.
 * Required during Xen save/restore operations, as Xen does not
 * handle alternative recursive mappings properly.
 */
void
pmap_unmap_all_apdp_pdes(void)
{

	int i;
	int s;
	struct pmap *pm;

	s = splvm();

	pmap_unmap_apdp();

	mutex_enter(&pmaps_lock);
	/*
	 * Set APDP entries to 0 in all pmaps.
	 * Note that for PAE kernels, this only clears the APDP entries
	 * found in the L2 shadow pages, as pmap_pdirpa() is used to obtain
	 * the PA of the pmap->pm_pdir[] pages (forming the 4 contiguous
	 * pages of PAE PD: 3 for user space, 1 for the L2 kernel shadow page)
	 */
	LIST_FOREACH(pm, &pmaps, pm_list) {
		for (i = 0; i < PDP_SIZE; i++) {
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_APTE + i)),
			    0);
		}
	}
	mutex_exit(&pmaps_lock);

	xpq_flush_queue();

	splx(s);
}

#ifdef PAE
/*
 * NetBSD uses L2 shadow pages to support PAE with Xen. However, Xen does not
 * handle them correctly during save/restore, leading to incorrect page
 * tracking and pinning during restore.
 * For save/restore to succeed, two functions are introduced:
 * - pmap_map_recursive_entries(), used by resume code to set the recursive
 *   mapping entries to their correct value
 * - pmap_unmap_recursive_entries(), used by suspend code to clear all
 *   PDIR_SLOT_PTE entries
 */
void
pmap_map_recursive_entries(void)
{

	int i;
	struct pmap *pm;

	mutex_enter(&pmaps_lock);

	LIST_FOREACH(pm, &pmaps, pm_list) {
		for (i = 0; i < PDP_SIZE; i++) {
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_PTE + i)),
			    xpmap_ptom((pm)->pm_pdirpa[i]) | PG_V);
		}
	}

	mutex_exit(&pmaps_lock);

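	/* do it for pmap_kernel() too! */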
	for (i = 0; i < PDP_SIZE; i++) {
		xpq_queue_pte_update(
		    xpmap_ptom(pmap_pdirpa(pmap_kernel(), PDIR_SLOT_PTE + i)),
		    xpmap_ptom(pmap_kernel()->pm_pdirpa[i]) | PG_V);
	}

	xpq_flush_queue();
}

void
pmap_unmap_recursive_entries(void)
{

	int i;
	struct pmap *pm;

	pmap_invalidate_pool_caches();

	mutex_enter(&pmaps_lock);

	LIST_FOREACH(pm, &pmaps, pm_list) {
		for (i = 0; i < PDP_SIZE; i++) {
			xpq_queue_pte_update(
			    xpmap_ptom(pmap_pdirpa(pm, PDIR_SLOT_PTE + i)), 0);
		}
	}

	mutex_exit(&pmaps_lock);

	/* do it for pmap_kernel() too! */
	for (i = 0; i < PDP_SIZE; i++)
		xpq_queue_pte_update(
		    xpmap_ptom(pmap_pdirpa(pmap_kernel(), PDIR_SLOT_PTE + i)),
		    0);

	xpq_flush_queue();
}
#endif /* PAE */