xref: /plan9-contrib/sys/src/9k/k10/mmu.c (revision 38151b0b66407dd6bdaaa567048e0f2778fb0149)
#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"

#include "amd64.h"

#define ALIGNED(p, a)	(!(((uintptr)(p)) & ((a)-1)))

#define PDMAP		(0xffffffffff800000ull)
#define PDPX(v)		PTLX((v), 2)
#define PDX(v)		PTLX((v), 1)
#define PTX(v)		PTLX((v), 0)

#define VMAP		(0xffffffffe0000000ull)
#define VMAPSZ		(256*MiB)

#define KSEG1PML4	(0xffff000000000000ull\
			|(PTLX(KSEG1, 3)<<(((3)*PTSHFT)+PGSHFT))\
			|(PTLX(KSEG1, 3)<<(((2)*PTSHFT)+PGSHFT))\
			|(PTLX(KSEG1, 3)<<(((1)*PTSHFT)+PGSHFT))\
			|(PTLX(KSEG1, 3)<<(((0)*PTSHFT)+PGSHFT)))

#define KSEG1PTP(va, l)	((0xffff000000000000ull\
			|(KSEG1PML4<<((3-(l))*PTSHFT))\
			|(((va) & 0xffffffffffffull)>>(((l)+1)*PTSHFT))\
			& ~0xfffull))
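
/*
 * Editorial note: KSEG1PML4 is the virtual address at which the PML4
 * maps itself (mmuinit points one PML4 entry back at the PML4's own
 * physical address, the usual recursive page-table trick).
 * KSEG1PTP(va, l) then yields the kernel virtual address of the
 * level-l page-table page covering va without an explicit walk:
 * shifting va right by one extra level makes the hardware walker
 * resolve the table page itself.  Illustrative example (not from the
 * original source): mmuptpget(va, 0) is the PT page whose entry
 * PTLX(va, 0) holds the 4KiB mapping for va.
 */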

static Lock vmaplock;
static Page mach0pml4;

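/*
 * Flush user mappings: clear the user part of this mach's PML4
 * (pml4->daddr counts how many low PML4 slots hold user entries,
 * so only that prefix needs zeroing) and reload CR3 to discard
 * stale user translations.
 */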
void
mmuflushtlb(u64int)
{
	if(m->pml4->daddr){
		memset(UINT2PTR(m->pml4->va), 0, m->pml4->daddr*sizeof(PTE));
		m->pml4->daddr = 0;
	}
	cr3put(m->pml4->pa);
}

void
mmuflush(void)
{
	int s;

	s = splhi();
	up->newtlb = 1;
	mmuswitch(up);
	splx(s);
}

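/*
 * proc->mmuptp[1..3] list the page-table pages in use at each level
 * for this process; mmuptp[0] is a free list of recycled pages.
 * mmuptpfree moves every page onto the free list.  When not releasing,
 * it also clears each page's entry in its parent table (and the
 * contents of level-1 pages) so the pages can be reused safely.
 */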
static void
mmuptpfree(Proc* proc, int release)
{
	int l;
	PTE *pte;
	Page **last, *page;

	/*
	 * To do here:
	 *	coalesce the clean and release functionality
	 *	(it's either one or the other, and no need for
	 *	wakeup in mmurelease as not using the palloc pool);
	 *	0-based levels, not 1-based, for consistency;
	 *	fix memset level for 2MiB pages;
	 *	use a dedicated datastructure rather than Page?
	 */
	for(l = 1; l < 4; l++){
		last = &proc->mmuptp[l];
		if(*last == nil)
			continue;
		for(page = *last; page != nil; page = page->next){
			if(!release){
				if(l == 1)
					memset(UINT2PTR(page->va), 0, PTSZ);
				pte = UINT2PTR(page->prev->va);
				pte[page->daddr] = 0;
			}
			last = &page->next;
		}
		*last = proc->mmuptp[0];
		proc->mmuptp[0] = proc->mmuptp[l];
		proc->mmuptp[l] = nil;
	}

	m->pml4->daddr = 0;
}

static Page*
mmuptpalloc(void)
{
	Page *page;
	uintmem pa;
	int color;

	/*
	 * Do not really need a whole Page structure,
	 * but it makes testing this out a lot easier.
	 * Could keep a cache and free excess.
	 */
	if((page = malloc(sizeof(Page))) == nil){
		print("mmuptpalloc Page\n");

		return nil;
	}
	color = NOCOLOR;
	if((pa = physalloc(PTSZ, &color, page)) == 0){
		print("mmuptpalloc pa\n");
		free(page);

		return nil;
	}

	page->va = PTR2UINT(KADDR(pa));
	page->pa = pa;
	page->ref = 1;
	page->color = color;
	memset(UINT2PTR(page->va), 0, PTSZ);

	return page;
}

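/*
 * Make the MMU state current for proc: if newtlb is set, discard its
 * page-table pages first; rebuild the user entries of this mach's
 * PML4 from proc->mmuptp[3]; point the TSS rsp0 at proc's kernel
 * stack; and load CR3.
 */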
void
mmuswitch(Proc* proc)
{
	PTE *pte;
	Page *page;

	if(proc->newtlb){
		mmuptpfree(proc, 0);
		proc->newtlb = 0;
	}

	if(m->pml4->daddr){
		memset(UINT2PTR(m->pml4->va), 0, m->pml4->daddr*sizeof(PTE));
		m->pml4->daddr = 0;
	}

	pte = UINT2PTR(m->pml4->va);
	for(page = proc->mmuptp[3]; page != nil; page = page->next){
		pte[page->daddr] = PPN(page->pa)|PteU|PteRW|PteP;
		if(page->daddr >= m->pml4->daddr)
			m->pml4->daddr = page->daddr+1;
		page->prev = m->pml4;
	}

	tssrsp0(STACKALIGN(PTR2UINT(proc->kstack+KSTACK)));
	cr3put(m->pml4->pa);
}

void
mmurelease(Proc* proc)
{
	Page *page, *next;

	/*
	 * See comments in mmuptpfree above.
	 */
	mmuptpfree(proc, 1);

	for(page = proc->mmuptp[0]; page != nil; page = next){
		next = page->next;
		if(--page->ref)
			panic("mmurelease: page->ref %d\n", page->ref);
		physfree(page->pa, PTSZ);
		free(page);
	}
	if(proc->mmuptp[0] && palloc.r.p)
		wakeup(&palloc.r);
	proc->mmuptp[0] = nil;

	tssrsp0(STACKALIGN(m->stack+MACHSTKSZ));
	cr3put(m->pml4->pa);
}

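/*
 * Return a pointer to the level page-table page covering va,
 * via the KSEG1 recursive mapping installed in mmuinit.
 */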
static PTE*
mmuptpget(uintptr va, int level)
{
	return (PTE*)KSEG1PTP(va, level);
}

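/*
 * Install the user mapping va -> pa on behalf of the current process.
 * Walk from the PML4 down to level 0, reusing the process's existing
 * intermediate page-table pages where they exist and otherwise taking
 * one from the mmuptp[0] free list (or allocating afresh).
 * Interrupts are disabled for the walk.
 */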
void
mmuput(uintptr va, uintmem pa, Page*)
{
	Mpl pl;
	int l, x;
	PTE *pte, *ptp;
	Page *page, *prev;

	pte = nil;
	pl = splhi();
	prev = m->pml4;
	for(l = 3; l >= 0; l--){
		ptp = mmuptpget(va, l);
		x = PTLX(va, l);
		pte = &ptp[x];
		for(page = up->mmuptp[l]; page != nil; page = page->next){
			if(page->prev == prev && page->daddr == x)
				break;
		}
		if(page == nil){
			if(up->mmuptp[0] == nil)
				page = mmuptpalloc();
			else {
				page = up->mmuptp[0];
				up->mmuptp[0] = page->next;
			}
			page->daddr = x;
			page->next = up->mmuptp[l];
			up->mmuptp[l] = page;
			page->prev = prev;
			*pte = PPN(page->pa)|PteU|PteRW|PteP;
			if(l == 3 && x >= m->pml4->daddr)
				m->pml4->daddr = x+1;
		}
		prev = page;
	}

	*pte = pa|PteU;
//if(pa & PteRW)
//  *pte |= PteNX;
	splx(pl);

	invlpg(va);			/* only if old entry valid? */
}

static PTE
pdeget(uintptr va)
{
	PTE *pdp;

	if(va < 0xffffffffc0000000ull)
		panic("pdeget(%#p)", va);

	pdp = (PTE*)(PDMAP+PDX(PDMAP)*4096);

	return pdp[PDX(va)];
}

/*
 * Add kernel mappings for pa -> va for a section of size bytes.
 * Called only after the va range is known to be unoccupied.
 */
static int
pdmap(uintmem pa, int attr, uintptr va, usize size)
{
	uintmem pae;
	PTE *pd, *pde, *pt, *pte;
	uintmem pdpa;
	int pdx, pgsz, color;

	pd = (PTE*)(PDMAP+PDX(PDMAP)*4096);

	for(pae = pa + size; pa < pae; pa += pgsz){
		pdx = PDX(va);
		pde = &pd[pdx];

		/*
		 * Check if it can be mapped using a big page,
		 * i.e. is big enough and starts on a suitable boundary.
		 * Assume processor can do it.
		 */
		if(ALIGNED(pa, PGLSZ(1)) && ALIGNED(va, PGLSZ(1)) && (pae-pa) >= PGLSZ(1)){
			assert(*pde == 0);
			*pde = pa|attr|PtePS|PteP;
			pgsz = PGLSZ(1);
		}
		else{
			pt = (PTE*)(PDMAP+pdx*PTSZ);
			if(*pde == 0){
				color = NOCOLOR;
				pdpa = physalloc(PTSZ, &color, nil);
				if(pdpa == 0)
					panic("pdmap");
				*pde = pdpa|PteRW|PteP;
				memset(pt, 0, PTSZ);
			}

			pte = &pt[PTX(va)];
			assert(!(*pte & PteP));
			*pte = pa|attr|PteP;
			pgsz = PGLSZ(0);
		}
		va += pgsz;
	}

	return 0;
}

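/*
 * Return the index of the first run of count consecutive zero
 * entries in the n-entry table a, or -1 if there is none.
 */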
static int
findhole(PTE* a, int n, int count)
{
	int have, i;

	have = 0;
	for(i = 0; i < n; i++){
		if(a[i] == 0)
			have++;
		else
			have = 0;
		if(have >= count)
			return i+1 - have;
	}

	return -1;
}

/*
 * Look for free space in the vmap.
 */
static uintptr
vmapalloc(usize size)
{
	int i, n, o;
	PTE *pd, *pt;
	int pdsz, ptsz;

	pd = (PTE*)(PDMAP+PDX(PDMAP)*4096);
	pd += PDX(VMAP);
	pdsz = VMAPSZ/PGLSZ(1);

	/*
	 * Look directly in the PD entries if the size is
	 * larger than the range mapped by a single entry.
	 */
	if(size >= PGLSZ(1)){
		n = HOWMANY(size, PGLSZ(1));
		if((o = findhole(pd, pdsz, n)) != -1)
			return VMAP + o*PGLSZ(1);
		return 0;
	}

	/*
	 * Size is smaller than that mapped by a single PD entry.
	 * Look for an already mapped PT page that has room.
	 */
	n = HOWMANY(size, PGLSZ(0));
	ptsz = PGLSZ(0)/sizeof(PTE);
	for(i = 0; i < pdsz; i++){
		if(!(pd[i] & PteP) || (pd[i] & PtePS))
			continue;

		pt = (PTE*)(PDMAP+(PDX(VMAP)+i)*4096);
		if((o = findhole(pt, ptsz, n)) != -1)
			return VMAP + i*PGLSZ(1) + o*PGLSZ(0);
	}

	/*
	 * Nothing suitable, start using a new PD entry.
	 */
	if((o = findhole(pd, pdsz, 1)) != -1)
		return VMAP + o*PGLSZ(1);

	return 0;
}

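/*
 * Map the physical range [pa, pa+size) uncached (PtePCD) into the
 * VMAP window and return the corresponding virtual address, or nil
 * on failure.  Requests entirely below 1MiB are assumed to be mapped
 * already and are satisfied through KADDR; requests that straddle
 * the 1MiB boundary are refused.  Only valid on CPU0.
 */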
void*
vmap(uintmem pa, usize size)
{
	uintptr va;
	usize o, sz;

	DBG("vmap(%#P, %lud)\n", pa, size);

	if(m->machno != 0)
		panic("vmap");

	/*
	 * This is incomplete; the checks are not comprehensive
	 * enough.
	 * Sometimes the request is for an already-mapped piece
	 * of low memory, in which case just return a good value
	 * and hope that a corresponding vunmap of the address
	 * will have the same address.
	 * To do this properly will require keeping track of the
	 * mappings; perhaps something like kmap, but kmap probably
	 * can't be used early enough for some of the uses.
	 */
	if(pa+size < 1ull*MiB)
		return KADDR(pa);
	if(pa < 1ull*MiB)
		return nil;

	/*
	 * Might be asking for less than a page.
	 * This should have a smaller granularity if
	 * the page size is large.
	 */
	o = pa & ((1<<PGSHFT)-1);
	pa -= o;
	sz = ROUNDUP(size+o, PGSZ);

	if(pa == 0){
		DBG("vmap(0, %lud) pc=%#p\n", size, getcallerpc(&pa));
		return nil;
	}
	ilock(&vmaplock);
	if((va = vmapalloc(sz)) == 0 || pdmap(pa, PtePCD|PteRW, va, sz) < 0){
		iunlock(&vmaplock);
		return nil;
	}
	iunlock(&vmaplock);

	DBG("vmap(%#P, %lud) => %#p\n", pa+o, size, va+o);

	return UINT2PTR(va + o);
}

void
vunmap(void* v, usize size)
{
	uintptr va;

	DBG("vunmap(%#p, %lud)\n", v, size);

	if(m->machno != 0)
		panic("vunmap");

	/*
	 * See the comments above in vmap.
	 */
	va = PTR2UINT(v);
	if(va >= KZERO && va+size < KZERO+1ull*MiB)
		return;

	/*
	 * Here will have to deal with releasing any
	 * resources used for the allocation (e.g. page table
	 * pages).
	 */
	DBG("vunmap(%#p, %lud)\n", v, size);
}

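/*
 * Walk the page tables for va down to the given level and set *ret
 * to the PTE found there; return the level actually reached.
 * The walk stops early at a missing entry (unless alloc is supplied
 * to provide new page-table pages) or at a large (PtePS) page.
 */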
int
mmuwalk(uintptr va, int level, PTE** ret, u64int (*alloc)(usize))
{
//alloc and pa - uintmem or PTE or what?
	int l;
	Mpl pl;
	uintptr pa;
	PTE *pte, *ptp;

	DBG("mmuwalk%d: va %#p level %d\n", m->machno, va, level);
	pte = nil;
	pl = splhi();
	for(l = 3; l >= 0; l--){
		ptp = mmuptpget(va, l);
		pte = &ptp[PTLX(va, l)];
		if(l == level)
			break;
		if(!(*pte & PteP)){
			if(alloc == nil)
				break;
			pa = alloc(PTSZ);
			if(pa == ~0){
				splx(pl);	/* restore interrupts on the error path */
				return -1;
			}
if(pa & 0xfffull) print("mmuwalk pa %#llux\n", pa);
			*pte = pa|PteRW|PteP;
			if((ptp = mmuptpget(va, l-1)) == nil)
				panic("mmuwalk: mmuptpget(%#p, %d)\n", va, l-1);
			memset(ptp, 0, PTSZ);
		}
		else if(*pte & PtePS)
			break;
	}
	*ret = pte;
	splx(pl);

	return l;
}

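/*
 * Illustrative note (assuming the usual amd64 values PTSHFT 9 and
 * PGSHFT 12): the level at which mmuwalk stops selects the page size,
 * so a walk that ends at l == 1 (a 2MiB page) gives
 * mask == 0x1fffff and pa == (*pte & ~0x1fffff) + (va & 0x1fffff).
 */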
u64int
mmuphysaddr(uintptr va)
{
	int l;
	PTE *pte;
	u64int mask, pa;

	/*
	 * Given a VA, find the PA.
	 * This is probably not the right interface,
	 * but will do as an experiment. Usual
	 * question, should va be void* or uintptr?
	 */
	l = mmuwalk(va, 0, &pte, nil);
	DBG("mmuphysaddr: va %#p l %d\n", va, l);
	if(l < 0)
		return ~0;

	mask = (1ull<<(((l)*PTSHFT)+PGSHFT))-1;
	pa = (*pte & ~mask) + (va & mask);

	DBG("mmuphysaddr: l %d va %#p pa %#llux\n", l, va, pa);

	return pa;
}

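/*
 * On CPU0: adopt the bootstrap PML4, enable no-execute, set the
 * kernel memory-allocator limits for meminit, and install both the
 * PDMAP window and the KSEG1 recursive map.  On the other CPUs:
 * copy CPU0's PML4 to the top of the mach stack (a temporary kludge,
 * as the comment below notes) and switch to it.
 */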
void
mmuinit(void)
{
	int l;
	uchar *p;
	PTE *pte;
	Page *page;
	uintptr pml4;
	u64int o, pa, r, sz;

	archmmu();
	DBG("mach%d: %#p npgsz %d\n", m->machno, m, m->npgsz);
	if(m->machno != 0){
		/*
		 * GAK: Has to go when each mach is using
		 * its own page table
		 */
		p = UINT2PTR(m->stack);
		p += MACHSTKSZ;
		memmove(p, UINT2PTR(mach0pml4.va), PTSZ);
		m->pml4 = &m->pml4kludge;
		m->pml4->va = PTR2UINT(p);
		m->pml4->pa = PADDR(p);
		m->pml4->daddr = mach0pml4.daddr;	/* # of user mappings in pml4 */
		if(m->pml4->daddr){
			memset(p, 0, m->pml4->daddr*sizeof(PTE));
			m->pml4->daddr = 0;
		}
pte = (PTE*)p;
pte[PTLX(KSEG1PML4, 3)] = m->pml4->pa|PteRW|PteP;

		r = rdmsr(Efer);
		r |= Nxe;
		wrmsr(Efer, r);
		cr3put(m->pml4->pa);
		DBG("mach%d: %#p pml4 %#p\n", m->machno, m, m->pml4);
		return;
	}

	page = &mach0pml4;
	page->pa = cr3get();
	page->va = PTR2UINT(sys->pml4);

	m->pml4 = page;

	r = rdmsr(Efer);
	r |= Nxe;
	wrmsr(Efer, r);

	/*
	 * Set up the various kernel memory allocator limits:
	 * pmstart/pmend bound the unused physical memory;
	 * vmstart/vmend bound the total possible virtual memory
	 * used by the kernel;
	 * vmunused is the highest virtual address currently mapped
	 * and used by the kernel;
	 * vmunmapped is the highest virtual address currently
	 * mapped by the kernel.
	 * Vmunused can be bumped up to vmunmapped before more
	 * physical memory needs to be allocated and mapped.
	 *
	 * This is set up here so meminit can map appropriately.
	 */
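	/*
	 * Thus KSEG0 == vmstart <= vmunused <= vmunmapped <= vmend.
	 * Illustrative example (not from the source): a pmstart of
	 * 18MiB gives vmunused = KSEG0+18MiB and, after rounding the
	 * asmalloc'd region up to 4MiB, vmunmapped = KSEG0+20MiB.
	 */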
	o = sys->pmstart;
	sz = ROUNDUP(o, 4*MiB) - o;
	pa = asmalloc(0, sz, 1, 0);
	if(pa != o)
		panic("mmuinit: pa %#llux memstart %#llux\n", pa, o);
	sys->pmstart += sz;

	sys->vmstart = KSEG0;
	sys->vmunused = sys->vmstart + ROUNDUP(o, 4*KiB);
	sys->vmunmapped = sys->vmstart + o + sz;
	sys->vmend = sys->vmstart + TMFM;

	print("mmuinit: vmstart %#p vmunused %#p vmunmapped %#p vmend %#p\n",
		sys->vmstart, sys->vmunused, sys->vmunmapped, sys->vmend);

	/*
	 * Set up the map for PD entry access by inserting
	 * the relevant PDP entry into the PD. It's equivalent
	 * to PADDR(sys->pd)|PteRW|PteP.
	 *
	 * Change code that uses this to use the KSEG1PML4
	 * map below.
	 */
	sys->pd[PDX(PDMAP)] = sys->pdp[PDPX(PDMAP)] & ~(PteD|PteA);
	print("sys->pd %#p %#p\n", sys->pd[PDX(PDMAP)], sys->pdp[PDPX(PDMAP)]);

	assert((pdeget(PDMAP) & ~(PteD|PteA)) == (PADDR(sys->pd)|PteRW|PteP));

	/*
	 * Set up the map for PTE access by inserting
	 * the relevant PML4 into itself.
	 * Note: outwith level 0, PteG is MBZ on AMD processors,
	 * is 'Reserved' on Intel processors, and the behaviour
	 * can be different.
	 */
	pml4 = cr3get();
	sys->pml4[PTLX(KSEG1PML4, 3)] = pml4|PteRW|PteP;
	cr3put(m->pml4->pa);

	if((l = mmuwalk(KZERO, 3, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);
	if((l = mmuwalk(KZERO, 2, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);
	if((l = mmuwalk(KZERO, 1, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);
	if((l = mmuwalk(KZERO, 0, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);

	mmuphysaddr(PTR2UINT(end));
}

void
mmucachectl(Page *p, uint why)
{
	if(!pagedout(p))
		memset(p->cachectl, why, sizeof(p->cachectl));
}

/*
 * Double-check the user MMU.
 * Error checking only.
 */
void
checkmmu(uintptr va, uintmem pa)
{
	uintmem mpa;

	mpa = mmuphysaddr(va);
	if(mpa != ~(uintmem)0 && mpa != pa)
		print("%d %s: va=%#p pa=%#P mmupa=%#P\n",
			up->pid, up->text, va, pa, mpa);
}