#include "u.h"
#include "../port/lib.h"
#include "mem.h"
#include "dat.h"
#include "fns.h"

#include "amd64.h"

#define ALIGNED(p, a)	(!(((uintptr)(p)) & ((a)-1)))

#define PDMAP		(0xffffffffff800000ull)
#define PDPX(v)		PTLX((v), 2)
#define PDX(v)		PTLX((v), 1)
#define PTX(v)		PTLX((v), 0)

#define VMAP		(0xffffffffe0000000ull)
#define VMAPSZ		(256*MiB)

#define KSEG1PML4	(0xffff000000000000ull\
			|(PTLX(KSEG1, 3)<<(((3)*PTSHFT)+PGSHFT))\
			|(PTLX(KSEG1, 3)<<(((2)*PTSHFT)+PGSHFT))\
			|(PTLX(KSEG1, 3)<<(((1)*PTSHFT)+PGSHFT))\
			|(PTLX(KSEG1, 3)<<(((0)*PTSHFT)+PGSHFT)))

#define KSEG1PTP(va, l)	((0xffff000000000000ull\
			|(KSEG1PML4<<((3-(l))*PTSHFT))\
			|(((va) & 0xffffffffffffull)>>(((l)+1)*PTSHFT))\
			& ~0xfffull))

static Lock vmaplock;
static Page mach0pml4;

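/*
 * Flush the TLB by reloading CR3 with this processor's PML4,
 * first clearing any user entries recorded in pml4->daddr.
 */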
void
mmuflushtlb(u64int)
{
	if(m->pml4->daddr){
		memset(UINT2PTR(m->pml4->va), 0, m->pml4->daddr*sizeof(PTE));
		m->pml4->daddr = 0;
	}
	cr3put(m->pml4->pa);
}

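/*
 * Flush the current process's user mappings on this processor:
 * mark them stale and rebuild them via mmuswitch.
 */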
void
mmuflush(void)
{
	int s;

	s = splhi();
	up->newtlb = 1;
	mmuswitch(up);
	splx(s);
}

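/*
 * Move a process's page-table pages (levels 1..3) onto its
 * level-0 free list.  Unless release is set, also zero the
 * level-1 pages and clear the parent entries that referenced
 * them, so the pages can be reused directly.
 */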
static void
mmuptpfree(Proc* proc, int release)
{
	int l;
	PTE *pte;
	Page **last, *page;

	/*
	 * To do here:
	 *	coalesce the clean and release functionality
	 *	(it's either one or the other, and no need for
	 *	wakeup in mmurelease as not using the palloc pool);
	 *	0-based levels, not 1-based, for consistency;
	 *	fix memset level for 2MiB pages;
	 *	use a dedicated datastructure rather than Page?
	 */
	for(l = 1; l < 4; l++){
		last = &proc->mmuptp[l];
		if(*last == nil)
			continue;
		for(page = *last; page != nil; page = page->next){
			if(!release){
				if(l == 1)
					memset(UINT2PTR(page->va), 0, PTSZ);
				pte = UINT2PTR(page->prev->va);
				pte[page->daddr] = 0;
			}
			last = &page->next;
		}
		*last = proc->mmuptp[0];
		proc->mmuptp[0] = proc->mmuptp[l];
		proc->mmuptp[l] = nil;
	}

	m->pml4->daddr = 0;
}

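/*
 * Allocate and zero a page-table page, described by a
 * malloc'd Page structure; returns nil on failure.
 */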
static Page*
mmuptpalloc(void)
{
	Page *page;
	uintmem pa;
	int color;

	/*
	 * Do not really need a whole Page structure,
	 * but it makes testing this out a lot easier.
	 * Could keep a cache and free excess.
	 */
	if((page = malloc(sizeof(Page))) == nil){
		print("mmuptpalloc Page\n");

		return nil;
	}
	color = NOCOLOR;
	if((pa = physalloc(PTSZ, &color, page)) == 0){
		print("mmuptpalloc pa\n");
		free(page);

		return nil;
	}

	page->va = PTR2UINT(KADDR(pa));
	page->pa = pa;
	page->ref = 1;
	page->color = color;
	memset(UINT2PTR(page->va), 0, PTSZ);

	return page;
}

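/*
 * Switch to proc's address space: rebuild stale user page
 * tables if necessary, copy the process's top-level entries
 * into this processor's PML4, set the kernel stack in the
 * TSS and reload CR3.
 */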
void
mmuswitch(Proc* proc)
{
	PTE *pte;
	Page *page;

	if(proc->newtlb){
		mmuptpfree(proc, 0);
		proc->newtlb = 0;
	}

	if(m->pml4->daddr){
		memset(UINT2PTR(m->pml4->va), 0, m->pml4->daddr*sizeof(PTE));
		m->pml4->daddr = 0;
	}

	pte = UINT2PTR(m->pml4->va);
	for(page = proc->mmuptp[3]; page != nil; page = page->next){
		pte[page->daddr] = PPN(page->pa)|PteU|PteRW|PteP;
		if(page->daddr >= m->pml4->daddr)
			m->pml4->daddr = page->daddr+1;
		page->prev = m->pml4;
	}

	tssrsp0(STACKALIGN(PTR2UINT(proc->kstack+KSTACK)));
	cr3put(m->pml4->pa);
}

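/*
 * Release the MMU resources of an exiting process: return its
 * page-table pages to the physical allocator, then reload the
 * kernel stack and CR3 for this processor.
 */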
void
mmurelease(Proc* proc)
{
	Page *page, *next;

	/*
	 * See comments in mmuptpfree above.
	 */
	mmuptpfree(proc, 1);

	for(page = proc->mmuptp[0]; page != nil; page = next){
		next = page->next;
		if(--page->ref)
			panic("mmurelease: page->ref %d\n", page->ref);
		physfree(page->pa, PTSZ);
		free(page);
	}
	if(proc->mmuptp[0] && palloc.r.p)
		wakeup(&palloc.r);
	proc->mmuptp[0] = nil;

	tssrsp0(STACKALIGN(m->stack+MACHSTKSZ));
	cr3put(m->pml4->pa);
}

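/*
 * Return a virtual address for the page-table page at the
 * given level covering va, using the KSEG1 self-referencing
 * PML4 map set up in mmuinit.
 */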
static PTE*
mmuptpget(uintptr va, int level)
{
	return (PTE*)KSEG1PTP(va, level);
}

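/*
 * Install the user PTE for va -> pa in the current process's
 * address space, allocating (or reusing from the level-0 free
 * list) any page-table pages missing along the walk.
 */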
void
mmuput(uintptr va, uintmem pa, Page*)
{
	Mpl pl;
	int l, x;
	PTE *pte, *ptp;
	Page *page, *prev;

	pte = nil;
	pl = splhi();
	prev = m->pml4;
	for(l = 3; l >= 0; l--){
		ptp = mmuptpget(va, l);
		x = PTLX(va, l);
		pte = &ptp[x];
		for(page = up->mmuptp[l]; page != nil; page = page->next){
			if(page->prev == prev && page->daddr == x)
				break;
		}
		if(page == nil){
			if(up->mmuptp[0] == nil)
				page = mmuptpalloc();
			else {
				page = up->mmuptp[0];
				up->mmuptp[0] = page->next;
			}
			page->daddr = x;
			page->next = up->mmuptp[l];
			up->mmuptp[l] = page;
			page->prev = prev;
			*pte = PPN(page->pa)|PteU|PteRW|PteP;
			if(l == 3 && x >= m->pml4->daddr)
				m->pml4->daddr = x+1;
		}
		prev = page;
	}

	*pte = pa|PteU;
//if(pa & PteRW)
//	*pte |= PteNX;
	splx(pl);

	invlpg(va);			/* only if old entry valid? */
}

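/*
 * Return the PD entry covering a kernel va in the PDMAP
 * region; currently used only by the assertion in mmuinit.
 */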
static PTE
pdeget(uintptr va)
{
	PTE *pdp;

	if(va < 0xffffffffc0000000ull)
		panic("pdeget(%#p)", va);

	pdp = (PTE*)(PDMAP+PDX(PDMAP)*4096);

	return pdp[PDX(va)];
}

/*
 * Add kernel mappings for pa -> va for a section of size bytes.
 * Called only after the va range is known to be unoccupied.
 */
static int
pdmap(uintmem pa, int attr, uintptr va, usize size)
{
	uintmem pae;
	PTE *pd, *pde, *pt, *pte;
	uintmem pdpa;
	int pdx, pgsz, color;

	pd = (PTE*)(PDMAP+PDX(PDMAP)*4096);

	for(pae = pa + size; pa < pae; pa += pgsz){
		pdx = PDX(va);
		pde = &pd[pdx];

		/*
		 * Check if it can be mapped using a big page,
		 * i.e. is big enough and starts on a suitable boundary.
		 * Assume processor can do it.
		 */
		if(ALIGNED(pa, PGLSZ(1)) && ALIGNED(va, PGLSZ(1)) && (pae-pa) >= PGLSZ(1)){
			assert(*pde == 0);
			*pde = pa|attr|PtePS|PteP;
			pgsz = PGLSZ(1);
		}
		else{
			pt = (PTE*)(PDMAP+pdx*PTSZ);
			if(*pde == 0){
				color = NOCOLOR;
				pdpa = physalloc(PTSZ, &color, nil);
				if(pdpa == 0)
					panic("pdmap");
				*pde = pdpa|PteRW|PteP;
				memset(pt, 0, PTSZ);
			}

			pte = &pt[PTX(va)];
			assert(!(*pte & PteP));
			*pte = pa|attr|PteP;
			pgsz = PGLSZ(0);
		}
		va += pgsz;
	}

	return 0;
}

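/*
 * Return the index of the first run of count consecutive
 * zero entries in a[0..n-1], or -1 if there is none.
 */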
static int
findhole(PTE* a, int n, int count)
{
	int have, i;

	have = 0;
	for(i = 0; i < n; i++){
		if(a[i] == 0)
			have++;
		else
			have = 0;
		if(have >= count)
			return i+1 - have;
	}

	return -1;
}

/*
 * Look for free space in the vmap.
 */
static uintptr
vmapalloc(usize size)
{
	int i, n, o;
	PTE *pd, *pt;
	int pdsz, ptsz;

	pd = (PTE*)(PDMAP+PDX(PDMAP)*4096);
	pd += PDX(VMAP);
	pdsz = VMAPSZ/PGLSZ(1);

	/*
	 * Look directly in the PD entries if the size is
	 * larger than the range mapped by a single entry.
	 */
	if(size >= PGLSZ(1)){
		n = HOWMANY(size, PGLSZ(1));
		if((o = findhole(pd, pdsz, n)) != -1)
			return VMAP + o*PGLSZ(1);
		return 0;
	}

	/*
	 * Size is smaller than that mapped by a single PD entry.
	 * Look for an already mapped PT page that has room.
	 */
	n = HOWMANY(size, PGLSZ(0));
	ptsz = PGLSZ(0)/sizeof(PTE);
	for(i = 0; i < pdsz; i++){
		if(!(pd[i] & PteP) || (pd[i] & PtePS))
			continue;

		pt = (PTE*)(PDMAP+(PDX(VMAP)+i)*4096);
		if((o = findhole(pt, ptsz, n)) != -1)
			return VMAP + i*PGLSZ(1) + o*PGLSZ(0);
	}

	/*
	 * Nothing suitable, start using a new PD entry.
	 */
	if((o = findhole(pd, pdsz, 1)) != -1)
		return VMAP + o*PGLSZ(1);

	return 0;
}

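/*
 * Map size bytes of physical address space starting at pa
 * into the kernel's VMAP region, uncached, and return the
 * corresponding virtual address; low memory is assumed to be
 * mapped already (see the comment inside).
 */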
void*
vmap(uintmem pa, usize size)
{
	uintptr va;
	usize o, sz;

	DBG("vmap(%#P, %lud)\n", pa, size);

	if(m->machno != 0)
		panic("vmap");

	/*
	 * This is incomplete; the checks are not comprehensive
	 * enough.
	 * Sometimes the request is for an already-mapped piece
	 * of low memory, in which case just return a good value
	 * and hope that a corresponding vunmap of the address
	 * will have the same address.
	 * To do this properly will require keeping track of the
	 * mappings; perhaps something like kmap, but kmap probably
	 * can't be used early enough for some of the uses.
	 */
	if(pa+size < 1ull*MiB)
		return KADDR(pa);
	if(pa < 1ull*MiB)
		return nil;

	/*
	 * Might be asking for less than a page.
	 * This should have a smaller granularity if
	 * the page size is large.
	 */
	o = pa & ((1<<PGSHFT)-1);
	pa -= o;
	sz = ROUNDUP(size+o, PGSZ);

	if(pa == 0){
		DBG("vmap(0, %lud) pc=%#p\n", size, getcallerpc(&pa));
		return nil;
	}
	ilock(&vmaplock);
	if((va = vmapalloc(sz)) == 0 || pdmap(pa, PtePCD|PteRW, va, sz) < 0){
		iunlock(&vmaplock);
		return nil;
	}
	iunlock(&vmaplock);

	DBG("vmap(%#P, %lud) => %#p\n", pa+o, size, va+o);

	return UINT2PTR(va + o);
}

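/*
 * Undo a vmap.  For now this only recognises the low-memory
 * case and otherwise just logs the request (see the comment
 * inside about releasing resources).
 */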
void
vunmap(void* v, usize size)
{
	uintptr va;

	DBG("vunmap(%#p, %lud)\n", v, size);

	if(m->machno != 0)
		panic("vunmap");

	/*
	 * See the comments above in vmap.
	 */
	va = PTR2UINT(v);
	if(va >= KZERO && va+size < KZERO+1ull*MiB)
		return;

	/*
	 * Here will have to deal with releasing any
	 * resources used for the allocation (e.g. page table
	 * pages).
	 */
	DBG("vunmap(%#p, %lud)\n", v, size);
}

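/*
 * Walk the page table for va down to the given level and
 * return a pointer to the entry there via ret.  If alloc is
 * non-nil, missing intermediate page-table pages are
 * allocated with it.  The return value is the level actually
 * reached (a large-page or missing entry may stop the walk
 * early), or -1 if an allocation fails.
 */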
int
mmuwalk(uintptr va, int level, PTE** ret, u64int (*alloc)(usize))
{
//alloc and pa - uintmem or PTE or what?
	int l;
	Mpl pl;
	uintptr pa;
	PTE *pte, *ptp;

	DBG("mmuwalk%d: va %#p level %d\n", m->machno, va, level);
	pte = nil;
	pl = splhi();
	for(l = 3; l >= 0; l--){
		ptp = mmuptpget(va, l);
		pte = &ptp[PTLX(va, l)];
		if(l == level)
			break;
		if(!(*pte & PteP)){
			if(alloc == nil)
				break;
			pa = alloc(PTSZ);
			if(pa == ~0)
				return -1;
			if(pa & 0xfffull)
				print("mmuwalk pa %#llux\n", pa);
			*pte = pa|PteRW|PteP;
			if((ptp = mmuptpget(va, l-1)) == nil)
				panic("mmuwalk: mmuptpget(%#p, %d)\n", va, l-1);
			memset(ptp, 0, PTSZ);
		}
		else if(*pte & PtePS)
			break;
	}
	*ret = pte;
	splx(pl);

	return l;
}

u64int
mmuphysaddr(uintptr va)
{
	int l;
	PTE *pte;
	u64int mask, pa;

	/*
	 * Given a VA, find the PA.
	 * This is probably not the right interface,
	 * but will do as an experiment. Usual
	 * question, should va be void* or uintptr?
	 */
	l = mmuwalk(va, 0, &pte, nil);
	DBG("mmuphysaddr: va %#p l %d\n", va, l);
	if(l < 0)
		return ~0;

	mask = (1ull<<(((l)*PTSHFT)+PGSHFT))-1;
	pa = (*pte & ~mask) + (va & mask);

	DBG("mmuphysaddr: l %d va %#p pa %#llux\n", l, va, pa);

	return pa;
}

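/*
 * Per-processor MMU initialisation.  CPU0 takes over the page
 * tables set up by the bootstrap, establishes the PDMAP and
 * KSEG1 self-referencing maps, and sets the kernel memory
 * limits in sys; the other processors copy CPU0's PML4 (the
 * kludge noted below) and switch to it.  Both paths enable
 * no-execute support via Efer.
 */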
void
mmuinit(void)
{
	int l;
	uchar *p;
	PTE *pte;
	Page *page;
	uintptr pml4;
	u64int o, pa, r, sz;

	archmmu();
	DBG("mach%d: %#p npgsz %d\n", m->machno, m, m->npgsz);
	if(m->machno != 0){
		/*
		 * GAK: Has to go when each mach is using
		 * its own page table
		 */
		p = UINT2PTR(m->stack);
		p += MACHSTKSZ;
		memmove(p, UINT2PTR(mach0pml4.va), PTSZ);
		m->pml4 = &m->pml4kludge;
		m->pml4->va = PTR2UINT(p);
		m->pml4->pa = PADDR(p);
		m->pml4->daddr = mach0pml4.daddr;	/* # of user mappings in pml4 */
		if(m->pml4->daddr){
			memset(p, 0, m->pml4->daddr*sizeof(PTE));
			m->pml4->daddr = 0;
		}
		pte = (PTE*)p;
		pte[PTLX(KSEG1PML4, 3)] = m->pml4->pa|PteRW|PteP;

		r = rdmsr(Efer);
		r |= Nxe;
		wrmsr(Efer, r);
		cr3put(m->pml4->pa);
		DBG("mach%d: %#p pml4 %#p\n", m->machno, m, m->pml4);
		return;
	}

	page = &mach0pml4;
	page->pa = cr3get();
	page->va = PTR2UINT(sys->pml4);

	m->pml4 = page;

	r = rdmsr(Efer);
	r |= Nxe;
	wrmsr(Efer, r);

	/*
	 * Set up the various kernel memory allocator limits:
	 * pmstart/pmend bound the unused physical memory;
	 * vmstart/vmend bound the total possible virtual memory
	 * used by the kernel;
	 * vmunused is the highest virtual address currently mapped
	 * and used by the kernel;
	 * vmunmapped is the highest virtual address currently
	 * mapped by the kernel.
	 * Vmunused can be bumped up to vmunmapped before more
	 * physical memory needs to be allocated and mapped.
	 *
	 * This is set up here so meminit can map appropriately.
	 */
	o = sys->pmstart;
	sz = ROUNDUP(o, 4*MiB) - o;
	pa = asmalloc(0, sz, 1, 0);
	if(pa != o)
		panic("mmuinit: pa %#llux memstart %#llux\n", pa, o);
	sys->pmstart += sz;

	sys->vmstart = KSEG0;
	sys->vmunused = sys->vmstart + ROUNDUP(o, 4*KiB);
	sys->vmunmapped = sys->vmstart + o + sz;
	sys->vmend = sys->vmstart + TMFM;

	print("mmuinit: vmstart %#p vmunused %#p vmunmapped %#p vmend %#p\n",
		sys->vmstart, sys->vmunused, sys->vmunmapped, sys->vmend);

	/*
	 * Set up the map for PD entry access by inserting
	 * the relevant PDP entry into the PD. It's equivalent
	 * to PADDR(sys->pd)|PteRW|PteP.
	 *
	 * Change code that uses this to use the KSEG1PML4
	 * map below.
	 */
	sys->pd[PDX(PDMAP)] = sys->pdp[PDPX(PDMAP)] & ~(PteD|PteA);
	print("sys->pd %#p %#p\n", sys->pd[PDX(PDMAP)], sys->pdp[PDPX(PDMAP)]);

	assert((pdeget(PDMAP) & ~(PteD|PteA)) == (PADDR(sys->pd)|PteRW|PteP));

	/*
	 * Set up the map for PTE access by inserting
	 * the relevant PML4 into itself.
	 * Note: outwith level 0, PteG is MBZ on AMD processors,
	 * is 'Reserved' on Intel processors, and the behaviour
	 * can be different.
	 */
	pml4 = cr3get();
	sys->pml4[PTLX(KSEG1PML4, 3)] = pml4|PteRW|PteP;
	cr3put(m->pml4->pa);

	if((l = mmuwalk(KZERO, 3, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);
	if((l = mmuwalk(KZERO, 2, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);
	if((l = mmuwalk(KZERO, 1, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);
	if((l = mmuwalk(KZERO, 0, &pte, nil)) >= 0)
		print("l %d %#p %llux\n", l, pte, *pte);

	mmuphysaddr(PTR2UINT(end));
}

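/*
 * Set every per-processor cachectl entry of a resident page
 * to why, recording the requested cache operation.
 */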
void
mmucachectl(Page *p, uint why)
{
	if(!pagedout(p))
		memset(p->cachectl, why, sizeof(p->cachectl));
}

/*
 * Double-check the user MMU.
 * Error checking only.
 */
void
checkmmu(uintptr va, uintmem pa)
{
	uintmem mpa;

	mpa = mmuphysaddr(va);
	if(mpa != ~(uintmem)0 && mpa != pa)
		print("%d %s: va=%#p pa=%#P mmupa=%#P\n",
			up->pid, up->text, va, pa, mpa);
}