/*
 * Copyright (c) 2006 The DragonFly Project.  All rights reserved.
 *
 * This code is derived from software contributed to The DragonFly Project
 * by Matthew Dillon <dillon@backplane.com>
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 *
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in
 *    the documentation and/or other materials provided with the
 *    distribution.
 * 3. Neither the name of The DragonFly Project nor the names of its
 *    contributors may be used to endorse or promote products derived
 *    from this software without specific, prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS
 * FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL THE
 * COPYRIGHT HOLDERS OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
 * INCIDENTAL, SPECIAL, EXEMPLARY OR CONSEQUENTIAL DAMAGES (INCLUDING,
 * BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
 * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED
 * AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
 * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
 * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 *
 * $DragonFly: src/sys/vm/vm_vmspace.c,v 1.11 2007/04/29 18:25:41 dillon Exp $
 */

#include "opt_ddb.h"

#include <sys/param.h>
#include <sys/kernel.h>
#include <sys/systm.h>
#include <sys/sysproto.h>
#include <sys/kern_syscall.h>
#include <sys/mman.h>
#include <sys/proc.h>
#include <sys/malloc.h>
#include <sys/sysctl.h>
#include <sys/vkernel.h>
#include <sys/vmspace.h>

#include <vm/vm_extern.h>
#include <vm/pmap.h>
#include <ddb/ddb.h>

#include <machine/vmparam.h>

#include <sys/spinlock2.h>
#include <sys/sysref2.h>

static struct vmspace_entry *vkernel_find_vmspace(struct vkernel_common *vc,
                                                  void *id);
static void vmspace_entry_delete(struct vmspace_entry *ve,
                                 struct vkernel_common *vc);

static MALLOC_DEFINE(M_VKERNEL, "vkernel", "VKernel structures");

/*
 * vmspace_create (void *id, int type, void *data)
 *
 * Create a VMSPACE under the control of the caller with the specified id.
 * An id of NULL cannot be used.  The type and data fields must currently
 * be 0.
 *
 * The vmspace starts out completely empty.  Memory may be mapped into the
 * VMSPACE with vmspace_mmap() and MAP_VPAGETABLE section(s) controlled
 * with vmspace_mcontrol().
 */
int
sys_vmspace_create(struct vmspace_create_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;

        if (vkernel_enable == 0)
                return (EOPNOTSUPP);

        /*
         * Create a virtual kernel side-structure for the process if one
         * does not exist.
         */
        if ((vk = curproc->p_vkernel) == NULL) {
                vk = kmalloc(sizeof(*vk), M_VKERNEL, M_WAITOK|M_ZERO);
                vc = kmalloc(sizeof(*vc), M_VKERNEL, M_WAITOK|M_ZERO);
                vc->vc_refs = 1;
                spin_init(&vc->vc_spin);
                RB_INIT(&vc->vc_root);
                vk->vk_common = vc;
                curproc->p_vkernel = vk;
        }
        vc = vk->vk_common;

        /*
         * Create a new VMSPACE
         */
        if (vkernel_find_vmspace(vc, uap->id))
                return (EEXIST);
        ve = kmalloc(sizeof(struct vmspace_entry), M_VKERNEL, M_WAITOK|M_ZERO);
        ve->vmspace = vmspace_alloc(VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
        ve->id = uap->id;
        pmap_pinit2(vmspace_pmap(ve->vmspace));
        RB_INSERT(vmspace_rb_tree, &vc->vc_root, ve);
        return (0);
}
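
/*
 * Illustrative userland usage of vmspace_create() (a sketch based on the
 * interface comment above, not taken from this file; the id object and
 * error handling are hypothetical).  Any unique non-NULL pointer serves
 * as the id, and type/data must currently be 0:
 *
 *      void *id = &emulated_proc_desc;         -- hypothetical object
 *
 *      if (vmspace_create(id, 0, NULL) < 0)
 *              err(1, "vmspace_create");
 */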

/*
 * vmspace_destroy (void *id)
 *
 * Destroy a VMSPACE.
 */
int
sys_vmspace_destroy(struct vmspace_destroy_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);
        if (ve->refs)
                return (EBUSY);
        vmspace_entry_delete(ve, vc);
        return (0);
}

/*
 * vmspace_ctl (void *id, int cmd, struct trapframe *tframe,
 *              struct vextframe *vframe);
 *
 * Transfer control to a VMSPACE.  Control is returned after the specified
 * number of microseconds or if a page fault, signal, trap, or system call
 * occurs.  The context is updated as appropriate.
 */
int
sys_vmspace_ctl(struct vmspace_ctl_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;
        struct proc *p;
        int framesz;
        int error;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);

        /*
         * Signal mailbox interlock
         */
        if (curproc->p_flag & P_MAILBOX) {
                curproc->p_flag &= ~P_MAILBOX;
                return (EINTR);
        }

        switch(uap->cmd) {
        case VMSPACE_CTL_RUN:
                /*
                 * Save the caller's register context, swap VM spaces, and
                 * install the passed register context.  Return with
                 * EJUSTRETURN so the syscall code doesn't adjust the context.
                 */
                p = curproc;
                ++ve->refs;
                framesz = sizeof(struct trapframe);
                vk->vk_current = ve;
                vk->vk_save_vmspace = p->p_vmspace;
                vk->vk_user_trapframe = uap->tframe;
                vk->vk_user_vextframe = uap->vframe;
                bcopy(uap->sysmsg_frame, &vk->vk_save_trapframe, framesz);
                bcopy(&curthread->td_tls, &vk->vk_save_vextframe.vx_tls,
                      sizeof(vk->vk_save_vextframe.vx_tls));
                error = copyin(uap->tframe, uap->sysmsg_frame, framesz);
                if (error == 0)
                        error = copyin(&uap->vframe->vx_tls,
                                       &curthread->td_tls,
                                       sizeof(struct savetls));
                if (error == 0)
                        error = cpu_sanitize_frame(uap->sysmsg_frame);
                if (error == 0)
                        error = cpu_sanitize_tls(&curthread->td_tls);
                if (error) {
                        bcopy(&vk->vk_save_trapframe, uap->sysmsg_frame,
                              framesz);
                        bcopy(&vk->vk_save_vextframe.vx_tls,
                              &curthread->td_tls,
                              sizeof(vk->vk_save_vextframe.vx_tls));
                        set_user_TLS();
                        vk->vk_current = NULL;
                        vk->vk_save_vmspace = NULL;
                        --ve->refs;
                } else {
                        pmap_replacevm(p, ve->vmspace, 0);
                        set_user_TLS();
                        set_vkernel_fp(uap->sysmsg_frame);
                        error = EJUSTRETURN;
                }
                break;
        default:
                error = EOPNOTSUPP;
                break;
        }
        return (error);
}
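
/*
 * Illustrative vkernel-side call of vmspace_ctl() (a sketch, not taken
 * from this file).  The vkernel fills a trapframe/vextframe pair with the
 * emulated context and runs it; the call returns when the emulated
 * context page faults, traps, takes a signal, or makes a system call:
 *
 *      struct trapframe tf;    -- loaded with emulated register state
 *      struct vextframe vf;    -- loaded with emulated TLS state
 *
 *      vmspace_ctl(id, VMSPACE_CTL_RUN, &tf, &vf);
 */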

/*
 * vmspace_mmap(id, addr, len, prot, flags, fd, offset)
 *
 * Map memory within a VMSPACE.  This function is just like a normal mmap()
 * but operates on the vmspace's memory map.  Most callers use this to create
 * a MAP_VPAGETABLE mapping.
 */
int
sys_vmspace_mmap(struct vmspace_mmap_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;
        int error;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);
        error = kern_mmap(ve->vmspace, uap->addr, uap->len,
                          uap->prot, uap->flags,
                          uap->fd, uap->offset, &uap->sysmsg_resultp);
        return (error);
}

/*
 * vmspace_munmap(id, addr, len)
 *
 * Unmap memory within a VMSPACE.
 */
int
sys_vmspace_munmap(struct vmspace_munmap_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;
        vm_offset_t addr;
        vm_size_t size, pageoff;
        vm_map_t map;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);

        /*
         * Copied from sys_munmap()
         */
        addr = (vm_offset_t)uap->addr;
        size = uap->len;

        pageoff = (addr & PAGE_MASK);
        addr -= pageoff;
        size += pageoff;
        size = (vm_size_t)round_page(size);
        if (addr + size < addr)
                return (EINVAL);
        if (size == 0)
                return (0);

        if (VM_MAX_USER_ADDRESS > 0 && addr + size > VM_MAX_USER_ADDRESS)
                return (EINVAL);
        if (VM_MIN_USER_ADDRESS > 0 && addr < VM_MIN_USER_ADDRESS)
                return (EINVAL);
        map = &ve->vmspace->vm_map;
        if (!vm_map_check_protection(map, addr, addr + size, VM_PROT_NONE))
                return (EINVAL);
        vm_map_remove(map, addr, addr + size);
        return (0);
}
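
/*
 * Illustrative pairing of vmspace_mmap()/vmspace_munmap() (a sketch;
 * mem_fd, base, and size are hypothetical).  A vkernel would typically
 * map its memory image with MAP_VPAGETABLE so that page translations for
 * the emulated space can be controlled in software:
 *
 *      vmspace_mmap(id, base, size, PROT_READ|PROT_WRITE|PROT_EXEC,
 *                   MAP_VPAGETABLE|MAP_SHARED, mem_fd, 0);
 *      ...
 *      vmspace_munmap(id, base, size);
 */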

/*
 * vmspace_pread(id, buf, nbyte, flags, offset)
 *
 * Read data from a vmspace.  The number of bytes read is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes read is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (Not yet implemented; currently returns EINVAL after validating the id.)
 */
int
sys_vmspace_pread(struct vmspace_pread_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);
        return (EINVAL);
}

/*
 * vmspace_pwrite(id, buf, nbyte, flags, offset)
 *
 * Write data to a vmspace.  The number of bytes written is returned or
 * -1 if an unrecoverable error occurred.  If the number of bytes written is
 * less than the request size, a page fault occurred in the VMSPACE which
 * the caller must resolve in order to proceed.
 *
 * (Not yet implemented; currently returns EINVAL after validating the id.)
 */
int
sys_vmspace_pwrite(struct vmspace_pwrite_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);
        return (EINVAL);
}

/*
 * vmspace_mcontrol(id, addr, len, behav, value)
 *
 * madvise/mcontrol support for a vmspace.
 */
int
sys_vmspace_mcontrol(struct vmspace_mcontrol_args *uap)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;
        vm_offset_t start, end;

        if ((vk = curproc->p_vkernel) == NULL)
                return (EINVAL);
        vc = vk->vk_common;
        if ((ve = vkernel_find_vmspace(vc, uap->id)) == NULL)
                return (ENOENT);

        /*
         * This code is basically copied from sys_mcontrol()
         */
        if (uap->behav < 0 || uap->behav > MADV_CONTROL_END)
                return (EINVAL);

        if (VM_MAX_USER_ADDRESS > 0 &&
            ((vm_offset_t)uap->addr + uap->len) > VM_MAX_USER_ADDRESS)
                return (EINVAL);
        if (VM_MIN_USER_ADDRESS > 0 && uap->addr < VM_MIN_USER_ADDRESS)
                return (EINVAL);
        if (((vm_offset_t)uap->addr + uap->len) < (vm_offset_t)uap->addr)
                return (EINVAL);

        start = trunc_page((vm_offset_t)uap->addr);
        end = round_page((vm_offset_t)uap->addr + uap->len);

        return (vm_map_madvise(&ve->vmspace->vm_map, start, end,
                               uap->behav, uap->value));
}
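
/*
 * Illustrative use of vmspace_mcontrol() (a sketch; assumes the
 * MADV_SETMAP behavior value and a hypothetical pte_base).  This is how a
 * MAP_VPAGETABLE section would be pointed at the software page table
 * mentioned in the vmspace_create() comment:
 *
 *      vmspace_mcontrol(id, base, size, MADV_SETMAP, pte_base);
 */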

/*
 * Red black tree functions
 */
static int rb_vmspace_compare(struct vmspace_entry *, struct vmspace_entry *);
RB_GENERATE(vmspace_rb_tree, vmspace_entry, rb_entry, rb_vmspace_compare);

/* a->id is the address; it is the only field that has to be initialized */
static int
rb_vmspace_compare(struct vmspace_entry *a, struct vmspace_entry *b)
{
        if ((char *)a->id < (char *)b->id)
                return (-1);
        else if ((char *)a->id > (char *)b->id)
                return (1);
        return (0);
}

static
int
rb_vmspace_delete(struct vmspace_entry *ve, void *data)
{
        struct vkernel_common *vc = data;

        KKASSERT(ve->refs == 0);
        vmspace_entry_delete(ve, vc);
        return (0);
}

/*
 * Remove a vmspace_entry from the RB tree and destroy it.  We have to clean
 * up the pmap, the vm_map, then destroy the vmspace.
 */
static
void
vmspace_entry_delete(struct vmspace_entry *ve, struct vkernel_common *vc)
{
        RB_REMOVE(vmspace_rb_tree, &vc->vc_root, ve);

        pmap_remove_pages(vmspace_pmap(ve->vmspace),
                          VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
        vm_map_remove(&ve->vmspace->vm_map,
                      VM_MIN_USER_ADDRESS, VM_MAX_USER_ADDRESS);
        sysref_put(&ve->vmspace->vm_sysref);
        kfree(ve, M_VKERNEL);
}

/*
 * Locate a vmspace_entry by id.  Returns NULL if no entry exists.
 */
static
struct vmspace_entry *
vkernel_find_vmspace(struct vkernel_common *vc, void *id)
{
        struct vmspace_entry *ve;
        struct vmspace_entry key;

        key.id = id;
        ve = RB_FIND(vmspace_rb_tree, &vc->vc_root, &key);
        return (ve);
}

/*
 * Manage vkernel refs, used by the kernel when fork()ing or exit()ing
 * a vkernel process.
 */
void
vkernel_inherit(struct proc *p1, struct proc *p2)
{
        struct vkernel_common *vc;
        struct vkernel *vk;

        vk = p1->p_vkernel;
        vc = vk->vk_common;
        KKASSERT(vc->vc_refs > 0);
        atomic_add_int(&vc->vc_refs, 1);
        vk = kmalloc(sizeof(*vk), M_VKERNEL, M_WAITOK|M_ZERO);
        p2->p_vkernel = vk;
        vk->vk_common = vc;
}

void
vkernel_exit(struct proc *p)
{
        struct vkernel_common *vc;
        struct vmspace_entry *ve;
        struct vkernel *vk;
        int freeme = 0;

        vk = p->p_vkernel;
        p->p_vkernel = NULL;
        vc = vk->vk_common;
        vk->vk_common = NULL;

        /*
         * Restore the original VM context if we are killed while running
         * a different one.
         *
         * This isn't supposed to happen.  What is supposed to happen is
         * that the process should enter vkernel_trap() before handling
         * the signal.
         */
        if ((ve = vk->vk_current) != NULL) {
                kprintf("Killed with active VC, notify kernel list\n");
#ifdef DDB
                db_print_backtrace();
#endif
                vk->vk_current = NULL;
                pmap_replacevm(p, vk->vk_save_vmspace, 0);
                vk->vk_save_vmspace = NULL;
                KKASSERT(ve->refs > 0);
                --ve->refs;
        }

        /*
         * Dereference the common area
         */
        KKASSERT(vc->vc_refs > 0);
        spin_lock_wr(&vc->vc_spin);
        if (--vc->vc_refs == 0)
                freeme = 1;
        spin_unlock_wr(&vc->vc_spin);

        if (freeme) {
                RB_SCAN(vmspace_rb_tree, &vc->vc_root, NULL,
                        rb_vmspace_delete, vc);
                kfree(vc, M_VKERNEL);
        }
        kfree(vk, M_VKERNEL);
}
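
/*
 * Illustrative lifecycle of these structures (a sketch derived from the
 * functions above and below, not normative).  Each process carries its
 * own struct vkernel, while vk_common is shared so every process of a
 * vkernel sees the same set of emulated VM spaces:
 *
 *      fork():  vkernel_inherit()  -- child shares vk_common, vc_refs++
 *      exit():  vkernel_exit()     -- vc_refs--; the last exiting process
 *                                     reaps every remaining vmspace_entry
 *      trap:    vkernel_trap()     -- returns control to the vkernel
 */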

/*
 * A VM space under virtual kernel control trapped out or made a system call
 * or otherwise needs to return control to the virtual kernel context.
 */
int
vkernel_trap(struct proc *p, struct trapframe *frame)
{
        struct vmspace_entry *ve;
        struct vkernel *vk;
        int error;

        /*
         * Which vmspace entry was running?
         */
        vk = p->p_vkernel;
        ve = vk->vk_current;
        vk->vk_current = NULL;
        KKASSERT(ve != NULL);

        /*
         * Switch the process context back to the virtual kernel's VM space.
         */
        pmap_replacevm(p, vk->vk_save_vmspace, 0);
        vk->vk_save_vmspace = NULL;
        KKASSERT(ve->refs > 0);
        --ve->refs;

        /*
         * Copy the emulated process frame to the virtual kernel process.
         * The emulated process cannot change TLS descriptors so don't
         * bother saving them, we already have a copy.
         *
         * Restore the virtual kernel's saved context so the virtual kernel
         * process can resume.
         */
        error = copyout(frame, vk->vk_user_trapframe, sizeof(*frame));
        bcopy(&vk->vk_save_trapframe, frame, sizeof(*frame));
        bcopy(&vk->vk_save_vextframe.vx_tls, &curthread->td_tls,
              sizeof(vk->vk_save_vextframe.vx_tls));
        set_user_TLS();
        return (error);
}
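
/*
 * Illustrative return path (a sketch, not from this file): once
 * vkernel_trap() has run, the vkernel's original vmspace_ctl() call
 * returns in the vkernel's own context with the emulated register state
 * copied back into the tframe it supplied, and the vkernel dispatches on
 * why control returned (page fault, signal, trap, or system call).
 */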