/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/errno.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <sys/fs/swapnode.h>
#include <sys/sysmacros.h>
#include <sys/fcntl.h>
#include <sys/vmsystm.h>
#include <sys/physmem.h>

/* dev_info for the single physmem pseudo-device instance; set in attach. */
static dev_info_t *physmem_dip = NULL;

/*
 * Linked list element hanging off physmem_proc_hash below, which holds all
 * the information for a given segment which has been setup for this process.
 * This is a simple linked list as we are assuming that for a given process
 * the setup ioctl will only be called a handful of times.  If this assumption
 * changes in the future, a quicker to traverse data structure should be used.
 */
struct physmem_hash {
	struct physmem_hash *ph_next;	/* next segment for this process */
	uint64_t ph_base_pa;		/* physical base of the segment */
	caddr_t ph_base_va;		/* user virtual base of the segment */
	size_t ph_seg_len;		/* length of the segment in bytes */
	struct vnode *ph_vnode;		/* vnode backing this segment */
};

/*
 * Hash of all of the processes which have setup mappings with the driver with
 * pointers to per process data.
 */
struct physmem_proc_hash {
	struct proc *pph_proc;			/* process owning the segments */
	struct physmem_hash *pph_hash;		/* list of this proc's segments */
	struct physmem_proc_hash *pph_next;	/* next bucket chain entry */
};


/* Needs to be a power of two for simple hash algorithm */
#define	PPH_SIZE	8
struct physmem_proc_hash *pph[PPH_SIZE];

/*
 * Lock which protects the pph hash above.  To add an element (either a new
 * process or a new segment) the WRITE lock must be held.  To traverse the
 * list, only a READ lock is needed.
 */
krwlock_t pph_rwlock;

/* Hash a proc pointer into a pph bucket index (PPH_SIZE is a power of two). */
#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))

/*
 * Need to keep a reference count of how many processes have the driver
 * open to prevent it from disappearing.
 */
uint64_t physmem_vnodecnt;
kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */

static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred);

static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred);

static void physmem_inactive(vnode_t *vp, cred_t *crp);

/* vnode operations implemented by physmem vnodes; all others default. */
const fs_operation_def_t physmem_vnodeops_template[] = {
	VOPNAME_GETPAGE, physmem_getpage,
	VOPNAME_ADDMAP, (fs_generic_func_p) physmem_addmap,
	VOPNAME_DELMAP, physmem_delmap,
	VOPNAME_INACTIVE, (fs_generic_func_p) physmem_inactive,
	NULL, NULL
};

vnodeops_t *physmem_vnodeops = NULL;

/*
 * Removes the current process from the hash if the process has no more
 * physmem segments active.  Only an entry whose segment list (pph_hash)
 * is already empty is unlinked; the entry is freed after dropping the
 * write lock.
 */
void
physmem_remove_hash_proc()
{
	int index;
	struct physmem_proc_hash **walker;
	struct physmem_proc_hash *victim = NULL;

	index = PHYSMEM_HASH(curproc);
	rw_enter(&pph_rwlock, RW_WRITER);
	walker = &pph[index];
	while (*walker != NULL) {
		if ((*walker)->pph_proc == curproc &&
		    (*walker)->pph_hash == NULL) {
			victim = *walker;
			*walker = victim->pph_next;
			break;
		}
		walker = &((*walker)->pph_next);
	}
	rw_exit(&pph_rwlock);
	if (victim != NULL)
		kmem_free(victim, sizeof (struct physmem_proc_hash));
}

/*
 * Add a new entry to the hash for the given process to cache the
 * address ranges that it is working on.  If this is the first hash
 * item to be added for this process, we will create the head pointer
 * for this process.
 * Returns 0 on success, ERANGE when the physical address is already in the
 * hash.  Note that we add it to the hash as we have already called as_map
 * and thus the as_unmap call will try to free the vnode, which needs
 * to be found in the hash.
 */
int
physmem_add_hash(struct physmem_hash *php)
{
	int index;
	struct physmem_proc_hash *iterator;
	struct physmem_proc_hash *newp = NULL;
	struct physmem_hash *temp;
	int ret = 0;

	index = PHYSMEM_HASH(curproc);

insert:
	rw_enter(&pph_rwlock, RW_WRITER);
	iterator = pph[index];
	while (iterator != NULL) {
		if (iterator->pph_proc == curproc) {
			/*
			 * check to make sure a single process does not try to
			 * map the same region twice.
			 */
			for (temp = iterator->pph_hash; temp != NULL;
			    temp = temp->ph_next) {
				if ((php->ph_base_pa >= temp->ph_base_pa &&
				    php->ph_base_pa < temp->ph_base_pa +
				    temp->ph_seg_len) ||
				    (temp->ph_base_pa >= php->ph_base_pa &&
				    temp->ph_base_pa < php->ph_base_pa +
				    php->ph_seg_len)) {
					ret = ERANGE;
					break;
				}
			}
			if (ret == 0) {
				/* No overlap; link onto the proc's list. */
				php->ph_next = iterator->pph_hash;
				iterator->pph_hash = php;
			}
			rw_exit(&pph_rwlock);
			/*
			 * Need to check for two threads in sync: another
			 * thread may have created this proc's entry while
			 * we had dropped the lock to allocate newp below.
			 */
			if (newp != NULL)
				kmem_free(newp, sizeof (*newp));
			return (ret);
		}
		iterator = iterator->pph_next;
	}

	/* No entry for this proc yet; use the one allocated on the retry. */
	if (newp != NULL) {
		newp->pph_proc = curproc;
		newp->pph_next = pph[index];
		newp->pph_hash = php;
		php->ph_next = NULL;
		pph[index] = newp;
		rw_exit(&pph_rwlock);
		return (0);
	}

	rw_exit(&pph_rwlock);
	/* Dropped the lock so we could use KM_SLEEP */
	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
	goto insert;
}

/*
 * Will return the pointer to the physmem_hash struct if the setup routine
 * has previously been called for this memory.
 * Returns NULL on failure.
 * Caller must hold pph_rwlock (either mode).
 */
struct physmem_hash *
physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash *php;

	ASSERT(rw_lock_held(&pph_rwlock));

	index = PHYSMEM_HASH(procp);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == procp) {
			php = proc_hp->pph_hash;
			while (php != NULL) {
				/* [req_paddr, req_paddr + len) inside seg? */
				if ((req_paddr >= php->ph_base_pa) &&
				    (req_paddr + len <=
				    php->ph_base_pa + php->ph_seg_len)) {
					return (php);
				}
				php = php->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	return (NULL);
}

/*
 * Returns 1 if p_cookie is the address of a physmem_hash belonging to the
 * current process, 0 otherwise.
 * Caller must hold pph_rwlock (either mode).
 */
int
physmem_validate_cookie(uint64_t p_cookie)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash *php;

	ASSERT(rw_lock_held(&pph_rwlock));

	index = PHYSMEM_HASH(curproc);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			php = proc_hp->pph_hash;
			while (php != NULL) {
				if ((uint64_t)(uintptr_t)php == p_cookie) {
					return (1);
				}
				php = php->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	return (0);
}

/*
 * Remove the given vnode from the pph hash.  If it exists in the hash the
 * process still has to be around as the vnode is obviously still around and
 * since it's a physmem vnode, it must be in the hash.
 * If it is not in the hash that must mean that the setup ioctl failed.
 * Return 0 in this instance, 1 if it is in the hash.
 */
int
physmem_remove_vnode_hash(vnode_t *vp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash **phpp;
	struct physmem_hash *victim;

	index = PHYSMEM_HASH(curproc);
	/* synchronize with the map routine */
	rw_enter(&pph_rwlock, RW_WRITER);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			phpp = &proc_hp->pph_hash;
			while (*phpp != NULL) {
				if ((*phpp)->ph_vnode == vp) {
					victim = *phpp;
					*phpp = victim->ph_next;

					rw_exit(&pph_rwlock);
					kmem_free(victim, sizeof (*victim));
					return (1);
				}
				phpp = &(*phpp)->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	rw_exit(&pph_rwlock);

	/* not found */
	return (0);
}

/*
 * Create the physmem vnodeops from the template above.  Called once from
 * attach; panics if called when the ops have already been created.
 * Returns 0 on success, otherwise the vn_make_ops error.
 */
int
physmem_setup_vnops()
{
	int error;
	char *name = "physmem";
	if (physmem_vnodeops != NULL)
		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
	if (error != 0) {
		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
	}
	return (error);
}

/*
 * The guts of the PHYSMEM_SETUP ioctl.
 * Create a segment in the address space with the specified parameters.
 * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
 * We do not do bounds checking on the requested physical addresses, if they
 * do not exist in the system, they will not be mappable.
 * Returns 0 on success with the following error codes on failure:
 *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
 *		non-NULL or the system was unable to find enough VA space for
 *		the desired length if user_va was NULL.
 *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
 */
int
physmem_setup_addrs(struct physmem_setup_param *pspp)
{
	struct as *as = curproc->p_as;
	struct segvn_crargs vn_a;
	int ret = 0;
	uint64_t base_pa;
	size_t len;
	caddr_t uvaddr;
	struct vnode *vp;
	struct physmem_hash *php;

	ASSERT(pspp != NULL);
	base_pa = pspp->req_paddr;
	len = pspp->len;
	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;

	/* Sanity checking */
	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
		return (EINVAL);
	if (!IS_P2ALIGNED(len, PAGESIZE))
		return (EINVAL);
	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
		return (EINVAL);

	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);

	/* Need to bump vnode count so that the driver can not be unloaded */
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt++;
	mutex_exit(&physmem_mutex);

	vp = vn_alloc(KM_SLEEP);
	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
	vn_setops(vp, physmem_vnodeops);

	php->ph_vnode = vp;

	/* Segment offset is the physical address; segvn maps PA 1:1. */
	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)base_pa;
	vn_a.type = MAP_SHARED;
	vn_a.prot = PROT_ALL;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = 0;
	vn_a.cred = NULL;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	as_rangelock(as);
	if (uvaddr != NULL) {
		/* User supplied a VA; fail if that range is not free. */
		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
			ret = ENOMEM;
fail:
			/* Common failure path: undo alloc and refcount bump. */
			as_rangeunlock(as);
			vn_free(vp);
			kmem_free(php, sizeof (*php));
			mutex_enter(&physmem_mutex);
			physmem_vnodecnt--;
			mutex_exit(&physmem_mutex);
			return (ret);
		}
	} else {
		/* We pick the address for the user */
		map_addr(&uvaddr, len, 0, 1, 0);
		if (uvaddr == NULL) {
			ret = ENOMEM;
			goto fail;
		}
	}
	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	if (ret == 0) {
		php->ph_base_pa = base_pa;
		php->ph_base_va = uvaddr;
		php->ph_seg_len = len;
		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
		pspp->cookie = (uint64_t)(uintptr_t)php;
		ret = physmem_add_hash(php);
		if (ret == 0)
			return (0);
		/*
		 * Overlapping range; unmapping triggers physmem_inactive
		 * which cleans up the vnode and refcount for us.
		 */
		(void) as_unmap(as, uvaddr, len);
		return (ret);
	}

	goto fail;
	/*NOTREACHED*/
}

/*
 * The guts of the PHYSMEM_MAP ioctl.
 * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
 * been called for this PA range.
 * Returns 0 on success with the following error codes on failure:
 *	EPERM - The requested page is long term locked, and thus repeated
 *		requests to allocate this page will likely fail.
 *	EAGAIN - The requested page could not be allocated, but it is believed
 *		that future attempts could succeed.
 *	ENOMEM - There was not enough free memory in the system to safely
 *		map the requested page.
 *	EINVAL - The requested paddr was not PAGESIZE aligned or the
 *		PHYSMEM_SETUP ioctl was not called for this page.
 *	ENOENT - The requested page was inside the kernel cage, and the
 *		PHYSMEM_CAGE flag was not set.
 *	EBUSY - The requested page is retired and the PHYSMEM_RETIRED flag
 *		was not set.
 */
static int
physmem_map_addrs(struct physmem_map_param *pmpp)
{
	caddr_t uvaddr;
	page_t *pp;
	uint64_t req_paddr;
	struct vnode *vp;
	int ret = 0;
	struct physmem_hash *php;
	uint_t flags = 0;

	ASSERT(pmpp != NULL);
	req_paddr = pmpp->req_paddr;

	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
		return (EINVAL);
	/* Find the vnode for this map request */
	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}
	vp = php->ph_vnode;
	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
	rw_exit(&pph_rwlock);

	pp = page_numtopp_nolock(btop((size_t)req_paddr));
	if (pp == NULL) {
		pmpp->ret_va = NULL;
		return (EPERM);
	}

	/*
	 * Check to see if page already mapped correctly.  This can happen
	 * when we failed to capture a page previously and it was captured
	 * asynchronously for us.  Return success in this case.
	 */
	if (pp->p_vnode == vp) {
		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}

	/*
	 * physmem should be responsible for checking for cage
	 * and prom pages.
	 */
	if (pmpp->flags & PHYSMEM_CAGE)
		flags = CAPTURE_GET_CAGE;
	if (pmpp->flags & PHYSMEM_RETIRED)
		flags |= CAPTURE_GET_RETIRED;

	/* On success the capture callback (map_page_proc) hashes pp in. */
	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);

	if (ret != 0) {
		pmpp->ret_va = NULL;
		return (ret);
	} else {
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}
}

/*
 * Map the given page into the process's address space if possible.
 * We actually only hash the page in on the correct vnode as the page
 * will be mapped via segvn_pagefault.
 * returns 0 on success
 * returns 1 if there is no need to map this page anymore (process exited)
 * returns -1 if we failed to map the page.
 */
int
map_page_proc(page_t *pp, void *arg, uint_t flags)
{
	struct vnode *vp;
	proc_t *procp = (proc_t *)arg;
	int ret;
	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
	struct physmem_hash *php;

	ASSERT(pp != NULL);

	/*
	 * Check against availrmem to make sure that we're not low on memory.
	 * We check again here as ASYNC requests do not do this check elsewhere.
	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
	 * set or be on the page capture hash.
	 */
	if (swapfs_minfree > availrmem + 1) {
		page_free(pp, 1);
		return (1);
	}

	/*
	 * If this is an asynchronous request for the current process,
	 * we can not map the page as it's possible that we are also in the
	 * process of unmapping the page which could result in a deadlock
	 * with the as lock.
	 */
	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
		page_free(pp, 1);
		return (-1);
	}

	/* only return zeroed out pages */
	pagezero(pp, 0, PAGESIZE);

	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(paddr, PAGESIZE, procp);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		/*
		 * Free the page as there is no longer a valid outstanding
		 * request for this page.
		 */
		page_free(pp, 1);
		return (1);
	}

	vp = php->ph_vnode;

	/*
	 * We need to protect against a possible deadlock here where we own
	 * the vnode page hash mutex and want to acquire it again as there
	 * are locations in the code, where we unlock a page while holding
	 * the mutex which can lead to the page being captured and eventually
	 * end up here.
	 */
	if (mutex_owned(page_vnode_mutex(vp))) {
		rw_exit(&pph_rwlock);
		page_free(pp, 1);
		return (-1);
	}

	ret = page_hashin(pp, vp, paddr, NULL);
	rw_exit(&pph_rwlock);
	if (ret == 0) {
		/* hashin failed; give the page back and report failure */
		page_free(pp, 1);
		return (-1);
	}

	/* Keep a shared lock; segvn_pagefault will find the page via getpage */
	page_downgrade(pp);

	/* Account for the page now being long-term held by this driver. */
	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	return (0);
}

/*
 * The guts of the PHYSMEM_DESTROY ioctl.
 * The cookie passed in will provide all of the information needed to
 * free up the address space and physical memory associated with the
 * corresponding PHYSMEM_SETUP ioctl.
 * Returns 0 on success with the following error codes on failure:
 *	EINVAL - The cookie supplied is not valid.
 */
int
physmem_destroy_addrs(uint64_t p_cookie)
{
	struct as *as = curproc->p_as;
	size_t len;
	caddr_t uvaddr;

	rw_enter(&pph_rwlock, RW_READER);
	if (physmem_validate_cookie(p_cookie) == 0) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}

	/* Cookie is the address of the physmem_hash, validated above. */
	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
	rw_exit(&pph_rwlock);

	/* Teardown continues via physmem_delmap/physmem_inactive. */
	(void) as_unmap(as, uvaddr, len);

	return (0);
}

/*
 * If the page has been hashed into the physmem vnode, then just look it up
 * and return it via pl, otherwise return ENOMEM as the map ioctl has not
 * succeeded on the given page.
 */
/*ARGSUSED*/
static int
physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
    struct cred *cr)
{
	page_t *pp;

	ASSERT(len == PAGESIZE);
	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If the page is in the hash, then we successfully claimed this
	 * page earlier, so return it to the caller.
	 */
	pp = page_lookup(vp, off, SE_SHARED);
	if (pp != NULL) {
		pl[0] = pp;
		pl[1] = NULL;
		*protp = PROT_ALL;
		return (0);
	}
	return (ENOMEM);
}

/*
 * We can not allow a process mapping /dev/physmem pages to fork as there can
 * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
 * return of EINVAL when we are not working on our own address space.
 * Otherwise we return zero as this function is required for normal operation.
 */
/*ARGSUSED*/
static int
physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred)
{
	if (curproc->p_as != as) {
		return (EINVAL);
	}
	return (0);
}

/* Will always get called for removing a whole segment. */
/*ARGSUSED*/
static int
physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred)
{
	/*
	 * Release our hold on the vnode so that the final VN_RELE will
	 * call physmem_inactive to clean things up.
	 */
	VN_RELE(vp);

	return (0);
}

/*
 * Clean up all the pages belonging to this vnode and then free it.
 */
/*ARGSUSED*/
static void
physmem_inactive(vnode_t *vp, cred_t *crp)
{
	page_t *pp;

	/*
	 * Remove the vnode from the hash now, to prevent asynchronous
	 * attempts to map into this vnode.  This avoids a deadlock
	 * where two threads try to get into this logic at the same
	 * time and try to map the pages they are destroying into the
	 * other's address space.
	 * If it's not in the hash, just free it.
	 */
	if (physmem_remove_vnode_hash(vp) == 0) {
		ASSERT(vp->v_pages == NULL);
		vn_free(vp);
		physmem_remove_hash_proc();
		mutex_enter(&physmem_mutex);
		physmem_vnodecnt--;
		mutex_exit(&physmem_mutex);
		return;
	}

	/*
	 * At this point in time, no other logic can be adding or removing
	 * pages from the vnode, otherwise the v_pages list could be
	 * inaccurate.
	 */

	while ((pp = vp->v_pages) != NULL) {
		page_t *rpp;
		if (page_tryupgrade(pp)) {
			/*
			 * set lckcnt for page_destroy to do availrmem
			 * accounting
			 */
			pp->p_lckcnt = 1;
			page_destroy(pp, 0);
		} else {
			/* failure to lock should be transient */
			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
			if (rpp != pp) {
				page_unlock(rpp);
				continue;
			}
			page_unlock(pp);
		}
	}
	vn_free(vp);
	physmem_remove_hash_proc();
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt--;
	mutex_exit(&physmem_mutex);
}

/*
 * Device ioctl entry point: dispatch PHYSMEM_SETUP/MAP/DESTROY requests,
 * copying the parameter struct in and (for SETUP/MAP) back out.
 */
/*ARGSUSED*/
static int
physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int ret;

	switch (cmd) {
	case PHYSMEM_SETUP:
		{
			struct physmem_setup_param psp;
			if (ddi_copyin((void *)arg, &psp,
			    sizeof (struct physmem_setup_param), 0))
				return (EFAULT);
			ret = physmem_setup_addrs(&psp);
			/* copyout returns user_va and cookie on success */
			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_MAP:
		{
			struct physmem_map_param pmp;
			if (ddi_copyin((void *)arg, &pmp,
			    sizeof (struct physmem_map_param), 0))
				return (EFAULT);
			ret = physmem_map_addrs(&pmp);
			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_DESTROY:
		{
			uint64_t cookie;
			if (ddi_copyin((void *)arg, &cookie,
			    sizeof (uint64_t), 0))
				return (EFAULT);
			ret = physmem_destroy_addrs(cookie);
		}
		break;
	default:
		return (ENOTSUP);
	}
	return (ret);
}

/*
 * Device open entry point: requires read/write access and both resource
 * and lock-memory privileges.  Logs a one-time warning about the DR
 * impact of long-term page locks.
 */
/*ARGSUSED*/
static int
physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	int ret;
	static int msg_printed = 0;

	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
		return (EINVAL);
	}

	/* need to make sure we have the right privileges */
	if ((ret = secpolicy_resource(credp)) != 0)
		return (ret);
	if ((ret = secpolicy_lock_memory(credp)) != 0)
		return (ret);

	if (msg_printed == 0) {
		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
		    "take out long term locks on pages which may impact "
		    "dynamic reconfiguration events");
		msg_printed = 1;
	}

	return (0);
}

/*ARGSUSED*/
static int
physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}

/* Standard DDI getinfo: map dev_t to dev_info / instance number. */
/*ARGSUSED*/
static int
physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
    void *arg, void **resultp)
{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*resultp = physmem_dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

/*
 * Attach: create the minor node, record the dev_info, create the vnodeops
 * and register with the page capture framework.
 */
static int
physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int i;

	if (cmd == DDI_RESUME) {
		return (DDI_SUCCESS);
	}

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
		return (DDI_FAILURE);

	physmem_dip = dip;

	/* Initialize driver specific data */
	if (physmem_setup_vnops()) {
		ddi_remove_minor_node(dip, ddi_get_name(dip));
		return (DDI_FAILURE);
	}

	for (i = 0; i < PPH_SIZE; i++)
		pph[i] = NULL;

	page_capture_register_callback(PC_PHYSMEM, 10000,
	    map_page_proc);

	return (DDI_SUCCESS);
}

/*
 * Detach: only allowed when no physmem vnodes are outstanding; tears down
 * the vnodeops, the page capture callback and the minor node.
 */
static int
physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int ret = DDI_SUCCESS;

	if (cmd == DDI_SUSPEND) {
		return (DDI_SUCCESS);
	}

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ASSERT(physmem_dip == dip);

	mutex_enter(&physmem_mutex);
	if (physmem_vnodecnt == 0) {
		if (physmem_vnodeops != NULL) {
			vn_freevnodeops(physmem_vnodeops);
			physmem_vnodeops = NULL;
			page_capture_unregister_callback(PC_PHYSMEM);
		}
	} else {
		/*
		 * NOTE(review): EBUSY is returned here rather than
		 * DDI_FAILURE; callers appear to only compare against
		 * DDI_SUCCESS, so any nonzero value rejects the detach --
		 * confirm before changing.
		 */
		ret = EBUSY;
	}
	mutex_exit(&physmem_mutex);
	if (ret == DDI_SUCCESS)
		ddi_remove_minor_node(dip, ddi_get_name(dip));
	return (ret);
}

static struct cb_ops physmem_cb_ops = {
	physmem_open,	/* open */
	physmem_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	physmem_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* chpoll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* cb_str */
	D_NEW | D_MP | D_DEVMAP,
	CB_REV,
	NULL,
	NULL
};

static struct dev_ops physmem_ops = {
	DEVO_REV,
	0,
	physmem_getinfo,
	nulldev,
	nulldev,
	physmem_attach,
	physmem_detach,
	nodev,
	&physmem_cb_ops,
	NULL,
	NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"physmem driver %I%",
	&physmem_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}