xref: /onnv-gate/usr/src/uts/common/io/physmem.c (revision 7656:2621e50fdf4a)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */


#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/errno.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <sys/fs/swapnode.h>
#include <sys/sysmacros.h>
#include <sys/fcntl.h>
#include <sys/vmsystm.h>
#include <sys/physmem.h>
#include <sys/vfs_opreg.h>

static dev_info_t		*physmem_dip = NULL;

/*
 * Linked list element hanging off physmem_proc_hash below, which holds all
 * the information for a given segment which has been set up for this process.
 * This is a simple linked list as we are assuming that for a given process
 * the setup ioctl will only be called a handful of times.  If this assumption
 * changes in the future, a data structure that is quicker to traverse should
 * be used.
 */
struct physmem_hash {
	struct physmem_hash *ph_next;
	uint64_t ph_base_pa;
	caddr_t ph_base_va;
	size_t ph_seg_len;
	struct vnode *ph_vnode;
};

/*
 * Hash of all of the processes which have set up mappings with the driver,
 * with pointers to per-process data.
 */
struct physmem_proc_hash {
	struct proc *pph_proc;
	struct physmem_hash *pph_hash;
	struct physmem_proc_hash *pph_next;
};


/* Needs to be a power of two for the simple hash algorithm */
#define	PPH_SIZE	8
struct physmem_proc_hash *pph[PPH_SIZE];

/*
 * Lock which protects the pph hash above.  To add an element (either a new
 * process or a new segment) the WRITE lock must be held.  To traverse the
 * list, only a READ lock is needed.
 */
krwlock_t pph_rwlock;

#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))

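/*
 * Illustrative example: a (hypothetical) proc_t at address 0x30001234500
 * hashes to ((0x30001234500 >> 8) & (PPH_SIZE - 1)) == 5.  The low 8 bits
 * are shifted out first, presumably because they carry little variation
 * between kmem-allocated proc structures.
 */
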
/*
 * Need to keep a reference count of how many processes have the driver
 * open to prevent it from disappearing.
 */
uint64_t physmem_vnodecnt;
kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */

static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ct);

static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct);

static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct);

static void physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct);

const fs_operation_def_t physmem_vnodeops_template[] = {
	VOPNAME_GETPAGE,	{ .vop_getpage = physmem_getpage },
	VOPNAME_ADDMAP,		{ .vop_addmap = physmem_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = physmem_delmap },
	VOPNAME_INACTIVE,	{ .vop_inactive = physmem_inactive },
	NULL,			NULL
};

vnodeops_t *physmem_vnodeops = NULL;

/*
 * Removes the current process from the hash if the process has no more
 * physmem segments active.
 */
void
physmem_remove_hash_proc()
{
	int index;
	struct physmem_proc_hash **walker;
	struct physmem_proc_hash *victim = NULL;

	index = PHYSMEM_HASH(curproc);
	rw_enter(&pph_rwlock, RW_WRITER);
	walker = &pph[index];
	while (*walker != NULL) {
		if ((*walker)->pph_proc == curproc &&
		    (*walker)->pph_hash == NULL) {
			victim = *walker;
			*walker = victim->pph_next;
			break;
		}
		walker = &((*walker)->pph_next);
	}
	rw_exit(&pph_rwlock);
	if (victim != NULL)
		kmem_free(victim, sizeof (struct physmem_proc_hash));
}

/*
 * Add a new entry to the hash for the given process to cache the
 * address ranges that it is working on.  If this is the first hash
 * item to be added for this process, we will create the head pointer
 * for this process.
 * Returns 0 on success, ERANGE when the physical address is already in the
 * hash.
 */
int
physmem_add_hash(struct physmem_hash *php)
{
	int index;
	struct physmem_proc_hash *iterator;
	struct physmem_proc_hash *newp = NULL;
	struct physmem_hash *temp;
	int ret = 0;

	index = PHYSMEM_HASH(curproc);

insert:
	rw_enter(&pph_rwlock, RW_WRITER);
	iterator = pph[index];
	while (iterator != NULL) {
		if (iterator->pph_proc == curproc) {
			/*
			 * Check to make sure a single process does not try to
			 * map the same region twice.
			 */
			for (temp = iterator->pph_hash; temp != NULL;
			    temp = temp->ph_next) {
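				/*
				 * Two [base, base + len) ranges overlap
				 * exactly when one range's base address
				 * falls inside the other range.
				 */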
				if ((php->ph_base_pa >= temp->ph_base_pa &&
				    php->ph_base_pa < temp->ph_base_pa +
				    temp->ph_seg_len) ||
				    (temp->ph_base_pa >= php->ph_base_pa &&
				    temp->ph_base_pa < php->ph_base_pa +
				    php->ph_seg_len)) {
					ret = ERANGE;
					break;
				}
			}
			if (ret == 0) {
				php->ph_next = iterator->pph_hash;
				iterator->pph_hash = php;
			}
			rw_exit(&pph_rwlock);
			/* Need to check for two threads in sync */
			if (newp != NULL)
				kmem_free(newp, sizeof (*newp));
			return (ret);
		}
		iterator = iterator->pph_next;
	}

	if (newp != NULL) {
		newp->pph_proc = curproc;
		newp->pph_next = pph[index];
		newp->pph_hash = php;
		php->ph_next = NULL;
		pph[index] = newp;
		rw_exit(&pph_rwlock);
		return (0);
	}

	rw_exit(&pph_rwlock);
	/* Dropped the lock so we could use KM_SLEEP */
	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
	goto insert;
}

/*
 * Will return the pointer to the physmem_hash struct if the setup routine
 * has previously been called for this memory.
 * Returns NULL on failure.
 */
struct physmem_hash *
physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash *php;

	ASSERT(rw_lock_held(&pph_rwlock));

	index = PHYSMEM_HASH(procp);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == procp) {
			php = proc_hp->pph_hash;
			while (php != NULL) {
				if ((req_paddr >= php->ph_base_pa) &&
				    (req_paddr + len <=
				    php->ph_base_pa + php->ph_seg_len)) {
					return (php);
				}
				php = php->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	return (NULL);
}

int
physmem_validate_cookie(uint64_t p_cookie)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash *php;

	ASSERT(rw_lock_held(&pph_rwlock));

	index = PHYSMEM_HASH(curproc);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			php = proc_hp->pph_hash;
			while (php != NULL) {
				if ((uint64_t)(uintptr_t)php == p_cookie) {
					return (1);
				}
				php = php->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	return (0);
}

/*
 * Remove the given vnode from the pph hash.  If the vnode is in the hash,
 * the owning process must still be around, since the vnode is obviously
 * still around and a physmem vnode is only reachable through the hash.
 * If it is not in the hash, the setup ioctl must have failed.
 * Return 0 in that case, 1 if the vnode was found and removed.
 */
int
physmem_remove_vnode_hash(vnode_t *vp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash **phpp;
	struct physmem_hash *victim;

	index = PHYSMEM_HASH(curproc);
	/* synchronize with the map routine */
	rw_enter(&pph_rwlock, RW_WRITER);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			phpp = &proc_hp->pph_hash;
			while (*phpp != NULL) {
				if ((*phpp)->ph_vnode == vp) {
					victim = *phpp;
					*phpp = victim->ph_next;

					rw_exit(&pph_rwlock);
					kmem_free(victim, sizeof (*victim));
					return (1);
				}
				phpp = &(*phpp)->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	rw_exit(&pph_rwlock);

	/* not found */
	return (0);
}

int
physmem_setup_vnops()
{
	int error;
	char *name = "physmem";
	if (physmem_vnodeops != NULL)
		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
	if (error != 0) {
		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
	}
	return (error);
}

/*
 * The guts of the PHYSMEM_SETUP ioctl.
 * Create a segment in the address space with the specified parameters.
 * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
 * We do not do bounds checking on the requested physical addresses; if they
 * do not exist in the system, they will not be mappable.
 * Returns 0 on success with the following error codes on failure:
 *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
 *		non-NULL, or the system was unable to find enough VA space for
 *		the desired length if user_va was NULL.
 *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
 */
int
physmem_setup_addrs(struct physmem_setup_param *pspp)
{
	struct as *as = curproc->p_as;
	struct segvn_crargs vn_a;
	int ret = 0;
	uint64_t base_pa;
	size_t len;
	caddr_t uvaddr;
	struct vnode *vp;
	struct physmem_hash *php;

	ASSERT(pspp != NULL);
	base_pa = pspp->req_paddr;
	len = pspp->len;
	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;

	/* Sanity checking */
	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
		return (EINVAL);
	if (!IS_P2ALIGNED(len, PAGESIZE))
		return (EINVAL);
	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
		return (EINVAL);

	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);

	/* Need to bump vnode count so that the driver can not be unloaded */
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt++;
	mutex_exit(&physmem_mutex);

	vp = vn_alloc(KM_SLEEP);
	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
	vn_setops(vp, physmem_vnodeops);

	php->ph_vnode = vp;

	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)base_pa;
	vn_a.type = MAP_SHARED;
	vn_a.prot = PROT_ALL;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = 0;
	vn_a.cred = NULL;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	as_rangelock(as);
	if (uvaddr != NULL) {
		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
			ret = ENOMEM;
fail:
			as_rangeunlock(as);
			vn_free(vp);
			kmem_free(php, sizeof (*php));
			mutex_enter(&physmem_mutex);
			physmem_vnodecnt--;
			mutex_exit(&physmem_mutex);
			return (ret);
		}
	} else {
		/* We pick the address for the user */
		map_addr(&uvaddr, len, 0, 1, 0);
		if (uvaddr == NULL) {
			ret = ENOMEM;
			goto fail;
		}
	}
	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);

	if (ret == 0) {
		as_rangeunlock(as);
		php->ph_base_pa = base_pa;
		php->ph_base_va = uvaddr;
		php->ph_seg_len = len;
		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
		pspp->cookie = (uint64_t)(uintptr_t)php;
		ret = physmem_add_hash(php);
		if (ret == 0)
			return (0);

		/* Note that the call to as_unmap will free the vnode */
		(void) as_unmap(as, uvaddr, len);
		kmem_free(php, sizeof (*php));
		return (ret);
	}

	goto fail;
	/*NOTREACHED*/
}

/*
 * The guts of the PHYSMEM_MAP ioctl.
 * Map the given PA to the appropriate VA if the PHYSMEM_SETUP ioctl has
 * already been called for this PA range.
 * Returns 0 on success with the following error codes on failure:
 *	EPERM - The requested page is long term locked, and thus repeated
 *		requests to allocate this page will likely fail.
 *	EAGAIN - The requested page could not be allocated, but it is believed
 *		that future attempts could succeed.
 *	ENOMEM - There was not enough free memory in the system to safely
 *		map the requested page.
 *	EINVAL - The requested paddr was not PAGESIZE aligned or the
 *		PHYSMEM_SETUP ioctl was not called for this page.
 *	ENOENT - The requested page was inside the kernel cage, and the
 *		PHYSMEM_CAGE flag was not set.
 *	EBUSY - The requested page is retired and the PHYSMEM_RETIRED flag
 *		was not set.
 */
static int
physmem_map_addrs(struct physmem_map_param *pmpp)
{
	caddr_t uvaddr;
	page_t *pp;
	uint64_t req_paddr;
	struct vnode *vp;
	int ret = 0;
	struct physmem_hash *php;
	uint_t flags = 0;

	ASSERT(pmpp != NULL);
	req_paddr = pmpp->req_paddr;

	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
		return (EINVAL);
	/* Find the vnode for this map request */
	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}
	vp = php->ph_vnode;
	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
	rw_exit(&pph_rwlock);

	pp = page_numtopp_nolock(btop((size_t)req_paddr));
	if (pp == NULL) {
		pmpp->ret_va = NULL;
		return (EPERM);
	}

	/*
	 * Check to see if the page is already mapped correctly.  This can
	 * happen when we failed to capture a page previously and it was
	 * captured asynchronously for us.  Return success in this case.
	 */
	if (pp->p_vnode == vp) {
		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}

	/*
	 * physmem should be responsible for checking for cage
	 * and prom pages.
	 */
	if (pmpp->flags & PHYSMEM_CAGE)
		flags = CAPTURE_GET_CAGE;
	if (pmpp->flags & PHYSMEM_RETIRED)
		flags |= CAPTURE_GET_RETIRED;

	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);

	if (ret != 0) {
		pmpp->ret_va = NULL;
		return (ret);
	} else {
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}
}

/*
 * Map the given page into the process's address space if possible.
 * We actually only hash the page in on the correct vnode as the page
 * will be mapped via segvn_pagefault.
 * This function is registered as the PC_PHYSMEM page capture callback
 * in physmem_attach().
 * Returns 0 on success.
 * Returns 1 if there is no need to map this page anymore (process exited).
 * Returns -1 if we failed to map the page.
 */
int
map_page_proc(page_t *pp, void *arg, uint_t flags)
{
	struct vnode *vp;
	proc_t *procp = (proc_t *)arg;
	int ret;
	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
	struct physmem_hash *php;

	ASSERT(pp != NULL);

	/*
	 * Check against availrmem to make sure that we're not low on memory.
	 * We check again here as ASYNC requests do not do this check elsewhere.
	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
	 * set or be on the page capture hash.
	 */
	if (swapfs_minfree > availrmem + 1) {
		page_free(pp, 1);
		return (1);
	}

	/*
	 * If this is an asynchronous request for the current process,
	 * we can not map the page as it's possible that we are also in the
	 * process of unmapping the page which could result in a deadlock
	 * with the as lock.
	 */
	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
		page_free(pp, 1);
		return (-1);
	}

	/* only return zeroed out pages */
	pagezero(pp, 0, PAGESIZE);

	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(paddr, PAGESIZE, procp);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		/*
		 * Free the page as there is no longer a valid outstanding
		 * request for this page.
		 */
		page_free(pp, 1);
		return (1);
	}

	vp = php->ph_vnode;

	/*
	 * We need to protect against a possible deadlock here where we own
	 * the vnode page hash mutex and want to acquire it again, as there
	 * are locations in the code where we unlock a page while holding
	 * the mutex, which can lead to the page being captured and eventually
	 * ending up here.
	 */
	if (mutex_owned(page_vnode_mutex(vp))) {
		rw_exit(&pph_rwlock);
		page_free(pp, 1);
		return (-1);
	}

	ret = page_hashin(pp, vp, paddr, NULL);
	rw_exit(&pph_rwlock);
	if (ret == 0) {
		page_free(pp, 1);
		return (-1);
	}

	page_downgrade(pp);

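	/*
	 * Account for this page staying long-term locked; presumably this is
	 * given back by the p_lckcnt-based availrmem accounting that
	 * page_destroy() performs when physmem_inactive() tears the vnode
	 * down.
	 */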
	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	return (0);
}

/*
 * The guts of the PHYSMEM_DESTROY ioctl.
 * The cookie passed in will provide all of the information needed to
 * free up the address space and physical memory associated with the
 * corresponding PHYSMEM_SETUP ioctl.
 * Returns 0 on success with the following error codes on failure:
 *	EINVAL - The cookie supplied is not valid.
 */
int
physmem_destroy_addrs(uint64_t p_cookie)
{
	struct as *as = curproc->p_as;
	size_t len;
	caddr_t uvaddr;

	rw_enter(&pph_rwlock, RW_READER);
	if (physmem_validate_cookie(p_cookie) == 0) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}

	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
	rw_exit(&pph_rwlock);

	(void) as_unmap(as, uvaddr, len);

	return (0);
}

/*
 * If the page has been hashed into the physmem vnode, then just look it up
 * and return it via pl, otherwise return ENOMEM as the map ioctl has not
 * succeeded on the given page.
 */
/*ARGSUSED*/
static int
physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
    struct cred *cr, caller_context_t *ct)
{
	page_t *pp;

	ASSERT(len == PAGESIZE);
	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If the page is in the hash, then we successfully claimed this
	 * page earlier, so return it to the caller.
	 */
	pp = page_lookup(vp, off, SE_SHARED);
	if (pp != NULL) {
		pl[0] = pp;
		pl[1] = NULL;
		*protp = PROT_ALL;
		return (0);
	}
	return (ENOMEM);
}

/*
 * We can not allow a process mapping /dev/physmem pages to fork, as there can
 * only be a single mapping to a /dev/physmem page at a given time.  Thus, we
 * return EINVAL when we are not working on our own address space.
 * Otherwise we return zero, as this function is required for normal operation.
 */
/*ARGSUSED*/
static int
physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct)
{
	if (curproc->p_as != as) {
		return (EINVAL);
	}
	return (0);
}

/* Will always get called for removing a whole segment. */
/*ARGSUSED*/
static int
physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct)
{
	/*
	 * Release our hold on the vnode so that the final VN_RELE will
	 * call physmem_inactive to clean things up.
	 */
	VN_RELE(vp);

	return (0);
}

/*
 * Clean up all the pages belonging to this vnode and then free it.
 */
/*ARGSUSED*/
static void
physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct)
{
	page_t *pp;

	/*
	 * Remove the vnode from the hash now, to prevent asynchronous
	 * attempts to map into this vnode.  This avoids a deadlock
	 * where two threads try to get into this logic at the same
	 * time and try to map the pages they are destroying into the
	 * other's address space.
	 * If it's not in the hash, just free it.
	 */
	if (physmem_remove_vnode_hash(vp) == 0) {
		ASSERT(vp->v_pages == NULL);
		vn_free(vp);
		physmem_remove_hash_proc();
		mutex_enter(&physmem_mutex);
		physmem_vnodecnt--;
		mutex_exit(&physmem_mutex);
		return;
	}

	/*
	 * At this point in time, no other logic can be adding or removing
	 * pages from the vnode; otherwise the v_pages list could be
	 * inaccurate.
	 */

	while ((pp = vp->v_pages) != NULL) {
		page_t *rpp;
		if (page_tryupgrade(pp)) {
			/*
			 * Set lckcnt for page_destroy to do availrmem
			 * accounting.
			 */
			pp->p_lckcnt = 1;
			page_destroy(pp, 0);
		} else {
			/* failure to lock should be transient */
			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
			if (rpp != pp) {
				page_unlock(rpp);
				continue;
			}
			page_unlock(pp);
		}
	}
	vn_free(vp);
	physmem_remove_hash_proc();
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt--;
	mutex_exit(&physmem_mutex);
}

/*ARGSUSED*/
static int
physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int ret;

	switch (cmd) {
	case PHYSMEM_SETUP:
		{
			struct physmem_setup_param psp;
			if (ddi_copyin((void *)arg, &psp,
			    sizeof (struct physmem_setup_param), 0))
				return (EFAULT);
			ret = physmem_setup_addrs(&psp);
			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_MAP:
		{
			struct physmem_map_param pmp;
			if (ddi_copyin((void *)arg, &pmp,
			    sizeof (struct physmem_map_param), 0))
				return (EFAULT);
			ret = physmem_map_addrs(&pmp);
			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_DESTROY:
		{
			uint64_t cookie;
			if (ddi_copyin((void *)arg, &cookie,
			    sizeof (uint64_t), 0))
				return (EFAULT);
			ret = physmem_destroy_addrs(cookie);
		}
		break;
	default:
		return (ENOTSUP);
	}
	return (ret);
}
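
/*
 * Rough user-space usage sketch (illustrative only; ioctl, structure and
 * field names come from this file and sys/physmem.h, while the physical
 * address below is a made-up placeholder):
 *
 *	int fd = open("/dev/physmem", O_RDWR);	// driver requires FREAD|FWRITE
 *
 *	struct physmem_setup_param psp = { 0 };
 *	psp.req_paddr = 0x10000000;		// placeholder, PAGESIZE aligned
 *	psp.len = PAGESIZE;
 *	psp.user_va = 0;			// let the driver pick the VA
 *	ioctl(fd, PHYSMEM_SETUP, &psp);		// fills in user_va and cookie
 *
 *	struct physmem_map_param pmp = { 0 };
 *	pmp.req_paddr = psp.req_paddr;
 *	ioctl(fd, PHYSMEM_MAP, &pmp);		// page now faultable at ret_va
 *
 *	ioctl(fd, PHYSMEM_DESTROY, &psp.cookie);
 *	close(fd);
 */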

/*ARGSUSED*/
static int
physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	int ret;
	static int msg_printed = 0;

	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
		return (EINVAL);
	}

	/* need to make sure we have the right privileges */
	if ((ret = secpolicy_resource(credp)) != 0)
		return (ret);
	if ((ret = secpolicy_lock_memory(credp)) != 0)
		return (ret);

	if (msg_printed == 0) {
		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
		    "take out long term locks on pages which may impact "
		    "dynamic reconfiguration events");
		msg_printed = 1;
	}

	return (0);
}

/*ARGSUSED*/
static int
physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}

/*ARGSUSED*/
static int
physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
    void *arg, void **resultp)
{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*resultp = physmem_dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

static int
physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int i;

	if (cmd == DDI_RESUME) {
		return (DDI_SUCCESS);
	}

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
		return (DDI_FAILURE);

	physmem_dip = dip;

	/* Initialize driver specific data */
	if (physmem_setup_vnops()) {
		ddi_remove_minor_node(dip, ddi_get_name(dip));
		return (DDI_FAILURE);
	}

	for (i = 0; i < PPH_SIZE; i++)
		pph[i] = NULL;

	page_capture_register_callback(PC_PHYSMEM, 10000,
	    map_page_proc);

	return (DDI_SUCCESS);
}

static int
physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int ret = DDI_SUCCESS;

	if (cmd == DDI_SUSPEND) {
		return (DDI_SUCCESS);
	}

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ASSERT(physmem_dip == dip);

	mutex_enter(&physmem_mutex);
	if (physmem_vnodecnt == 0) {
		if (physmem_vnodeops != NULL) {
			vn_freevnodeops(physmem_vnodeops);
			physmem_vnodeops = NULL;
			page_capture_unregister_callback(PC_PHYSMEM);
		}
	} else {
		ret = EBUSY;
	}
	mutex_exit(&physmem_mutex);
	if (ret == DDI_SUCCESS)
		ddi_remove_minor_node(dip, ddi_get_name(dip));
	return (ret);
}

static struct cb_ops physmem_cb_ops = {
	physmem_open,	/* open */
	physmem_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	physmem_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* chpoll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* cb_str */
	D_NEW | D_MP | D_DEVMAP,
	CB_REV,
	NULL,
	NULL
};

static struct dev_ops physmem_ops = {
	DEVO_REV,
	0,
	physmem_getinfo,
	nulldev,
	nulldev,
	physmem_attach,
	physmem_detach,
	nodev,
	&physmem_cb_ops,
	NULL,
	NULL,
	ddi_quiesce_not_needed,		/* quiesce */
};

static struct modldrv modldrv = {
	&mod_driverops,
	"physmem driver",
	&physmem_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}