xref: /onnv-gate/usr/src/uts/common/io/physmem.c (revision 5331:3047ad28a67b)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

#include <sys/types.h>
#include <sys/modctl.h>
#include <sys/conf.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/devops.h>
#include <sys/stat.h>
#include <sys/file.h>
#include <sys/cred.h>
#include <sys/policy.h>
#include <sys/errno.h>
#include <vm/seg_dev.h>
#include <vm/seg_vn.h>
#include <vm/page.h>
#include <sys/fs/swapnode.h>
#include <sys/sysmacros.h>
#include <sys/fcntl.h>
#include <sys/vmsystm.h>
#include <sys/physmem.h>
#include <sys/vfs_opreg.h>

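/*
 * Driver overview:  /dev/physmem lets a sufficiently privileged process
 * claim specific physical pages and map them into its own address space.
 * PHYSMEM_SETUP reserves a VA range backed by a private physmem vnode,
 * PHYSMEM_MAP captures one physical page at a time and returns the VA at
 * which it is mapped, and PHYSMEM_DESTROY tears the whole range down.
 *
 * The fragment below is only an illustrative userland sketch, not part of
 * the driver.  It assumes the ioctl numbers and the physmem_setup_param /
 * physmem_map_param layouts from <sys/physmem.h>, a PAGESIZE-aligned
 * req_paddr and len, and a caller holding the resource and lock_memory
 * privileges checked in physmem_open().  Setting user_va to 0 lets the
 * driver choose the VA; flags may include PHYSMEM_CAGE or PHYSMEM_RETIRED.
 *
 *	#include <sys/types.h>
 *	#include <sys/param.h>
 *	#include <sys/physmem.h>
 *	#include <inttypes.h>
 *	#include <fcntl.h>
 *	#include <unistd.h>
 *
 *	int
 *	claim_page(uint64_t pa)
 *	{
 *		struct physmem_setup_param psp;
 *		struct physmem_map_param pmp;
 *		int fd, ret = -1;
 *
 *		if ((fd = open("/dev/physmem", O_RDWR)) == -1)
 *			return (-1);
 *
 *		psp.req_paddr = pa;
 *		psp.len = PAGESIZE;
 *		psp.user_va = 0;
 *		if (ioctl(fd, PHYSMEM_SETUP, &psp) == 0) {
 *			pmp.req_paddr = pa;
 *			pmp.flags = 0;
 *			ret = ioctl(fd, PHYSMEM_MAP, &pmp);
 *			if (ret == 0)
 *				*(char *)(uintptr_t)pmp.ret_va = 0x5a;
 *			(void) ioctl(fd, PHYSMEM_DESTROY, &psp.cookie);
 *		}
 *		(void) close(fd);
 *		return (ret);
 *	}
 */
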
static dev_info_t		*physmem_dip = NULL;

/*
 * Linked list element hanging off physmem_proc_hash below, which holds all
 * the information for a given segment which has been set up for this process.
 * This is a simple linked list as we are assuming that for a given process
 * the setup ioctl will only be called a handful of times.  If this assumption
 * changes in the future, a data structure that is quicker to traverse should
 * be used.
 */
struct physmem_hash {
	struct physmem_hash *ph_next;
	uint64_t ph_base_pa;
	caddr_t ph_base_va;
	size_t ph_seg_len;
	struct vnode *ph_vnode;
};

/*
 * Hash of all of the processes which have set up mappings with the driver,
 * with pointers to per process data.
 */
struct physmem_proc_hash {
	struct proc *pph_proc;
	struct physmem_hash *pph_hash;
	struct physmem_proc_hash *pph_next;
};


/* Needs to be a power of two for simple hash algorithm */
#define	PPH_SIZE	8
struct physmem_proc_hash *pph[PPH_SIZE];

/*
 * Lock which protects the pph hash above.  To add an element (either a new
 * process or a new segment) the WRITE lock must be held.  To traverse the
 * list, only a READ lock is needed.
 */
krwlock_t pph_rwlock;

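/*
 * Hash on the proc pointer itself: shift off the low-order address bits
 * (which vary little between kmem-allocated proc_t structures) and mask
 * the result down to a bucket index.  The mask only works because
 * PPH_SIZE is a power of two (see above).
 */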
#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))

/*
 * Need to keep a reference count of how many processes have the driver
 * open to prevent it from disappearing.
 */
uint64_t physmem_vnodecnt;
kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */

static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr, caller_context_t *ct);

static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct);

static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct);

static void physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct);

const fs_operation_def_t physmem_vnodeops_template[] = {
	VOPNAME_GETPAGE,	{ .vop_getpage = physmem_getpage },
	VOPNAME_ADDMAP,		{ .vop_addmap = physmem_addmap },
	VOPNAME_DELMAP,		{ .vop_delmap = physmem_delmap },
	VOPNAME_INACTIVE,	{ .vop_inactive = physmem_inactive },
	NULL,			NULL
};

vnodeops_t *physmem_vnodeops = NULL;

/*
 * Removes the current process from the hash if the process has no more
 * physmem segments active.
 */
void
physmem_remove_hash_proc()
{
	int index;
	struct physmem_proc_hash **walker;
	struct physmem_proc_hash *victim = NULL;

	index = PHYSMEM_HASH(curproc);
	rw_enter(&pph_rwlock, RW_WRITER);
	walker = &pph[index];
	while (*walker != NULL) {
		if ((*walker)->pph_proc == curproc &&
		    (*walker)->pph_hash == NULL) {
			victim = *walker;
			*walker = victim->pph_next;
			break;
		}
		walker = &((*walker)->pph_next);
	}
	rw_exit(&pph_rwlock);
	if (victim != NULL)
		kmem_free(victim, sizeof (struct physmem_proc_hash));
}

/*
 * Add a new entry to the hash for the given process to cache the
 * address ranges that it is working on.  If this is the first hash
 * item to be added for this process, we will create the head pointer
 * for this process.
 * Returns 0 on success, ERANGE when the physical address is already in the
 * hash.
 */
int
physmem_add_hash(struct physmem_hash *php)
{
	int index;
	struct physmem_proc_hash *iterator;
	struct physmem_proc_hash *newp = NULL;
	struct physmem_hash *temp;
	int ret = 0;

	index = PHYSMEM_HASH(curproc);

insert:
	rw_enter(&pph_rwlock, RW_WRITER);
	iterator = pph[index];
	while (iterator != NULL) {
		if (iterator->pph_proc == curproc) {
			/*
			 * check to make sure a single process does not try to
			 * map the same region twice.
			 */
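			/*
			 * Two [pa, pa + len) ranges overlap exactly when one
			 * range's base address falls inside the other; the
			 * test below checks containment in both directions.
			 */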
			for (temp = iterator->pph_hash; temp != NULL;
			    temp = temp->ph_next) {
				if ((php->ph_base_pa >= temp->ph_base_pa &&
				    php->ph_base_pa < temp->ph_base_pa +
				    temp->ph_seg_len) ||
				    (temp->ph_base_pa >= php->ph_base_pa &&
				    temp->ph_base_pa < php->ph_base_pa +
				    php->ph_seg_len)) {
					ret = ERANGE;
					break;
				}
			}
			if (ret == 0) {
				php->ph_next = iterator->pph_hash;
				iterator->pph_hash = php;
			}
			rw_exit(&pph_rwlock);
			/* Need to check for two threads in sync */
			if (newp != NULL)
				kmem_free(newp, sizeof (*newp));
			return (ret);
		}
		iterator = iterator->pph_next;
	}

	if (newp != NULL) {
		newp->pph_proc = curproc;
		newp->pph_next = pph[index];
		newp->pph_hash = php;
		php->ph_next = NULL;
		pph[index] = newp;
		rw_exit(&pph_rwlock);
		return (0);
	}

	rw_exit(&pph_rwlock);
	/* Dropped the lock so we could use KM_SLEEP */
	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
	goto insert;
}

/*
 * Will return the pointer to the physmem_hash struct if the setup routine
 * has previously been called for this memory.
 * Returns NULL on failure.
 */
struct physmem_hash *
physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash *php;

	ASSERT(rw_lock_held(&pph_rwlock));

	index = PHYSMEM_HASH(procp);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == procp) {
			php = proc_hp->pph_hash;
			while (php != NULL) {
				if ((req_paddr >= php->ph_base_pa) &&
				    (req_paddr + len <=
				    php->ph_base_pa + php->ph_seg_len)) {
					return (php);
				}
				php = php->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	return (NULL);
}

int
physmem_validate_cookie(uint64_t p_cookie)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash *php;

	ASSERT(rw_lock_held(&pph_rwlock));

	index = PHYSMEM_HASH(curproc);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			php = proc_hp->pph_hash;
			while (php != NULL) {
				if ((uint64_t)(uintptr_t)php == p_cookie) {
					return (1);
				}
				php = php->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	return (0);
}

/*
 * Remove the given vnode from the pph hash.  If the vnode is found in the
 * hash, the owning process must still be around, as the vnode itself is
 * still around and every physmem vnode belongs to an entry in the hash.
 * If it is not in the hash, the setup ioctl must have failed.
 * Return 1 if the vnode was found and removed, 0 if it was not in the hash.
 */
int
physmem_remove_vnode_hash(vnode_t *vp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash **phpp;
	struct physmem_hash *victim;

	index = PHYSMEM_HASH(curproc);
	/* synchronize with the map routine */
	rw_enter(&pph_rwlock, RW_WRITER);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			phpp = &proc_hp->pph_hash;
			while (*phpp != NULL) {
				if ((*phpp)->ph_vnode == vp) {
					victim = *phpp;
					*phpp = victim->ph_next;

					rw_exit(&pph_rwlock);
					kmem_free(victim, sizeof (*victim));
					return (1);
				}
				phpp = &(*phpp)->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	rw_exit(&pph_rwlock);

	/* not found */
	return (0);
}

int
physmem_setup_vnops()
{
	int error;
	char *name = "physmem";

	if (physmem_vnodeops != NULL)
		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
	if (error != 0) {
		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
	}
	return (error);
}

/*
 * The guts of the PHYSMEM_SETUP ioctl.
 * Create a segment in the address space with the specified parameters.
 * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
 * We do not do bounds checking on the requested physical addresses; if they
 * do not exist in the system, they will not be mappable.
 * Returns 0 on success with the following error codes on failure:
 *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
 *		non-NULL, or the system was unable to find enough VA space for
 *		the desired length if user_va was NULL.
 *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
 */
int
physmem_setup_addrs(struct physmem_setup_param *pspp)
{
	struct as *as = curproc->p_as;
	struct segvn_crargs vn_a;
	int ret = 0;
	uint64_t base_pa;
	size_t len;
	caddr_t uvaddr;
	struct vnode *vp;
	struct physmem_hash *php;

	ASSERT(pspp != NULL);
	base_pa = pspp->req_paddr;
	len = pspp->len;
	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;

	/* Sanity checking */
	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
		return (EINVAL);
	if (!IS_P2ALIGNED(len, PAGESIZE))
		return (EINVAL);
	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
		return (EINVAL);

	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);

	/* Need to bump vnode count so that the driver can not be unloaded */
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt++;
	mutex_exit(&physmem_mutex);

	vp = vn_alloc(KM_SLEEP);
	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
	vn_setops(vp, physmem_vnodeops);

	php->ph_vnode = vp;

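	/*
	 * The segvn segment created below is backed by the private physmem
	 * vnode, with the file offset set to the base physical address, so
	 * offsets within this vnode correspond one-to-one to physical
	 * addresses (see also map_page_proc() and physmem_getpage()).
	 */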
	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)base_pa;
	vn_a.type = MAP_SHARED;
	vn_a.prot = PROT_ALL;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = 0;
	vn_a.cred = NULL;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	as_rangelock(as);
	if (uvaddr != NULL) {
		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
			ret = ENOMEM;
fail:
			as_rangeunlock(as);
			vn_free(vp);
			kmem_free(php, sizeof (*php));
			mutex_enter(&physmem_mutex);
			physmem_vnodecnt--;
			mutex_exit(&physmem_mutex);
			return (ret);
		}
	} else {
		/* We pick the address for the user */
		map_addr(&uvaddr, len, 0, 1, 0);
		if (uvaddr == NULL) {
			ret = ENOMEM;
			goto fail;
		}
	}
	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);

	if (ret == 0) {
		as_rangeunlock(as);
		php->ph_base_pa = base_pa;
		php->ph_base_va = uvaddr;
		php->ph_seg_len = len;
		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
		pspp->cookie = (uint64_t)(uintptr_t)php;
		ret = physmem_add_hash(php);
		if (ret == 0)
			return (0);

		/* Note that the call to as_unmap will free the vnode */
		(void) as_unmap(as, uvaddr, len);
		kmem_free(php, sizeof (*php));
		return (ret);
	}

	goto fail;
	/*NOTREACHED*/
}

/*
 * The guts of the PHYSMEM_MAP ioctl.
 * Map the given PA to the appropriate VA if the PHYSMEM_SETUP ioctl has
 * already been called for this PA range.
 * Returns 0 on success with the following error codes on failure:
 *	EPERM - The requested page is long term locked, and thus repeated
 *		requests to allocate this page will likely fail.
 *	EAGAIN - The requested page could not be allocated, but it is believed
 *		that future attempts could succeed.
 *	ENOMEM - There was not enough free memory in the system to safely
 *		map the requested page.
 *	EINVAL - The requested paddr was not PAGESIZE aligned or the
 *		PHYSMEM_SETUP ioctl was not called for this page.
 *	ENOENT - The requested page was inside the kernel cage, and the
 *		PHYSMEM_CAGE flag was not set.
 *	EBUSY - The requested page is retired and the PHYSMEM_RETIRED flag
 *		was not set.
 */
static int
physmem_map_addrs(struct physmem_map_param *pmpp)
{
	caddr_t uvaddr;
	page_t *pp;
	uint64_t req_paddr;
	struct vnode *vp;
	int ret = 0;
	struct physmem_hash *php;
	uint_t flags = 0;

	ASSERT(pmpp != NULL);
	req_paddr = pmpp->req_paddr;

	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
		return (EINVAL);
	/* Find the vnode for this map request */
	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}
	vp = php->ph_vnode;
	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
	rw_exit(&pph_rwlock);

	pp = page_numtopp_nolock(btop((size_t)req_paddr));
	if (pp == NULL) {
		pmpp->ret_va = NULL;
		return (EPERM);
	}

	/*
	 * Check to see if page already mapped correctly.  This can happen
	 * when we failed to capture a page previously and it was captured
	 * asynchronously for us.  Return success in this case.
	 */
	if (pp->p_vnode == vp) {
		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}

	/*
	 * physmem should be responsible for checking for cage
	 * and prom pages.
	 */
	if (pmpp->flags & PHYSMEM_CAGE)
		flags = CAPTURE_GET_CAGE;
	if (pmpp->flags & PHYSMEM_RETIRED)
		flags |= CAPTURE_GET_RETIRED;

	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);

	if (ret != 0) {
		pmpp->ret_va = NULL;
		return (ret);
	} else {
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}
}

/*
 * Map the given page into the process's address space if possible.
 * We actually only hash the page in on the correct vnode as the page
 * will be mapped via segvn_pagefault.
 * returns 0 on success
 * returns 1 if there is no need to map this page anymore (process exited)
 * returns -1 if we failed to map the page.
 */
int
map_page_proc(page_t *pp, void *arg, uint_t flags)
{
	struct vnode *vp;
	proc_t *procp = (proc_t *)arg;
	int ret;
	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
	struct physmem_hash *php;

	ASSERT(pp != NULL);

	/*
	 * Check against availrmem to make sure that we're not low on memory.
	 * We check again here as ASYNC requests do not do this check elsewhere.
	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
	 * set or be on the page capture hash.
	 */
	if (swapfs_minfree > availrmem + 1) {
		page_free(pp, 1);
		return (1);
	}

	/*
	 * If this is an asynchronous request for the current process,
	 * we can not map the page as it's possible that we are also in the
	 * process of unmapping the page which could result in a deadlock
	 * with the as lock.
	 */
	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
		page_free(pp, 1);
		return (-1);
	}

	/* only return zeroed out pages */
	pagezero(pp, 0, PAGESIZE);

	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(paddr, PAGESIZE, procp);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		/*
		 * Free the page as there is no longer a valid outstanding
		 * request for this page.
		 */
		page_free(pp, 1);
		return (1);
	}

	vp = php->ph_vnode;

	/*
	 * We need to protect against a possible deadlock here where we own
	 * the vnode page hash mutex and want to acquire it again, as there
	 * are locations in the code where we unlock a page while holding
	 * the mutex, which can lead to the page being captured and
	 * eventually end up here.
	 */
	if (mutex_owned(page_vnode_mutex(vp))) {
		rw_exit(&pph_rwlock);
		page_free(pp, 1);
		return (-1);
	}

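	/*
	 * The vnode offset for a physmem page is simply its physical address
	 * (physmem_setup_addrs() created the segment with offset == base_pa),
	 * so hash the page in at 'paddr' and let physmem_getpage() find it.
	 */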
	ret = page_hashin(pp, vp, paddr, NULL);
	rw_exit(&pph_rwlock);
	if (ret == 0) {
		page_free(pp, 1);
		return (-1);
	}

	page_downgrade(pp);

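	/*
	 * Account for the page being locked down on behalf of the process;
	 * physmem_inactive() gives availrmem back by setting p_lckcnt before
	 * calling page_destroy().
	 */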
	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	return (0);
}

/*
 * The guts of the PHYSMEM_DESTROY ioctl.
 * The cookie passed in will provide all of the information needed to
 * free up the address space and physical memory associated with the
 * corresponding PHYSMEM_SETUP ioctl.
 * Returns 0 on success with the following error codes on failure:
 *	EINVAL - The cookie supplied is not valid.
 */
int
physmem_destroy_addrs(uint64_t p_cookie)
{
	struct as *as = curproc->p_as;
	size_t len;
	caddr_t uvaddr;

	rw_enter(&pph_rwlock, RW_READER);
	if (physmem_validate_cookie(p_cookie) == 0) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}

	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
	rw_exit(&pph_rwlock);

	(void) as_unmap(as, uvaddr, len);

	return (0);
}

/*
 * If the page has been hashed into the physmem vnode, then just look it up
 * and return it via pl; otherwise return ENOMEM as the map ioctl has not
 * succeeded on the given page.
 */
/*ARGSUSED*/
static int
physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
    page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
    struct cred *cr, caller_context_t *ct)
{
	page_t *pp;

	ASSERT(len == PAGESIZE);
	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));

	/*
	 * If the page is in the hash, then we successfully claimed this
	 * page earlier, so return it to the caller.
	 */
	pp = page_lookup(vp, off, SE_SHARED);
	if (pp != NULL) {
		pl[0] = pp;
		pl[1] = NULL;
		*protp = PROT_ALL;
		return (0);
	}
	return (ENOMEM);
}

/*
 * We can not allow a process mapping /dev/physmem pages to fork as there can
 * only be a single mapping to a /dev/physmem page at a given time.  Thus, we
 * return EINVAL when we are not working on our own address space.
 * Otherwise we return zero as this function is required for normal operation.
 */
/*ARGSUSED*/
static int
physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct)
{
	if (curproc->p_as != as) {
		return (EINVAL);
	}
	return (0);
}

/* Will always get called for removing a whole segment. */
/*ARGSUSED*/
static int
physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred, caller_context_t *ct)
{
	/*
	 * Release our hold on the vnode so that the final VN_RELE will
	 * call physmem_inactive to clean things up.
	 */
	VN_RELE(vp);

	return (0);
}

/*
 * Clean up all the pages belonging to this vnode and then free it.
 */
/*ARGSUSED*/
static void
physmem_inactive(vnode_t *vp, cred_t *crp, caller_context_t *ct)
{
	page_t *pp;

	/*
	 * Remove the vnode from the hash now, to prevent asynchronous
	 * attempts to map into this vnode.  This avoids a deadlock
	 * where two threads try to get into this logic at the same
	 * time and try to map the pages they are destroying into the
	 * other's address space.
	 * If it's not in the hash, just free it.
	 */
	if (physmem_remove_vnode_hash(vp) == 0) {
		ASSERT(vp->v_pages == NULL);
		vn_free(vp);
		physmem_remove_hash_proc();
		mutex_enter(&physmem_mutex);
		physmem_vnodecnt--;
		mutex_exit(&physmem_mutex);
		return;
	}

	/*
	 * At this point in time, no other logic can be adding or removing
	 * pages from the vnode; otherwise the v_pages list could be
	 * inaccurate.
	 */

	while ((pp = vp->v_pages) != NULL) {
		page_t *rpp;
		if (page_tryupgrade(pp)) {
			/*
			 * set lckcnt for page_destroy to do availrmem
			 * accounting
			 */
			pp->p_lckcnt = 1;
			page_destroy(pp, 0);
		} else {
			/* failure to lock should be transient */
			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
			if (rpp != pp) {
				page_unlock(rpp);
				continue;
			}
			page_unlock(pp);
		}
	}
	vn_free(vp);
	physmem_remove_hash_proc();
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt--;
	mutex_exit(&physmem_mutex);
}

/*ARGSUSED*/
static int
physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int ret;

	switch (cmd) {
	case PHYSMEM_SETUP:
		{
			struct physmem_setup_param psp;
			if (ddi_copyin((void *)arg, &psp,
			    sizeof (struct physmem_setup_param), 0))
				return (EFAULT);
			ret = physmem_setup_addrs(&psp);
			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_MAP:
		{
			struct physmem_map_param pmp;
			if (ddi_copyin((void *)arg, &pmp,
			    sizeof (struct physmem_map_param), 0))
				return (EFAULT);
			ret = physmem_map_addrs(&pmp);
			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_DESTROY:
		{
			uint64_t cookie;
			if (ddi_copyin((void *)arg, &cookie,
			    sizeof (uint64_t), 0))
				return (EFAULT);
			ret = physmem_destroy_addrs(cookie);
		}
		break;
	default:
		return (ENOTSUP);
	}
	return (ret);
}

/*ARGSUSED*/
static int
physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
{
	int ret;
	static int msg_printed = 0;

	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
		return (EINVAL);
	}

	/* need to make sure we have the right privileges */
	if ((ret = secpolicy_resource(credp)) != 0)
		return (ret);
	if ((ret = secpolicy_lock_memory(credp)) != 0)
		return (ret);

	if (msg_printed == 0) {
		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
		    "take out long term locks on pages which may impact "
		    "dynamic reconfiguration events");
		msg_printed = 1;
	}

	return (0);
}

/*ARGSUSED*/
static int
physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}

/*ARGSUSED*/
static int
physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
    void *arg, void **resultp)
{
	switch (infocmd) {
	case DDI_INFO_DEVT2DEVINFO:
		*resultp = physmem_dip;
		return (DDI_SUCCESS);

	case DDI_INFO_DEVT2INSTANCE:
		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
		return (DDI_SUCCESS);

	default:
		return (DDI_FAILURE);
	}
}

static int
physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
{
	int i;

	if (cmd == DDI_RESUME) {
		return (DDI_SUCCESS);
	}

	if (cmd != DDI_ATTACH)
		return (DDI_FAILURE);

	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
		return (DDI_FAILURE);

	physmem_dip = dip;

	/* Initialize driver specific data */
	if (physmem_setup_vnops()) {
		ddi_remove_minor_node(dip, ddi_get_name(dip));
		return (DDI_FAILURE);
	}

	for (i = 0; i < PPH_SIZE; i++)
		pph[i] = NULL;

	page_capture_register_callback(PC_PHYSMEM, 10000,
	    map_page_proc);

	return (DDI_SUCCESS);
}

static int
physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
{
	int ret = DDI_SUCCESS;

	if (cmd == DDI_SUSPEND) {
		return (DDI_SUCCESS);
	}

	if (cmd != DDI_DETACH)
		return (DDI_FAILURE);

	ASSERT(physmem_dip == dip);

	mutex_enter(&physmem_mutex);
	if (physmem_vnodecnt == 0) {
		if (physmem_vnodeops != NULL) {
			vn_freevnodeops(physmem_vnodeops);
			physmem_vnodeops = NULL;
			page_capture_unregister_callback(PC_PHYSMEM);
		}
	} else {
		ret = EBUSY;
	}
	mutex_exit(&physmem_mutex);
	if (ret == DDI_SUCCESS)
		ddi_remove_minor_node(dip, ddi_get_name(dip));
	return (ret);
}

static struct cb_ops physmem_cb_ops = {
	physmem_open,	/* open */
	physmem_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	physmem_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* chpoll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* cb_str */
	D_NEW | D_MP | D_DEVMAP,
	CB_REV,
	NULL,
	NULL
};

static struct dev_ops physmem_ops = {
	DEVO_REV,
	0,
	physmem_getinfo,
	nulldev,
	nulldev,
	physmem_attach,
	physmem_detach,
	nodev,
	&physmem_cb_ops,
	NULL,
	NULL
};

static struct modldrv modldrv = {
	&mod_driverops,
	"physmem driver %I%",
	&physmem_ops
};

static struct modlinkage modlinkage = {
	MODREV_1,
	&modldrv,
	NULL
};

int
_init(void)
{
	return (mod_install(&modlinkage));
}

int
_info(struct modinfo *modinfop)
{
	return (mod_info(&modlinkage, modinfop));
}

int
_fini(void)
{
	return (mod_remove(&modlinkage));
}