xref: /onnv-gate/usr/src/uts/common/io/physmem.c (revision 3253:c929f34b62c5)
1*3253Smec /*
2*3253Smec  * CDDL HEADER START
3*3253Smec  *
4*3253Smec  * The contents of this file are subject to the terms of the
5*3253Smec  * Common Development and Distribution License (the "License").
6*3253Smec  * You may not use this file except in compliance with the License.
7*3253Smec  *
8*3253Smec  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9*3253Smec  * or http://www.opensolaris.org/os/licensing.
10*3253Smec  * See the License for the specific language governing permissions
11*3253Smec  * and limitations under the License.
12*3253Smec  *
13*3253Smec  * When distributing Covered Code, include this CDDL HEADER in each
14*3253Smec  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15*3253Smec  * If applicable, add the following below this CDDL HEADER, with the
16*3253Smec  * fields enclosed by brackets "[]" replaced with your own identifying
17*3253Smec  * information: Portions Copyright [yyyy] [name of copyright owner]
18*3253Smec  *
19*3253Smec  * CDDL HEADER END
20*3253Smec  */
21*3253Smec /*
22*3253Smec  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23*3253Smec  * Use is subject to license terms.
24*3253Smec  */
25*3253Smec 
26*3253Smec #pragma ident	"%Z%%M%	%I%	%E% SMI"
27*3253Smec 
28*3253Smec #include <sys/types.h>
29*3253Smec #include <sys/modctl.h>
30*3253Smec #include <sys/conf.h>
31*3253Smec #include <sys/ddi.h>
32*3253Smec #include <sys/sunddi.h>
33*3253Smec #include <sys/devops.h>
34*3253Smec #include <sys/stat.h>
35*3253Smec #include <sys/file.h>
36*3253Smec #include <sys/cred.h>
37*3253Smec #include <sys/policy.h>
38*3253Smec #include <sys/errno.h>
39*3253Smec #include <vm/seg_dev.h>
40*3253Smec #include <vm/seg_vn.h>
41*3253Smec #include <vm/page.h>
42*3253Smec #include <sys/fs/swapnode.h>
43*3253Smec #include <sys/sysmacros.h>
44*3253Smec #include <sys/fcntl.h>
45*3253Smec #include <sys/vmsystm.h>
46*3253Smec #include <sys/physmem.h>
47*3253Smec 
/* dev_info node for this pseudo-driver; set in physmem_attach() */
static dev_info_t		*physmem_dip = NULL;
49*3253Smec 
/*
 * Linked list element hanging off physmem_proc_hash below, which holds all
 * the information for a given segment which has been setup for this process.
 * This is a simple linked list as we are assuming that for a given process
 * the setup ioctl will only be called a handful of times.  If this assumption
 * changes in the future, a quicker to traverse data structure should be used.
 */
struct physmem_hash {
	struct physmem_hash *ph_next;	/* next segment for this process */
	uint64_t ph_base_pa;		/* physical base of the segment */
	caddr_t ph_base_va;		/* user virtual base of the segment */
	size_t ph_seg_len;		/* length of the segment in bytes */
	struct vnode *ph_vnode;		/* vnode the pages are hashed onto */
};
64*3253Smec 
/*
 * Hash of all of the processes which have setup mappings with the driver with
 * pointers to per process data.
 */
struct physmem_proc_hash {
	struct proc *pph_proc;			/* process owning the segments */
	struct physmem_hash *pph_hash;		/* this process' segment list */
	struct physmem_proc_hash *pph_next;	/* next entry in hash bucket */
};
74*3253Smec 
75*3253Smec 
/* Needs to be a power of two for simple hash algorithm */
#define	PPH_SIZE	8

/* Buckets of per-process state; indexed via PHYSMEM_HASH() below */
struct physmem_proc_hash *pph[PPH_SIZE];

/*
 * Lock which protects the pph hash above.  To add an element (either a new
 * process or a new segment) the WRITE lock must be held.  To traverse the
 * list, only a READ lock is needed.
 */
krwlock_t pph_rwlock;

/* Bucket index for a proc pointer; >> 8 skips the low (aligned) bits */
#define	PHYSMEM_HASH(procp) ((int)((((uintptr_t)procp) >> 8) & (PPH_SIZE - 1)))

/*
 * Need to keep a reference count of how many processes have the driver
 * open to prevent it from disappearing.
 */
uint64_t physmem_vnodecnt;
kmutex_t physmem_mutex;		/* protects physmem_vnodecnt */
95*3253Smec 
/* vnode operations implemented below; wired up via the template table */
static int physmem_getpage(struct vnode *vp, offset_t off, size_t len,
    uint_t *protp, page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr,
    enum seg_rw rw, struct cred *cr);

static int physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
    struct cred *cred);

static int physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred);

static void physmem_inactive(vnode_t *vp, cred_t *crp);

/*
 * Ops table for physmem vnodes; only the operations needed to support
 * segvn mappings of our pages are provided.
 */
const fs_operation_def_t physmem_vnodeops_template[] = {
	VOPNAME_GETPAGE, physmem_getpage,
	VOPNAME_ADDMAP, (fs_generic_func_p) physmem_addmap,
	VOPNAME_DELMAP, physmem_delmap,
	VOPNAME_INACTIVE, (fs_generic_func_p) physmem_inactive,
	NULL, NULL
};

/* Built from the template above by physmem_setup_vnops() at attach time */
vnodeops_t *physmem_vnodeops = NULL;
119*3253Smec 
120*3253Smec /*
121*3253Smec  * Removes the current process from the hash if the process has no more
122*3253Smec  * physmem segments active.
123*3253Smec  */
124*3253Smec void
125*3253Smec physmem_remove_hash_proc()
126*3253Smec {
127*3253Smec 	int index;
128*3253Smec 	struct physmem_proc_hash **walker;
129*3253Smec 	struct physmem_proc_hash *victim = NULL;
130*3253Smec 
131*3253Smec 	index = PHYSMEM_HASH(curproc);
132*3253Smec 	rw_enter(&pph_rwlock, RW_WRITER);
133*3253Smec 	walker = &pph[index];
134*3253Smec 	while (*walker != NULL) {
135*3253Smec 		if ((*walker)->pph_proc == curproc &&
136*3253Smec 		    (*walker)->pph_hash == NULL) {
137*3253Smec 			victim = *walker;
138*3253Smec 			*walker = victim->pph_next;
139*3253Smec 			break;
140*3253Smec 		}
141*3253Smec 		walker = &((*walker)->pph_next);
142*3253Smec 	}
143*3253Smec 	rw_exit(&pph_rwlock);
144*3253Smec 	if (victim != NULL)
145*3253Smec 		kmem_free(victim, sizeof (struct physmem_proc_hash));
146*3253Smec }
147*3253Smec 
/*
 * Add a new entry to the hash for the given process to cache the
 * address ranges that it is working on.  If this is the first hash
 * item to be added for this process, we will create the head pointer
 * for this process.
 * Returns 0 on success, ERANGE when the physical address is already in the
 * hash.  Note that we add it to the hash as we have already called as_map
 * and thus the as_unmap call will try to free the vnode, which needs
 * to be found in the hash.
 */
int
physmem_add_hash(struct physmem_hash *php)
{
	int index;
	struct physmem_proc_hash *iterator;
	struct physmem_proc_hash *newp = NULL;	/* pre-allocated head entry */
	struct physmem_hash *temp;
	int ret = 0;

	index = PHYSMEM_HASH(curproc);

insert:
	rw_enter(&pph_rwlock, RW_WRITER);
	iterator = pph[index];
	while (iterator != NULL) {
		if (iterator->pph_proc == curproc) {
			/*
			 * check to make sure a single process does not try to
			 * map the same region twice.
			 */
			for (temp = iterator->pph_hash; temp != NULL;
			    temp = temp->ph_next) {
				/* reject any overlap of the two PA ranges */
				if ((php->ph_base_pa >= temp->ph_base_pa &&
				    php->ph_base_pa < temp->ph_base_pa +
				    temp->ph_seg_len) ||
				    (temp->ph_base_pa >= php->ph_base_pa &&
				    temp->ph_base_pa < php->ph_base_pa +
				    php->ph_seg_len)) {
					ret = ERANGE;
					break;
				}
			}
			if (ret == 0) {
				/* No overlap; link onto this proc's list */
				php->ph_next = iterator->pph_hash;
				iterator->pph_hash = php;
			}
			rw_exit(&pph_rwlock);
			/* Need to check for two threads in sync */
			if (newp != NULL)
				kmem_free(newp, sizeof (*newp));
			return (ret);
		}
		iterator = iterator->pph_next;
	}

	if (newp != NULL) {
		/*
		 * Second pass after the allocation below; install the new
		 * head entry for this process.
		 */
		newp->pph_proc = curproc;
		newp->pph_next = pph[index];
		newp->pph_hash = php;
		php->ph_next = NULL;
		pph[index] = newp;
		rw_exit(&pph_rwlock);
		return (0);
	}

	rw_exit(&pph_rwlock);
	/* Dropped the lock so we could use KM_SLEEP */
	newp = kmem_zalloc(sizeof (struct physmem_proc_hash), KM_SLEEP);
	goto insert;
}
218*3253Smec 
219*3253Smec /*
220*3253Smec  * Will return the pointer to the physmem_hash struct if the setup routine
221*3253Smec  * has previously been called for this memory.
222*3253Smec  * Returns NULL on failure.
223*3253Smec  */
224*3253Smec struct physmem_hash *
225*3253Smec physmem_get_hash(uint64_t req_paddr, size_t len, proc_t *procp)
226*3253Smec {
227*3253Smec 	int index;
228*3253Smec 	struct physmem_proc_hash *proc_hp;
229*3253Smec 	struct physmem_hash *php;
230*3253Smec 
231*3253Smec 	ASSERT(rw_lock_held(&pph_rwlock));
232*3253Smec 
233*3253Smec 	index = PHYSMEM_HASH(procp);
234*3253Smec 	proc_hp = pph[index];
235*3253Smec 	while (proc_hp != NULL) {
236*3253Smec 		if (proc_hp->pph_proc == procp) {
237*3253Smec 			php = proc_hp->pph_hash;
238*3253Smec 			while (php != NULL) {
239*3253Smec 				if ((req_paddr >= php->ph_base_pa) &&
240*3253Smec 				    (req_paddr + len <=
241*3253Smec 				    php->ph_base_pa + php->ph_seg_len)) {
242*3253Smec 					return (php);
243*3253Smec 				}
244*3253Smec 				php = php->ph_next;
245*3253Smec 			}
246*3253Smec 		}
247*3253Smec 		proc_hp = proc_hp->pph_next;
248*3253Smec 	}
249*3253Smec 	return (NULL);
250*3253Smec }
251*3253Smec 
252*3253Smec int
253*3253Smec physmem_validate_cookie(uint64_t p_cookie)
254*3253Smec {
255*3253Smec 	int index;
256*3253Smec 	struct physmem_proc_hash *proc_hp;
257*3253Smec 	struct physmem_hash *php;
258*3253Smec 
259*3253Smec 	ASSERT(rw_lock_held(&pph_rwlock));
260*3253Smec 
261*3253Smec 	index = PHYSMEM_HASH(curproc);
262*3253Smec 	proc_hp = pph[index];
263*3253Smec 	while (proc_hp != NULL) {
264*3253Smec 		if (proc_hp->pph_proc == curproc) {
265*3253Smec 			php = proc_hp->pph_hash;
266*3253Smec 			while (php != NULL) {
267*3253Smec 				if ((uint64_t)(uintptr_t)php == p_cookie) {
268*3253Smec 					return (1);
269*3253Smec 				}
270*3253Smec 				php = php->ph_next;
271*3253Smec 			}
272*3253Smec 		}
273*3253Smec 		proc_hp = proc_hp->pph_next;
274*3253Smec 	}
275*3253Smec 	return (0);
276*3253Smec }
277*3253Smec 
/*
 * Remove the given vnode from the pph hash.  If it exists in the hash the
 * process still has to be around as the vnode is obviously still around and
 * since it's a physmem vnode, it must be in the hash.
 * If it is not in the hash that must mean that the setup ioctl failed.
 * Return 0 in this instance, 1 if it is in the hash.
 */
int
physmem_remove_vnode_hash(vnode_t *vp)
{
	int index;
	struct physmem_proc_hash *proc_hp;
	struct physmem_hash **phpp;
	struct physmem_hash *victim;

	index = PHYSMEM_HASH(curproc);
	/* synchronize with the map routine */
	rw_enter(&pph_rwlock, RW_WRITER);
	proc_hp = pph[index];
	while (proc_hp != NULL) {
		if (proc_hp->pph_proc == curproc) {
			/* Unlink the segment whose vnode matches */
			phpp = &proc_hp->pph_hash;
			while (*phpp != NULL) {
				if ((*phpp)->ph_vnode == vp) {
					victim = *phpp;
					*phpp = victim->ph_next;

					/* Free outside the lock */
					rw_exit(&pph_rwlock);
					kmem_free(victim, sizeof (*victim));
					return (1);
				}
				phpp = &(*phpp)->ph_next;
			}
		}
		proc_hp = proc_hp->pph_next;
	}
	rw_exit(&pph_rwlock);

	/* not found */
	return (0);
}
319*3253Smec 
320*3253Smec int
321*3253Smec physmem_setup_vnops()
322*3253Smec {
323*3253Smec 	int error;
324*3253Smec 	char *name = "physmem";
325*3253Smec 	if (physmem_vnodeops != NULL)
326*3253Smec 		cmn_err(CE_PANIC, "physmem vnodeops already set\n");
327*3253Smec 	error = vn_make_ops(name, physmem_vnodeops_template, &physmem_vnodeops);
328*3253Smec 	if (error != 0) {
329*3253Smec 		cmn_err(CE_WARN, "physmem_setup_vnops: bad vnode ops template");
330*3253Smec 	}
331*3253Smec 	return (error);
332*3253Smec }
333*3253Smec 
/*
 * The guts of the PHYSMEM_SETUP ioctl.
 * Create a segment in the address space with the specified parameters.
 * If pspp->user_va is NULL, as_gap will be used to find an appropriate VA.
 * We do not do bounds checking on the requested physical addresses, if they
 * do not exist in the system, they will not be mappable.
 * Returns 0 on success with the following error codes on failure:
 *	ENOMEM - The VA range requested was already mapped if pspp->user_va is
 *		non-NULL or the system was unable to find enough VA space for
 *		the desired length if user_va was NULL.
 *	EINVAL - The requested PA, VA, or length was not PAGESIZE aligned.
 */
int
physmem_setup_addrs(struct physmem_setup_param *pspp)
{
	struct as *as = curproc->p_as;
	struct segvn_crargs vn_a;
	int ret = 0;
	uint64_t base_pa;
	size_t len;
	caddr_t uvaddr;
	struct vnode *vp;
	struct physmem_hash *php;

	ASSERT(pspp != NULL);
	base_pa = pspp->req_paddr;
	len = pspp->len;
	uvaddr = (caddr_t)(uintptr_t)pspp->user_va;

	/* Sanity checking */
	if (!IS_P2ALIGNED(base_pa, PAGESIZE))
		return (EINVAL);
	if (!IS_P2ALIGNED(len, PAGESIZE))
		return (EINVAL);
	if (uvaddr != NULL && !IS_P2ALIGNED(uvaddr, PAGESIZE))
		return (EINVAL);

	php = kmem_zalloc(sizeof (struct physmem_hash), KM_SLEEP);

	/* Need to bump vnode count so that the driver can not be unloaded */
	mutex_enter(&physmem_mutex);
	physmem_vnodecnt++;
	mutex_exit(&physmem_mutex);

	vp = vn_alloc(KM_SLEEP);
	ASSERT(vp != NULL);	/* SLEEP can't return NULL */
	vn_setops(vp, physmem_vnodeops);

	php->ph_vnode = vp;

	/* Segment offset is the PA so getpage can map offset back to PA */
	vn_a.vp = vp;
	vn_a.offset = (u_offset_t)base_pa;
	vn_a.type = MAP_SHARED;
	vn_a.prot = PROT_ALL;
	vn_a.maxprot = PROT_ALL;
	vn_a.flags = 0;
	vn_a.cred = NULL;
	vn_a.amp = NULL;
	vn_a.szc = 0;
	vn_a.lgrp_mem_policy_flags = 0;

	as_rangelock(as);
	if (uvaddr != NULL) {
		/* Caller chose the VA; verify the range is free */
		if (as_gap(as, len, &uvaddr, &len, AH_LO, NULL) == -1) {
			ret = ENOMEM;
fail:
			/* Common failure path: undo allocation and refcount */
			as_rangeunlock(as);
			vn_free(vp);
			kmem_free(php, sizeof (*php));
			mutex_enter(&physmem_mutex);
			physmem_vnodecnt--;
			mutex_exit(&physmem_mutex);
			return (ret);
		}
	} else {
		/* We pick the address for the user */
		map_addr(&uvaddr, len, 0, 1, 0);
		if (uvaddr == NULL) {
			ret = ENOMEM;
			goto fail;
		}
	}
	ret = as_map(as, uvaddr, len, segvn_create, &vn_a);

	as_rangeunlock(as);
	if (ret == 0) {
		/* Record the segment so later map/destroy calls find it */
		php->ph_base_pa = base_pa;
		php->ph_base_va = uvaddr;
		php->ph_seg_len = len;
		pspp->user_va = (uint64_t)(uintptr_t)uvaddr;
		pspp->cookie = (uint64_t)(uintptr_t)php;
		ret = physmem_add_hash(php);
		if (ret == 0)
			return (0);
		/* Duplicate range; tear down the mapping we just created */
		(void) as_unmap(as, uvaddr, len);
		return (ret);
	}

	goto fail;
	/*NOTREACHED*/
}
435*3253Smec 
/*
 * The guts of the PHYSMEM_MAP ioctl.
 * Map the given PA to the appropriate VA if PHYSMEM_SETUP ioctl has already
 * been called for this PA range.
 * Returns 0 on success with the following error codes on failure:
 *	EPERM - The requested page is long term locked, and thus repeated
 *		requests to allocate this page will likely fail.
 *	EAGAIN - The requested page could not be allocated, but it is believed
 *		that future attempts could succeed.
 *	ENOMEM - There was not enough free memory in the system to safely
 *		map the requested page.
 *	EINVAL - The requested paddr was not PAGESIZE aligned or the
 *		PHYSMEM_SETUP ioctl was not called for this page.
 *	ENOENT - The requested page was inside the kernel cage, and the
 *		PHYSMEM_CAGE flag was not set.
 *	EBUSY - The requested page is retired and the PHYSMEM_RETIRE flag
 *		was not set.
 */
static int
physmem_map_addrs(struct physmem_map_param *pmpp)
{
	caddr_t uvaddr;
	page_t *pp;
	uint64_t req_paddr;
	struct vnode *vp;
	int ret = 0;
	struct physmem_hash *php;
	uint_t flags = 0;

	ASSERT(pmpp != NULL);
	req_paddr = pmpp->req_paddr;

	if (!IS_P2ALIGNED(req_paddr, PAGESIZE))
		return (EINVAL);
	/* Find the vnode for this map request */
	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(req_paddr, PAGESIZE, curproc);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		return (EINVAL);
	}
	vp = php->ph_vnode;
	/* VA is the segment base plus the offset of the PA into the range */
	uvaddr = php->ph_base_va + (req_paddr - php->ph_base_pa);
	rw_exit(&pph_rwlock);

	pp = page_numtopp_nolock(btop((size_t)req_paddr));
	if (pp == NULL) {
		pmpp->ret_va = NULL;
		return (EPERM);
	}

	/*
	 * Check to see if page already mapped correctly.  This can happen
	 * when we failed to capture a page previously and it was captured
	 * asynchronously for us.  Return success in this case.
	 */
	if (pp->p_vnode == vp) {
		ASSERT(pp->p_offset == (u_offset_t)req_paddr);
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}

	/*
	 * physmem should be responsible for checking for cage
	 * and prom pages.
	 */
	if (pmpp->flags & PHYSMEM_CAGE)
		flags = CAPTURE_GET_CAGE;
	if (pmpp->flags & PHYSMEM_RETIRED)
		flags |= CAPTURE_GET_RETIRED;

	/* Capture hashes the page onto vp via map_page_proc() below */
	ret = page_trycapture(pp, 0, flags | CAPTURE_PHYSMEM, curproc);

	if (ret != 0) {
		pmpp->ret_va = NULL;
		return (ret);
	} else {
		pmpp->ret_va = (uint64_t)(uintptr_t)uvaddr;
		return (0);
	}
}
517*3253Smec 
/*
 * Map the given page into the process's address space if possible.
 * We actually only hash the page in on the correct vnode as the page
 * will be mapped via segvn_pagefault.
 * pp is the captured page, arg is the target proc_t, flags are the
 * CAPTURE_* flags from the page capture framework.
 * returns 0 on success
 * returns 1 if there is no need to map this page anymore (process exited)
 * returns -1 if we failed to map the page.
 */
int
map_page_proc(page_t *pp, void *arg, uint_t flags)
{
	struct vnode *vp;
	proc_t *procp = (proc_t *)arg;
	int ret;
	u_offset_t paddr = (u_offset_t)ptob(pp->p_pagenum);
	struct physmem_hash *php;

	ASSERT(pp != NULL);

	/*
	 * Check against availrmem to make sure that we're not low on memory.
	 * We check again here as ASYNC requests do not do this check elsewhere.
	 * We return 1 as we don't want the page to have the PR_CAPTURE bit
	 * set or be on the page capture hash.
	 */
	if (swapfs_minfree > availrmem + 1) {
		page_free(pp, 1);
		return (1);
	}

	/*
	 * If this is an asynchronous request for the current process,
	 * we can not map the page as it's possible that we are also in the
	 * process of unmapping the page which could result in a deadlock
	 * with the as lock.
	 */
	if ((flags & CAPTURE_ASYNC) && (curproc == procp)) {
		page_free(pp, 1);
		return (-1);
	}

	/* only return zeroed out pages */
	pagezero(pp, 0, PAGESIZE);

	rw_enter(&pph_rwlock, RW_READER);
	php = physmem_get_hash(paddr, PAGESIZE, procp);
	if (php == NULL) {
		rw_exit(&pph_rwlock);
		/*
		 * Free the page as there is no longer a valid outstanding
		 * request for this page.
		 */
		page_free(pp, 1);
		return (1);
	}

	vp = php->ph_vnode;

	/*
	 * We need to protect against a possible deadlock here where we own
	 * the vnode page hash mutex and want to acquire it again as there
	 * are locations in the code, where we unlock a page while holding
	 * the mutex which can lead to the page being captured and eventually
	 * end up here.
	 */
	if (mutex_owned(page_vnode_mutex(vp))) {
		rw_exit(&pph_rwlock);
		page_free(pp, 1);
		return (-1);
	}

	/* Hash the page onto the physmem vnode at its PA offset */
	ret = page_hashin(pp, vp, paddr, NULL);
	rw_exit(&pph_rwlock);
	if (ret == 0) {
		/* hashin failed (e.g. offset already occupied) */
		page_free(pp, 1);
		return (-1);
	}

	/* Keep a shared lock; segvn_pagefault will find it via page_lookup */
	page_downgrade(pp);

	/* Account for the long-term locked page */
	mutex_enter(&freemem_lock);
	availrmem--;
	mutex_exit(&freemem_lock);

	return (0);
}
604*3253Smec 
605*3253Smec /*
606*3253Smec  * The guts of the PHYSMEM_DESTROY ioctl.
607*3253Smec  * The cookie passed in will provide all of the information needed to
608*3253Smec  * free up the address space and physical memory associated with the
609*3253Smec  * corresponding PHSYMEM_SETUP ioctl.
610*3253Smec  * Returns 0 on success with the following error codes on failure:
611*3253Smec  *	EINVAL - The cookie supplied is not valid.
612*3253Smec  */
613*3253Smec int
614*3253Smec physmem_destroy_addrs(uint64_t p_cookie)
615*3253Smec {
616*3253Smec 	struct as *as = curproc->p_as;
617*3253Smec 	size_t len;
618*3253Smec 	caddr_t uvaddr;
619*3253Smec 
620*3253Smec 	rw_enter(&pph_rwlock, RW_READER);
621*3253Smec 	if (physmem_validate_cookie(p_cookie) == 0) {
622*3253Smec 		rw_exit(&pph_rwlock);
623*3253Smec 		return (EINVAL);
624*3253Smec 	}
625*3253Smec 
626*3253Smec 	len = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_seg_len;
627*3253Smec 	uvaddr = ((struct physmem_hash *)(uintptr_t)p_cookie)->ph_base_va;
628*3253Smec 	rw_exit(&pph_rwlock);
629*3253Smec 
630*3253Smec 	(void) as_unmap(as, uvaddr, len);
631*3253Smec 
632*3253Smec 	return (0);
633*3253Smec }
634*3253Smec 
635*3253Smec /*
636*3253Smec  * If the page has been hashed into the physmem vnode, then just look it up
637*3253Smec  * and return it via pl, otherwise return ENOMEM as the map ioctl has not
638*3253Smec  * succeeded on the given page.
639*3253Smec  */
640*3253Smec /*ARGSUSED*/
641*3253Smec static int
642*3253Smec physmem_getpage(struct vnode *vp, offset_t off, size_t len, uint_t *protp,
643*3253Smec     page_t *pl[], size_t plsz, struct seg *seg, caddr_t addr, enum seg_rw rw,
644*3253Smec     struct cred *cr)
645*3253Smec {
646*3253Smec 	page_t *pp;
647*3253Smec 
648*3253Smec 	ASSERT(len == PAGESIZE);
649*3253Smec 	ASSERT(AS_READ_HELD(seg->s_as, &seg->s_as->a_lock));
650*3253Smec 
651*3253Smec 	/*
652*3253Smec 	 * If the page is in the hash, then we successfully claimed this
653*3253Smec 	 * page earlier, so return it to the caller.
654*3253Smec 	 */
655*3253Smec 	pp = page_lookup(vp, off, SE_SHARED);
656*3253Smec 	if (pp != NULL) {
657*3253Smec 		pl[0] = pp;
658*3253Smec 		pl[1] = NULL;
659*3253Smec 		*protp = PROT_ALL;
660*3253Smec 		return (0);
661*3253Smec 	}
662*3253Smec 	return (ENOMEM);
663*3253Smec }
664*3253Smec 
665*3253Smec /*
666*3253Smec  * We can not allow a process mapping /dev/physmem pages to fork as there can
667*3253Smec  * only be a single mapping to a /dev/physmem page at a given time.  Thus, the
668*3253Smec  * return of EINVAL when we are not working on our own address space.
669*3253Smec  * Otherwise we return zero as this function is required for normal operation.
670*3253Smec  */
671*3253Smec /*ARGSUSED*/
672*3253Smec static int
673*3253Smec physmem_addmap(struct vnode *vp, offset_t off, struct as *as,
674*3253Smec     caddr_t addr, size_t len, uchar_t prot, uchar_t maxprot, uint_t flags,
675*3253Smec     struct cred *cred)
676*3253Smec {
677*3253Smec 	if (curproc->p_as != as) {
678*3253Smec 		return (EINVAL);
679*3253Smec 	}
680*3253Smec 	return (0);
681*3253Smec }
682*3253Smec 
/* Will always get called for removing a whole segment. */
/*ARGSUSED*/
static int
physmem_delmap(struct vnode *vp, offset_t off, struct as *as,
    caddr_t addr, size_t len, uint_t prot, uint_t maxprot, uint_t flags,
    struct cred *cred)
{
	/*
	 * Release our hold on the vnode so that the final VN_RELE will
	 * call physmem_inactive to clean things up.
	 */
	VN_RELE(vp);

	return (0);
}
698*3253Smec 
699*3253Smec /*
700*3253Smec  * Clean up all the pages belonging to this vnode and then free it.
701*3253Smec  */
702*3253Smec /*ARGSUSED*/
703*3253Smec static void
704*3253Smec physmem_inactive(vnode_t *vp, cred_t *crp)
705*3253Smec {
706*3253Smec 	page_t *pp;
707*3253Smec 
708*3253Smec 	/*
709*3253Smec 	 * Remove the vnode from the hash now, to prevent asynchronous
710*3253Smec 	 * attempts to map into this vnode.  This avoids a deadlock
711*3253Smec 	 * where two threads try to get into this logic at the same
712*3253Smec 	 * time and try to map the pages they are destroying into the
713*3253Smec 	 * other's address space.
714*3253Smec 	 * If it's not in the hash, just free it.
715*3253Smec 	 */
716*3253Smec 	if (physmem_remove_vnode_hash(vp) == 0) {
717*3253Smec 		ASSERT(vp->v_pages == NULL);
718*3253Smec 		vn_free(vp);
719*3253Smec 		physmem_remove_hash_proc();
720*3253Smec 		mutex_enter(&physmem_mutex);
721*3253Smec 		physmem_vnodecnt--;
722*3253Smec 		mutex_exit(&physmem_mutex);
723*3253Smec 		return;
724*3253Smec 	}
725*3253Smec 
726*3253Smec 	/*
727*3253Smec 	 * At this point in time, no other logic can be adding or removing
728*3253Smec 	 * pages from the vnode, otherwise the v_pages list could be inaccurate.
729*3253Smec 	 */
730*3253Smec 
731*3253Smec 	while ((pp = vp->v_pages) != NULL) {
732*3253Smec 		page_t *rpp;
733*3253Smec 		if (page_tryupgrade(pp)) {
734*3253Smec 			/*
735*3253Smec 			 * set lckcnt for page_destroy to do availrmem
736*3253Smec 			 * accounting
737*3253Smec 			 */
738*3253Smec 			pp->p_lckcnt = 1;
739*3253Smec 			page_destroy(pp, 0);
740*3253Smec 		} else {
741*3253Smec 			/* failure to lock should be transient */
742*3253Smec 			rpp = page_lookup(vp, ptob(pp->p_pagenum), SE_SHARED);
743*3253Smec 			if (rpp != pp) {
744*3253Smec 				page_unlock(rpp);
745*3253Smec 				continue;
746*3253Smec 			}
747*3253Smec 			page_unlock(pp);
748*3253Smec 		}
749*3253Smec 	}
750*3253Smec 	vn_free(vp);
751*3253Smec 	physmem_remove_hash_proc();
752*3253Smec 	mutex_enter(&physmem_mutex);
753*3253Smec 	physmem_vnodecnt--;
754*3253Smec 	mutex_exit(&physmem_mutex);
755*3253Smec }
756*3253Smec 
/*
 * ioctl entry point: dispatch PHYSMEM_SETUP / PHYSMEM_MAP / PHYSMEM_DESTROY
 * after copying the argument structure in from userland, and copy results
 * back out.  Returns EFAULT on a failed copyin/copyout, ENOTSUP for an
 * unknown command, otherwise the result of the underlying operation.
 */
/*ARGSUSED*/
static int
physmem_ioctl(dev_t dev, int cmd, intptr_t arg, int mode, cred_t *credp,
    int *rvalp)
{
	int ret;

	switch (cmd) {
	case PHYSMEM_SETUP:
		{
			struct physmem_setup_param psp;
			if (ddi_copyin((void *)arg, &psp,
			    sizeof (struct physmem_setup_param), 0))
				return (EFAULT);
			ret = physmem_setup_addrs(&psp);
			/* copy out even on failure: user_va/cookie results */
			if (ddi_copyout(&psp, (void *)arg, sizeof (psp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_MAP:
		{
			struct physmem_map_param pmp;
			if (ddi_copyin((void *)arg, &pmp,
			    sizeof (struct physmem_map_param), 0))
				return (EFAULT);
			ret = physmem_map_addrs(&pmp);
			/* copy out ret_va (NULL on failure) */
			if (ddi_copyout(&pmp, (void *)arg, sizeof (pmp), 0))
				return (EFAULT);
		}
		break;
	case PHYSMEM_DESTROY:
		{
			uint64_t cookie;
			if (ddi_copyin((void *)arg, &cookie,
			    sizeof (uint64_t), 0))
				return (EFAULT);
			ret = physmem_destroy_addrs(cookie);
		}
		break;
	default:
		return (ENOTSUP);
	}
	return (ret);
}
801*3253Smec 
802*3253Smec /*ARGSUSED*/
803*3253Smec static int
804*3253Smec physmem_open(dev_t *devp, int flag, int otyp, cred_t *credp)
805*3253Smec {
806*3253Smec 	int ret;
807*3253Smec 	static int msg_printed = 0;
808*3253Smec 
809*3253Smec 	if ((flag & (FWRITE | FREAD)) != (FWRITE | FREAD)) {
810*3253Smec 		return (EINVAL);
811*3253Smec 	}
812*3253Smec 
813*3253Smec 	/* need to make sure we have the right privileges */
814*3253Smec 	if ((ret = secpolicy_resource(credp)) != 0)
815*3253Smec 		return (ret);
816*3253Smec 	if ((ret = secpolicy_lock_memory(credp)) != 0)
817*3253Smec 		return (ret);
818*3253Smec 
819*3253Smec 	if (msg_printed == 0) {
820*3253Smec 		cmn_err(CE_NOTE, "!driver has been opened. This driver may "
821*3253Smec 		    "take out long term locks on pages which may impact "
822*3253Smec 		    "dynamic reconfiguration events");
823*3253Smec 		msg_printed = 1;
824*3253Smec 	}
825*3253Smec 
826*3253Smec 	return (0);
827*3253Smec }
828*3253Smec 
/*ARGSUSED*/
/*
 * Device close entry point.  No per-open state is maintained here, so
 * there is nothing to tear down; always succeeds.
 */
static int
physmem_close(dev_t dev, int flag, int otyp, cred_t *credp)
{
	return (0);
}
835*3253Smec 
836*3253Smec /*ARGSUSED*/
837*3253Smec static int
838*3253Smec physmem_getinfo(dev_info_t *dip, ddi_info_cmd_t infocmd,
839*3253Smec     void *arg, void **resultp)
840*3253Smec {
841*3253Smec 	switch (infocmd) {
842*3253Smec 	case DDI_INFO_DEVT2DEVINFO:
843*3253Smec 		*resultp = physmem_dip;
844*3253Smec 		return (DDI_SUCCESS);
845*3253Smec 
846*3253Smec 	case DDI_INFO_DEVT2INSTANCE:
847*3253Smec 		*resultp = (void *)(ulong_t)getminor((dev_t)arg);
848*3253Smec 		return (DDI_SUCCESS);
849*3253Smec 
850*3253Smec 	default:
851*3253Smec 		return (DDI_FAILURE);
852*3253Smec 	}
853*3253Smec }
854*3253Smec 
855*3253Smec static int
856*3253Smec physmem_attach(dev_info_t *dip, ddi_attach_cmd_t cmd)
857*3253Smec {
858*3253Smec 	int i;
859*3253Smec 
860*3253Smec 	if (cmd == DDI_RESUME) {
861*3253Smec 		return (DDI_SUCCESS);
862*3253Smec 	}
863*3253Smec 
864*3253Smec 	if (cmd != DDI_ATTACH)
865*3253Smec 		return (DDI_FAILURE);
866*3253Smec 
867*3253Smec 	if (ddi_create_minor_node(dip, ddi_get_name(dip), S_IFCHR,
868*3253Smec 	    ddi_get_instance(dip), DDI_PSEUDO, 0) != DDI_SUCCESS)
869*3253Smec 		return (DDI_FAILURE);
870*3253Smec 
871*3253Smec 	physmem_dip = dip;
872*3253Smec 
873*3253Smec 	/* Initialize driver specific data */
874*3253Smec 	if (physmem_setup_vnops()) {
875*3253Smec 		ddi_remove_minor_node(dip, ddi_get_name(dip));
876*3253Smec 		return (DDI_FAILURE);
877*3253Smec 	}
878*3253Smec 
879*3253Smec 	for (i = 0; i < PPH_SIZE; i++)
880*3253Smec 		pph[i] = NULL;
881*3253Smec 
882*3253Smec 	page_capture_register_callback(PC_PHYSMEM, 10000,
883*3253Smec 	    map_page_proc);
884*3253Smec 
885*3253Smec 	return (DDI_SUCCESS);
886*3253Smec }
887*3253Smec 
888*3253Smec static int
889*3253Smec physmem_detach(dev_info_t *dip, ddi_detach_cmd_t cmd)
890*3253Smec {
891*3253Smec 	int ret = DDI_SUCCESS;
892*3253Smec 
893*3253Smec 	if (cmd == DDI_SUSPEND) {
894*3253Smec 		return (DDI_SUCCESS);
895*3253Smec 	}
896*3253Smec 
897*3253Smec 	if (cmd != DDI_DETACH)
898*3253Smec 		return (DDI_FAILURE);
899*3253Smec 
900*3253Smec 	ASSERT(physmem_dip == dip);
901*3253Smec 
902*3253Smec 	mutex_enter(&physmem_mutex);
903*3253Smec 	if (physmem_vnodecnt == 0) {
904*3253Smec 		if (physmem_vnodeops != NULL) {
905*3253Smec 			vn_freevnodeops(physmem_vnodeops);
906*3253Smec 			physmem_vnodeops = NULL;
907*3253Smec 			page_capture_unregister_callback(PC_PHYSMEM);
908*3253Smec 		}
909*3253Smec 	} else {
910*3253Smec 		ret = EBUSY;
911*3253Smec 	}
912*3253Smec 	mutex_exit(&physmem_mutex);
913*3253Smec 	if (ret == DDI_SUCCESS)
914*3253Smec 		ddi_remove_minor_node(dip, ddi_get_name(dip));
915*3253Smec 	return (ret);
916*3253Smec }
917*3253Smec 
/*
 * Character device entry points.  Only open, close and ioctl are
 * implemented; mappings are established through the ioctl interface
 * rather than devmap/mmap/segmap.
 */
static struct cb_ops physmem_cb_ops = {
	physmem_open,	/* open */
	physmem_close,	/* close */
	nodev,		/* strategy */
	nodev,		/* print */
	nodev,		/* dump */
	nodev,		/* read */
	nodev,		/* write */
	physmem_ioctl,	/* ioctl */
	nodev,		/* devmap */
	nodev,		/* mmap */
	nodev,		/* segmap */
	nochpoll,	/* chpoll */
	ddi_prop_op,	/* prop_op */
	NULL,		/* cb_str */
	/*
	 * NOTE(review): D_DEVMAP is set although cb_devmap is nodev --
	 * confirm whether the flag is intentional.
	 */
	D_NEW | D_MP | D_DEVMAP,	/* cb_flag */
	CB_REV,		/* cb_rev */
	NULL,		/* async read */
	NULL		/* async write */
};
938*3253Smec 
/*
 * Autoconfiguration entry points for the physmem pseudo driver.
 */
static struct dev_ops physmem_ops = {
	DEVO_REV,		/* devo_rev */
	0,			/* devo_refcnt */
	physmem_getinfo,	/* devo_getinfo */
	nulldev,		/* devo_identify */
	nulldev,		/* devo_probe */
	physmem_attach,		/* devo_attach */
	physmem_detach,		/* devo_detach */
	nodev,			/* devo_reset */
	&physmem_cb_ops,	/* devo_cb_ops */
	NULL,			/* devo_bus_ops */
	NULL			/* devo_power */
};
952*3253Smec 
/*
 * Module linkage: identifies this module as a device driver.
 * (%I% is an SCCS keyword expanded at build time.)
 */
static struct modldrv modldrv = {
	&mod_driverops,		/* drv_modops */
	"physmem driver %I%",	/* drv_linkinfo */
	&physmem_ops		/* drv_dev_ops */
};
958*3253Smec 
static struct modlinkage modlinkage = {
	MODREV_1,	/* ml_rev */
	&modldrv,	/* ml_linkage[0]: the only linkage structure */
	NULL		/* NULL-terminates the linkage list */
};
964*3253Smec 
965*3253Smec int
966*3253Smec _init(void)
967*3253Smec {
968*3253Smec 	return (mod_install(&modlinkage));
969*3253Smec }
970*3253Smec 
971*3253Smec int
972*3253Smec _info(struct modinfo *modinfop)
973*3253Smec {
974*3253Smec 	return (mod_info(&modlinkage, modinfop));
975*3253Smec }
976*3253Smec 
977*3253Smec int
978*3253Smec _fini(void)
979*3253Smec {
980*3253Smec 	return (mod_remove(&modlinkage));
981*3253Smec }
982