xref: /onnv-gate/usr/src/uts/common/fs/hsfs/hsfs_vnops.c (revision 10440:ba48e0ae8d55)
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * Vnode operations for the High Sierra filesystem
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/fbuf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/dkio.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/page.h>
#include <sys/swap.h>
#include <sys/avl.h>
#include <sys/sunldi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>

/*
 * For struct modlinkage
 */
#include <sys/modctl.h>

#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>

#include <fs/fs_subr.h>

/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;
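
/*
 * Rough intent (a note summarizing how this tunable is used elsewhere in
 * hsfs): once a node has seen seq_contig_requests contiguous reads in a
 * row it is presumed to be a sequential consumer and read-ahead is armed
 * via the hs_prev_offset/hs_num_contig/hs_ra_bytes state used further below.
 */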

/*
 * This is the max number of taskq threads that will be created
 * if required. Since we are using a Dynamic TaskQ by default only
 * one thread is created initially.
 *
 * NOTE: In the usual hsfs use case this per fs instance number
 * of taskq threads should not place any undue load on a system.
 * Even on an unusual system with say 100 CDROM drives, 800 threads
 * will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If there is at
 * all a complaint of system load due to such an unusual case it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts with an nthreads of say 32.
 */
static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */

/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;

/*
 * Kmem caches for heavily used small allocations. Using these kmem
 * caches provides a factor of 3 reduction in system time and greatly
 * aids overall throughput esp. on SPARC.
 */
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;

/*
 * This tunable allows us to ignore inode numbers from rrip-1.12.
 * In this case, we fall back to our default inode algorithm.
 */
extern int use_rrip_inodes;

/*
 * Free behind logic from UFS to tame our thirst for
 * the page cache.
 * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
 * explanation.
 */
static int	freebehind = 1;
static int	smallfile = 0;
static int	cache_read_ahead = 0;
static u_offset_t smallfile64 = 32 * 1024;
#define	SMALLFILE1_D 1000
#define	SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0; /* when to recompute */
static uint_t smallfile1_d = SMALLFILE1_D;
static uint_t smallfile2_d = SMALLFILE2_D;
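
/*
 * Sketch of the free-behind policy as implemented in hsfs_read() below
 * (mirrors ufs): roughly once a second
 *	smallfile1 = ptob(freemem) / ncpus_online / smallfile1_d	(~0.1%)
 *	smallfile2 = ptob(freemem) / ncpus_online / smallfile2_d	(~10%)
 * both clamped to be at least smallfile64 (32K).  A sequential reader
 * releases its segmap slots with SM_FREE|SM_ASYNC, and additionally with
 * SM_DONTNEED once the offset passes smallfile2.
 */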

static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);

/* ARGSUSED */
static int
hsfs_fsync(vnode_t *cp,
	int syncflag,
	cred_t *cred,
	caller_context_t *ct)
{
	return (0);
}


/*ARGSUSED*/
static int
hsfs_read(struct vnode *vp,
	struct uio *uiop,
	int ioflag,
	struct cred *cred,
	struct caller_context *ct)
{
	caddr_t base;
	offset_t diff;
	int error;
	struct hsnode *hp;
	uint_t filesize;
	int dofree;

	hp = VTOH(vp);
	/*
	 * if vp is of type VDIR, make sure dirent
	 * is filled up with all info (because of ptbl)
	 */
	if (vp->v_type == VDIR) {
		if (hp->hs_dirent.ext_size == 0)
			hs_filldirent(vp, &hp->hs_dirent);
	}
	filesize = hp->hs_dirent.ext_size;

	/* Sanity checks. */
	if (uiop->uio_resid == 0 ||		/* No data wanted. */
	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
	    uiop->uio_loffset >= filesize)	/* Past EOF. */
		return (0);

	do {
		/*
		 * We want to ask for only the "right" amount of data.
		 * In this case that means:-
		 *
		 * We can't get data from beyond our EOF. If asked,
		 * we will give a short read.
		 *
		 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
		 * These buffers are always MAXBSIZE aligned.
		 * If our starting offset is not MAXBSIZE aligned,
		 * we can only ask for less than MAXBSIZE bytes.
		 *
		 * If our requested offset and length are such that
		 * they belong in different MAXBSIZE aligned slots
		 * then we'll be making more than one call on
		 * segmap_getmapflt.
		 *
		 * This diagram shows the variables we use and their
		 * relationships.
		 *
		 * |<-----MAXBSIZE----->|
		 * +--------------------------...+
		 * |.....mapon->|<--n-->|....*...|EOF
		 * +--------------------------...+
		 * uio_loffset->|
		 * uio_resid....|<---------->|
		 * diff.........|<-------------->|
		 *
		 * So, in this case our offset is not aligned
		 * and our request takes us outside of the
		 * MAXBSIZE window. We will break this up into
		 * two segmap_getmapflt calls.
		 */
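		/*
		 * A worked example (assuming the common 8K MAXBSIZE): with
		 * uio_loffset = 0x1900, uio_resid = 64K and a 100K file,
		 * mapon = 0x1900 & MAXBOFFSET = 0x1900, so this pass can map
		 * at most MAXBSIZE - mapon = 0x700 bytes; n = MIN(diff,
		 * MIN(0x700, 64K)) = 0x700 and the loop comes around again
		 * for the rest of the request.
		 */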
		size_t nbytes;
		offset_t mapon;
		size_t n;
		uint_t flags;

		mapon = uiop->uio_loffset & MAXBOFFSET;
		diff = filesize - uiop->uio_loffset;
		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
		n = MIN(diff, nbytes);
		if (n <= 0) {
			/* EOF or request satisfied. */
			return (0);
		}

		/*
		 * Freebehind computation taken from:
		 * usr/src/uts/common/fs/ufs/ufs_vnops.c
		 */
		if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
			uint64_t percpufreeb;
			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
			smallfile1 = percpufreeb / smallfile1_d;
			smallfile2 = percpufreeb / smallfile2_d;
			smallfile1 = MAX(smallfile1, smallfile);
			smallfile1 = MAX(smallfile1, smallfile64);
			smallfile2 = MAX(smallfile1, smallfile2);
			smallfile_update = drv_hztousec(ddi_get_lbolt())
			    + 1000000;
		}

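		/*
		 * Free behind only when this read picks up exactly where the
		 * previous one left off and read-ahead is armed, i.e. for a
		 * sequential consumer; such a reader gets its segmap slots
		 * released behind it instead of left to age in the page
		 * cache.
		 */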
		dofree = freebehind &&
		    hp->hs_prev_offset == uiop->uio_loffset &&
		    hp->hs_ra_bytes > 0;

		base = segmap_getmapflt(segkmap, vp,
		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);

		error = uiomove(base + mapon, n, UIO_READ, uiop);

		if (error == 0) {
			/*
			 * if read a whole block, or read to eof,
			 *  won't need this buffer again soon.
			 */
			if (n + mapon == MAXBSIZE ||
			    uiop->uio_loffset == filesize)
				flags = SM_DONTNEED;
			else
				flags = 0;

			if (dofree) {
				flags = SM_FREE | SM_ASYNC;
				if ((cache_read_ahead == 0) &&
				    uiop->uio_loffset > smallfile2)
					flags |=  SM_DONTNEED;
			}

			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (error == 0 && uiop->uio_resid > 0);

	return (error);
}

/*ARGSUSED2*/
static int
hsfs_getattr(
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;
	struct vfs *vfsp;
	struct hsfs *fsp;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	vfsp = vp->v_vfsp;

	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
		hs_filldirent(vp, &hp->hs_dirent);
	}
	vap->va_type = IFTOVT(hp->hs_dirent.mode);
	vap->va_mode = hp->hs_dirent.mode;
	vap->va_uid = hp->hs_dirent.uid;
	vap->va_gid = hp->hs_dirent.gid;

	vap->va_fsid = vfsp->vfs_dev;
	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
	vap->va_nlink = hp->hs_dirent.nlink;
	vap->va_size =	(offset_t)hp->hs_dirent.ext_size;

	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		vap->va_rdev = hp->hs_dirent.r_dev;
	else
		vap->va_rdev = 0;
	vap->va_blksize = vfsp->vfs_bsize;
	/* no. of blocks = no. of data blocks + no. of xar blocks */
	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
	vap->va_seq = hp->hs_seq;
	return (0);
}

/*ARGSUSED*/
static int
hsfs_readlink(struct vnode *vp,
	struct uio *uiop,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;

	if (vp->v_type != VLNK)
		return (EINVAL);

	hp = VTOH(vp);

	if (hp->hs_dirent.sym_link == (char *)NULL)
		return (ENOENT);

	return (uiomove(hp->hs_dirent.sym_link,
	    (size_t)MIN(hp->hs_dirent.ext_size,
	    uiop->uio_resid), UIO_READ, uiop));
}

/*ARGSUSED*/
static void
hsfs_inactive(struct vnode *vp,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfs *fsp;

	int nopage;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	/*
	 * Note: acquiring and holding v_lock for quite a while
	 * here serializes on the vnode; this is unfortunate, but
	 * likely not to overly impact performance, as the underlying
	 * device (CDROM drive) is quite slow.
	 */
	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
	mutex_enter(&hp->hs_contents_lock);
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 1) {
		panic("hsfs_inactive: v_count < 1");
		/*NOTREACHED*/
	}

	if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
		vp->v_count--;	/* release hold from vn_rele */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		rw_exit(&fsp->hsfs_hash_lock);
		return;
	}
	vp->v_count--;	/* release hold from vn_rele */
	if (vp->v_count == 0) {
		/*
		 * Free the hsnode.
		 * If there are no pages associated with the
		 * hsnode, give it back to the kmem_cache,
		 * else put at the end of this file system's
		 * internal free list.
		 */
		nopage = !vn_has_cached_data(vp);
		hp->hs_flags = 0;
		/*
		 * exit these locks now, since hs_freenode may
		 * kmem_free the hsnode and embedded vnode
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		hs_freenode(vp, fsp, nopage);
	} else {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
	}
	rw_exit(&fsp->hsfs_hash_lock);
}


/*ARGSUSED*/
static int
hsfs_lookup(
	struct vnode *dvp,
	char *nm,
	struct vnode **vpp,
	struct pathname *pnp,
	int flags,
	struct vnode *rdir,
	struct cred *cred,
	caller_context_t *ct,
	int *direntflags,
	pathname_t *realpnp)
{
	int error;
	int namelen = (int)strlen(nm);

	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * If we're looking for ourself, life is simple.
	 */
	if (namelen == 1 && *nm == '.') {
		if (error = hs_access(dvp, (mode_t)VEXEC, cred))
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	return (hs_dirlook(dvp, nm, namelen, vpp, cred));
}


/*ARGSUSED*/
static int
hsfs_readdir(
	struct vnode		*vp,
	struct uio		*uiop,
	struct cred		*cred,
	int			*eofp,
	caller_context_t	*ct,
	int			flags)
{
	struct hsnode	*dhp;
	struct hsfs	*fsp;
	struct hs_direntry hd;
	struct dirent64	*nd;
	int		error;
	uint_t		offset;		/* real offset in directory */
	uint_t		dirsiz;		/* real size of directory */
	uchar_t		*blkp;
	int		hdlen;		/* length of hs directory entry */
	long		ndlen;		/* length of dirent entry */
	int		bytes_wanted;
	size_t		bufsize;	/* size of dirent buffer */
	char		*outbuf;	/* ptr to dirent buffer */
	char		*dname;
	int		dnamelen;
	size_t		dname_size;
	struct fbuf	*fbp;
	uint_t		last_offset;	/* last index into current dir block */
	ino64_t		dirino;	/* temporary storage before storing in dirent */
	off_t		diroff;

	dhp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	if (dhp->hs_dirent.ext_size == 0)
		hs_filldirent(vp, &dhp->hs_dirent);
	dirsiz = dhp->hs_dirent.ext_size;
	if (uiop->uio_loffset >= dirsiz) {	/* at or beyond EOF */
		if (eofp)
			*eofp = 1;
		return (0);
	}
	ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
	offset = uiop->uio_loffset;

	dname_size = fsp->hsfs_namemax + 1;	/* 1 for the ending NUL */
	dname = kmem_alloc(dname_size, KM_SLEEP);
	bufsize = uiop->uio_resid + sizeof (struct dirent64);

	outbuf = kmem_alloc(bufsize, KM_SLEEP);
	nd = (struct dirent64 *)outbuf;

	while (offset < dirsiz) {
		bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));

		error = fbread(vp, (offset_t)(offset & MAXBMASK),
		    (unsigned int)bytes_wanted, S_READ, &fbp);
		if (error)
			goto done;

		blkp = (uchar_t *)fbp->fb_addr;
		last_offset = (offset & MAXBMASK) + fbp->fb_count;

#define	rel_offset(offset) ((offset) & MAXBOFFSET)	/* index into blkp */

		while (offset < last_offset) {
			/*
			 * Very similar validation code is found in
			 * process_dirblock(), hsfs_node.c.
			 * For an explanation, see there.
			 * It may make sense for the future to
			 * "consolidate" the code in hs_parsedir(),
			 * process_dirblock() and hsfs_readdir() into
			 * a single utility function.
			 */
			hdlen = (int)((uchar_t)
			    HDE_DIR_LEN(&blkp[rel_offset(offset)]));
			if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
			    offset + hdlen > last_offset) {
				/*
				 * advance to next sector boundary
				 */
				offset = roundup(offset + 1, HS_SECTOR_SIZE);
				if (hdlen)
					hs_log_bogus_disk_warning(fsp,
					    HSFS_ERR_TRAILING_JUNK, 0);

				continue;
			}

			bzero(&hd, sizeof (hd));

			/*
			 * Just ignore invalid directory entries.
			 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
			 */
			if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
			    &hd, dname, &dnamelen, last_offset - offset)) {
				/*
				 * Determine if there is enough room
				 */
				ndlen = (long)DIRENT64_RECLEN((dnamelen));

				if ((ndlen + ((char *)nd - outbuf)) >
				    uiop->uio_resid) {
					fbrelse(fbp, S_READ);
					goto done; /* output buffer full */
				}

				diroff = offset + hdlen;
				/*
				 * If the media carries rrip-v1.12 or newer,
				 * and we trust the inodes from the rrip data
				 * (use_rrip_inodes != 0), use that data. If the
				 * media has been created by a recent mkisofs
				 * version, we may trust all numbers in the
				 * starting extent number; otherwise, we cannot
				 * do this for zero sized files and symlinks,
				 * because if we did we'd end up mapping all of
				 * them to the same node. We use HS_DUMMY_INO
				 * in this case and make sure that we will not
				 * map all files to the same meta data.
				 */
				if (hd.inode != 0 && use_rrip_inodes) {
					dirino = hd.inode;
				} else if ((hd.ext_size == 0 ||
				    hd.sym_link != (char *)NULL) &&
				    (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
					dirino = HS_DUMMY_INO;
				} else {
					dirino = hd.ext_lbn;
				}

				/* strncpy(9f) will zero uninitialized bytes */

				ASSERT(strlen(dname) + 1 <=
				    DIRENT64_NAMELEN(ndlen));
				(void) strncpy(nd->d_name, dname,
				    DIRENT64_NAMELEN(ndlen));
				nd->d_reclen = (ushort_t)ndlen;
				nd->d_off = (offset_t)diroff;
				nd->d_ino = dirino;
				nd = (struct dirent64 *)((char *)nd + ndlen);

				/*
				 * free up space allocated for symlink
				 */
				if (hd.sym_link != (char *)NULL) {
					kmem_free(hd.sym_link,
					    (size_t)(hd.ext_size+1));
					hd.sym_link = (char *)NULL;
				}
			}
			offset += hdlen;
		}
		fbrelse(fbp, S_READ);
	}

	/*
	 * Got here for one of the following reasons:
	 *	1) outbuf is full (error == 0)
	 *	2) end of directory reached (error == 0)
	 *	3) error reading directory sector (error != 0)
	 *	4) directory entry crosses sector boundary (error == 0)
	 *
	 * If any directory entries have been copied, don't report
	 * case 4.  Instead, return the valid directory entries.
	 *
	 * If no entries have been copied, report the error.
	 * If case 4, this will be indistinguishable from EOF.
	 */
done:
	ndlen = ((char *)nd - outbuf);
	if (ndlen != 0) {
		error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
		uiop->uio_loffset = offset;
	}
	kmem_free(dname, dname_size);
	kmem_free(outbuf, bufsize);
	if (eofp && error == 0)
		*eofp = (uiop->uio_loffset >= dirsiz);
	return (error);
}

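/*
 * Build a file identifier (as used e.g. for NFS file handles) for this
 * node.  As the code below shows, the handle records the directory-entry
 * location (hf_dir_lbn/hf_dir_off) together with the inode number, so the
 * node can be re-found after its vnode has been recycled.
 */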
/*ARGSUSED2*/
static int
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfid *fid;

	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
		return (ENOSPC);
	}

	fid = (struct hsfid *)fidp;
	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
	hp = VTOH(vp);
	mutex_enter(&hp->hs_contents_lock);
	fid->hf_dir_lbn = hp->hs_dir_lbn;
	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
	fid->hf_ino = hp->hs_nodeid;
	mutex_exit(&hp->hs_contents_lock);
	return (0);
}

/*ARGSUSED*/
static int
hsfs_open(struct vnode **vpp,
	int flag,
	struct cred *cred,
	caller_context_t *ct)
{
	return (0);
}

/*ARGSUSED*/
static int
hsfs_close(
	struct vnode *vp,
	int flag,
	int count,
	offset_t offset,
	struct cred *cred,
	caller_context_t *ct)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED2*/
static int
hsfs_access(struct vnode *vp,
	int mode,
	int flags,
	cred_t *cred,
	caller_context_t *ct)
{
	return (hs_access(vp, (mode_t)mode, cred));
}

/*
 * the seek time of a CD-ROM is very slow, and data transfer
 * rate is even worse (max. 150K per sec).  The design
 * decision is to reduce access to cd-rom as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * UFS style of read ahead one block at a time is not appropriate,
 * and is not supported
 */

/*
 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage;	/* no. of times we lost original page */

/*
 * Used to prevent biodone() from releasing buf resources that
 * we didn't allocate in quite the usual way.
 */
/*ARGSUSED*/
int
hsfs_iodone(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}


/*
 * The taskq thread that invokes the scheduling function to ensure
 * that all readaheads are complete, then cleans up the associated
 * memory and releases the page lock.
 */
void
hsfs_ra_task(void *arg)
{
	struct hio_info *info = arg;
	uint_t count;
	struct buf *wbuf;

	ASSERT(info->pp != NULL);

	for (count = 0; count < info->bufsused; count++) {
		wbuf = &(info->bufs[count]);

		DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
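		/*
		 * Poll the per-buf semaphore; while it has not been posted,
		 * keep driving the scheduler ourselves so the queued
		 * readahead I/O actually makes progress even if no other
		 * thread is issuing requests.
		 */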
		while (sema_tryp(&(info->sema[count])) == 0) {
			if (hsched_invoke_strategy(info->fsp)) {
				sema_p(&(info->sema[count]));
				break;
			}
		}
		sema_destroy(&(info->sema[count]));
		DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
		biofini(&(info->bufs[count]));
	}
	for (count = 0; count < info->bufsused; count++) {
		if (info->vas[count] != NULL) {
			ppmapout(info->vas[count]);
		}
	}
	kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
	kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
	kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));

	pvn_read_done(info->pp, 0);
	kmem_cache_free(hio_info_cache, info);
}

/*
 * Submit asynchronous readahead requests to the I/O scheduler
 * depending on the number of pages to read ahead. These requests
 * are asynchronous to the calling thread but I/O requests issued
 * subsequently by other threads with higher LBNs must wait for
 * these readaheads to complete since we have a single ordered
 * I/O pipeline. Thus these readaheads are semi-asynchronous.
 * A TaskQ handles waiting for the readaheads to complete.
 *
 * This function is mostly a copy of hsfs_getapage but somewhat
 * simpler. A readahead request is aborted if page allocation
 * fails.
 */
/*ARGSUSED*/
static int
hsfs_getpage_ra(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	struct hsnode *hp,
	struct hsfs *fsp,
	int	xarsiz,
	offset_t	bof,
	int	chunk_lbn_count,
	int	chunk_data_bytes)
{
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t	*fio_done;
	struct hio_info *info;
	size_t len;

	ASSERT(fsp->hqueue != NULL);

	if (addr >= seg->s_base + seg->s_size) {
		return (-1);
	}

	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	if (off >= filsiz)
		return (0);

	extension = 0;
	pp = NULL;

	extension += hp->hs_ra_bytes;

	/*
	 * Some CD writers (e.g. Kodak Photo CD writers)
	 * create CDs in TAO mode and reserve tracks that
	 * are not completely written. Some sectors remain
	 * unreadable for this reason and give I/O errors.
	 * Also, there's no point in reading sectors
	 * we'll never look at.  So, if we're asked to go
	 * beyond the end of a file, truncate to the length
	 * of that file.
	 *
	 * Additionally, this behaviour is required by section
	 * 6.4.5 of ISO 9660:1988(E).
	 */
	len = MIN(extension ? extension : PAGESIZE, filsiz - off);

	/* A little paranoia */
	if (len <= 0)
		return (-1);

	/*
	 * After all that, make sure we're asking for things in units
	 * that bdev_strategy() will understand (see bug 4202551).
	 */
	len = roundup(len, DEV_BSIZE);

	pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
	    &io_len_tmp, off, len, 1);

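	/*
	 * No memory for the kluster: drop the readahead state and give up,
	 * mirroring the handling in hsfs_getapage() below.
	 */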
	if (pp == NULL) {
		hp->hs_num_contig = 0;
		hp->hs_ra_bytes = 0;
		hp->hs_prev_offset = 0;
		return (-1);
	}

	io_off = (uint_t)io_off_tmp;
	io_len = (uint_t)io_len_tmp;

	/* check for truncation */
	/*
	 * xxx Clean up and return EIO instead?
	 * xxx Ought to go to u_offset_t for everything, but we
	 * xxx call lots of things that want uint_t arguments.
	 */
	ASSERT(io_off == io_off_tmp);

	/*
	 * get enough buffers for worst-case scenario
	 * (i.e., no coalescing possible).
	 */
	bufcnt = (len + secsize - 1) / secsize;
	bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
	vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

	/*
	 * Allocate an array of semaphores since we are doing I/O
	 * scheduling.
	 */
	fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);

	/*
	 * If our filesize is not an integer multiple of PAGESIZE,
	 * we zero that part of the last page that's between EOF and
	 * the PAGESIZE boundary.
	 */
	xlen = io_len & PAGEOFFSET;
	if (xlen != 0)
		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

	DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);

	va = NULL;
	lastp = NULL;
	searchp = pp;
	io_end = io_off + io_len;
	for (count = 0, byte_offset = io_off;
	    byte_offset < io_end;
	    count++) {
		ASSERT(count < bufcnt);

		bioinit(&bufs[count]);
		bufs[count].b_edev = devvp->v_rdev;
		bufs[count].b_dev = cmpdev(devvp->v_rdev);
		bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
		bufs[count].b_iodone = hsfs_iodone;
		bufs[count].b_vp = vp;
		bufs[count].b_file = vp;

		/* Compute disk address for interleaving. */

		/* considered without skips */
		which_chunk_lbn = byte_offset / chunk_data_bytes;

		/* factor in skips */
		offset_lbn = which_chunk_lbn * chunk_lbn_count;

		/* convert to physical byte offset for lbn */
		offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

		/* don't forget offset into lbn */
		offset_extra = byte_offset % chunk_data_bytes;

		/* get virtual block number for driver */
		driver_block = lbtodb(bof + xarsiz
		    + offset_bytes + offset_extra);

		if (lastp != searchp) {
			/* this branch taken first time through loop */
			va = vas[count] = ppmapin(searchp, PROT_WRITE,
			    (caddr_t)-1);
			/* ppmapin() guarantees not to return NULL */
		} else {
			vas[count] = NULL;
		}

		bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
		bufs[count].b_offset =
		    (offset_t)(byte_offset - io_off + off);

		/*
		 * We specifically use the b_lblkno member here
		 * as even in the 32 bit world driver_block can
		 * get very large in line with the ISO9660 spec.
		 */

		bufs[count].b_lblkno = driver_block;

		remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
		    - byte_offset;

		/*
		 * remaining_bytes can't be zero, as we derived
		 * which_chunk_lbn directly from byte_offset.
		 */
		if ((remaining_bytes + byte_offset) < (off + len)) {
			/* coalesce-read the rest of the chunk */
			bufs[count].b_bcount = remaining_bytes;
		} else {
			/* get the final bits */
			bufs[count].b_bcount = off + len - byte_offset;
		}

		remainder = PAGESIZE - (byte_offset % PAGESIZE);
		if (bufs[count].b_bcount > remainder) {
			bufs[count].b_bcount = remainder;
		}

		bufs[count].b_bufsize = bufs[count].b_bcount;
		if (((offset_t)byte_offset + bufs[count].b_bcount) >
		    HS_MAXFILEOFF) {
			break;
		}
		byte_offset += bufs[count].b_bcount;

		/*
		 * We are scheduling I/O so we need to enqueue
		 * requests rather than calling bdev_strategy
		 * here. A later invocation of the scheduling
		 * function will take care of doing the actual
		 * I/O as it selects requests from the queue as
		 * per the scheduling logic.
		 */
		struct hio *hsio = kmem_cache_alloc(hio_cache,
		    KM_SLEEP);

		sema_init(&fio_done[count], 0, NULL,
		    SEMA_DEFAULT, NULL);
		hsio->bp = &bufs[count];
		hsio->sema = &fio_done[count];
		hsio->io_lblkno = bufs[count].b_lblkno;
		hsio->nblocks = howmany(hsio->bp->b_bcount,
		    DEV_BSIZE);

		/* used for deadline */
		hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());

		/* for I/O coalescing */
		hsio->contig_chain = NULL;
		hsched_enqueue_io(fsp, hsio, 1);

		lwp_stat_update(LWP_STAT_INBLK, 1);
		lastp = searchp;
		if ((remainder - bufs[count].b_bcount) < 1) {
			searchp = searchp->p_next;
		}
	}

	bufsused = count;
	info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
	info->bufs = bufs;
	info->vas = vas;
	info->sema = fio_done;
	info->bufsused = bufsused;
	info->bufcnt = bufcnt;
	info->fsp = fsp;
	info->pp = pp;

	(void) taskq_dispatch(fsp->hqueue->ra_task,
	    hsfs_ra_task, info, KM_SLEEP);
	/*
	 * The I/O locked pages are unlocked in our taskq thread.
	 */
	return (0);
}

/*
 * Each file may have a different interleaving on disk.  This makes
 * things somewhat interesting.  The gist is that there are some
 * number of contiguous data sectors, followed by some other number
 * of contiguous skip sectors.  The sum of those two sets of sectors
 * defines the interleave size.  Unfortunately, it means that we generally
 * can't simply read N sectors starting at a given offset to satisfy
 * any given request.
 *
 * What we do is get the relevant memory pages via pvn_read_kluster(),
 * then stride through the interleaves, setting up a buf for each
 * sector that needs to be brought in.  Instead of kmem_alloc'ing
 * space for the sectors, though, we just point at the appropriate
 * spot in the relevant page for each of them.  This saves us a bunch
 * of copying.
 *
 * NOTICE: The code below in hsfs_getapage is mostly the same as the code
 *         in hsfs_getpage_ra above (with some omissions). If you are
 *         making any change to this function, please also look at
 *         hsfs_getpage_ra.
 */
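/*
 * A rough worked example of the arithmetic used below: with an interleave
 * of intlf_sz = 2 data lbns followed by intlf_sk = 1 skip lbn,
 * chunk_lbn_count is 3 and chunk_data_bytes covers 2 lbns, so a byte
 * offset falling in the second data chunk (which_chunk_lbn == 1) starts
 * at lbn 1 * 3 = 3 relative to the extent, plus offset_extra within that
 * chunk.
 */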
/*ARGSUSED*/
static int
hsfs_getapage(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	struct page *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	struct hsnode *hp;
	struct hsfs *fsp;
	int	err;
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	page_t	*pagefound;
	offset_t	bof;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	int	chunk_lbn_count;
	int	chunk_data_bytes;
	int	xarsiz;
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t *fio_done;
	int	calcdone;

	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it.  If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	/* disk addr for start of file */
	bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);

	/* xarsiz byte must be skipped for data */
	xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;

	/* how many logical blocks in an interleave (data+skip) */
	chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;

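	/*
	 * A zero here means the directory entry recorded no interleave
	 * (intlf_sz == intlf_sk == 0); treat the file as one contiguous
	 * run and fall into the no-interleaving case below.
	 */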
	if (chunk_lbn_count == 0) {
		chunk_lbn_count = 1;
	}

	/*
	 * Convert interleaving size into bytes.  The zero case
	 * (no interleaving) optimization is handled as a side-
	 * effect of the read-ahead logic.
	 */
	if (hp->hs_dirent.intlf_sz == 0) {
		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
		/*
		 * Optimization: If our pagesize is a multiple of LBN
		 * bytes, we can avoid breaking up a page into individual
		 * lbn-sized requests.
		 */
		if (PAGESIZE % chunk_data_bytes == 0) {
			chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
			chunk_data_bytes = PAGESIZE;
		}
	} else {
		chunk_data_bytes =
		    LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
	}

reread:
	err = 0;
	pagefound = 0;
	calcdone = 0;

	/*
	 * Do some read-ahead.  This mostly saves us a bit of
	 * system cpu time more than anything else when doing
	 * sequential reads.  At some point, could do the
	 * read-ahead asynchronously which might gain us something
	 * on wall time, but it seems unlikely....
	 *
	 * We do the easy case here, which is to read through
	 * the end of the chunk, minus whatever's at the end that
	 * won't exactly fill a page.
	 */
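	/*
	 * extension is the size of the read window: when read-ahead is armed
	 * and the chunk size differs from the page size, read through the
	 * end of the current chunk trimmed to a page boundary; otherwise
	 * just round the request up to whole pages.  hs_ra_bytes is added on
	 * top later, once we know a disk read is really needed.
	 */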
11975312Smg147109 	if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
11985312Smg147109 		which_chunk_lbn = (off + len) / chunk_data_bytes;
11995312Smg147109 		extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
12005312Smg147109 		extension -= (extension % PAGESIZE);
1201206Speterte 	} else {
12025312Smg147109 		extension = roundup(len, PAGESIZE);
12030Sstevel@tonic-gate 	}
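	/*
	 * Worked example (editor's note; assumed values): with
	 * chunk_data_bytes == 10240, PAGESIZE == 4096, off == 4096 and
	 * len == 4096, we get which_chunk_lbn == 0 and extension ==
	 * 10240 - 4096 == 6144, which is then trimmed to 4096
	 * (6144 - 6144 % 4096), i.e. we read through the rest of the
	 * chunk minus the partial page at its tail.
	 */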
12040Sstevel@tonic-gate 
12055312Smg147109 	atomic_inc_64(&fsp->total_pages_requested);
12060Sstevel@tonic-gate 
12070Sstevel@tonic-gate 	pp = NULL;
12080Sstevel@tonic-gate again:
12090Sstevel@tonic-gate 	/* search for page in buffer */
12100Sstevel@tonic-gate 	if ((pagefound = page_exists(vp, off)) == 0) {
12110Sstevel@tonic-gate 		/*
12120Sstevel@tonic-gate 		 * Need to really do disk IO to get the page.
12130Sstevel@tonic-gate 		 */
12145312Smg147109 		if (!calcdone) {
12155312Smg147109 			extension += hp->hs_ra_bytes;
12165312Smg147109 
12175312Smg147109 			/*
12185312Smg147109 			 * Some cd writers don't write sectors that aren't
12195312Smg147109 			 * used. Also, there's no point in reading sectors
12205312Smg147109 			 * we'll never look at.  So, if we're asked to go
12215312Smg147109 			 * beyond the end of a file, truncate to the length
12225312Smg147109 			 * of that file.
12235312Smg147109 			 *
12245312Smg147109 			 * Additionally, this behaviour is required by section
12255312Smg147109 			 * 6.4.5 of ISO 9660:1988(E).
12265312Smg147109 			 */
12275312Smg147109 			len = MIN(extension ? extension : PAGESIZE,
12285312Smg147109 			    filsiz - off);
12295312Smg147109 
12305312Smg147109 			/* A little paranoia. */
12315312Smg147109 			ASSERT(len > 0);
12325312Smg147109 
12335312Smg147109 			/*
12345312Smg147109 			 * After all that, make sure we're asking for things
12355312Smg147109 			 * in units that bdev_strategy() will understand
12365312Smg147109 			 * (see bug 4202551).
12375312Smg147109 			 */
12385312Smg147109 			len = roundup(len, DEV_BSIZE);
12395312Smg147109 			calcdone = 1;
12405312Smg147109 		}
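		/*
		 * Worked example (editor's note; illustrative numbers):
		 * with filsiz == 10000, off == 8192 and extension == 8192,
		 * len becomes MIN(8192, 10000 - 8192) == 1808, so we never
		 * ask for sectors the file doesn't occupy, and is then
		 * rounded up to 2048 so the request is in DEV_BSIZE units
		 * for bdev_strategy().
		 */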
12415312Smg147109 
12420Sstevel@tonic-gate 		pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
12430Sstevel@tonic-gate 		    &io_len_tmp, off, len, 0);
12440Sstevel@tonic-gate 
12455312Smg147109 		if (pp == NULL) {
12465312Smg147109 			/*
12475312Smg147109 			 * Pressure on memory, roll back readahead
12485312Smg147109 			 */
12495312Smg147109 			hp->hs_num_contig = 0;
12505312Smg147109 			hp->hs_ra_bytes = 0;
12515312Smg147109 			hp->hs_prev_offset = 0;
12520Sstevel@tonic-gate 			goto again;
12535312Smg147109 		}
12540Sstevel@tonic-gate 
12550Sstevel@tonic-gate 		io_off = (uint_t)io_off_tmp;
12560Sstevel@tonic-gate 		io_len = (uint_t)io_len_tmp;
12570Sstevel@tonic-gate 
12580Sstevel@tonic-gate 		/* check for truncation */
12590Sstevel@tonic-gate 		/*
12600Sstevel@tonic-gate 		 * xxx Clean up and return EIO instead?
12610Sstevel@tonic-gate 		 * xxx Ought to go to u_offset_t for everything, but we
12620Sstevel@tonic-gate 		 * xxx call lots of things that want uint_t arguments.
12630Sstevel@tonic-gate 		 */
12640Sstevel@tonic-gate 		ASSERT(io_off == io_off_tmp);
12650Sstevel@tonic-gate 
12660Sstevel@tonic-gate 		/*
12670Sstevel@tonic-gate 		 * get enough buffers for worst-case scenario
12680Sstevel@tonic-gate 		 * (i.e., no coalescing possible).
12690Sstevel@tonic-gate 		 */
12700Sstevel@tonic-gate 		bufcnt = (len + secsize - 1) / secsize;
12710Sstevel@tonic-gate 		bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
12720Sstevel@tonic-gate 		vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);
12735312Smg147109 
12745312Smg147109 		/*
12755312Smg147109 		 * Allocate an array of semaphores if we are doing I/O
12765312Smg147109 		 * scheduling.
12775312Smg147109 		 */
12785312Smg147109 		if (fsp->hqueue != NULL)
12795312Smg147109 			fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
12805312Smg147109 			    KM_SLEEP);
12810Sstevel@tonic-gate 		for (count = 0; count < bufcnt; count++) {
12825312Smg147109 			bioinit(&bufs[count]);
12830Sstevel@tonic-gate 			bufs[count].b_edev = devvp->v_rdev;
12840Sstevel@tonic-gate 			bufs[count].b_dev = cmpdev(devvp->v_rdev);
12850Sstevel@tonic-gate 			bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
12860Sstevel@tonic-gate 			bufs[count].b_iodone = hsfs_iodone;
12870Sstevel@tonic-gate 			bufs[count].b_vp = vp;
12880Sstevel@tonic-gate 			bufs[count].b_file = vp;
12890Sstevel@tonic-gate 		}
12900Sstevel@tonic-gate 
1291206Speterte 		/*
1292206Speterte 		 * If our filesize is not an integer multiple of PAGESIZE,
1293206Speterte 		 * we zero that part of the last page that's between EOF and
1294206Speterte 		 * the PAGESIZE boundary.
1295206Speterte 		 */
12960Sstevel@tonic-gate 		xlen = io_len & PAGEOFFSET;
12970Sstevel@tonic-gate 		if (xlen != 0)
12980Sstevel@tonic-gate 			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);
12990Sstevel@tonic-gate 
13000Sstevel@tonic-gate 		va = NULL;
13010Sstevel@tonic-gate 		lastp = NULL;
13020Sstevel@tonic-gate 		searchp = pp;
13030Sstevel@tonic-gate 		io_end = io_off + io_len;
13040Sstevel@tonic-gate 		for (count = 0, byte_offset = io_off;
13054866Sfrankho 		    byte_offset < io_end; count++) {
13060Sstevel@tonic-gate 			ASSERT(count < bufcnt);
13070Sstevel@tonic-gate 
13080Sstevel@tonic-gate 			/* Compute disk address for interleaving. */
13090Sstevel@tonic-gate 
13100Sstevel@tonic-gate 			/* considered without skips */
13110Sstevel@tonic-gate 			which_chunk_lbn = byte_offset / chunk_data_bytes;
13120Sstevel@tonic-gate 
13130Sstevel@tonic-gate 			/* factor in skips */
13140Sstevel@tonic-gate 			offset_lbn = which_chunk_lbn * chunk_lbn_count;
13150Sstevel@tonic-gate 
13160Sstevel@tonic-gate 			/* convert to physical byte offset for lbn */
13170Sstevel@tonic-gate 			offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);
13180Sstevel@tonic-gate 
13190Sstevel@tonic-gate 			/* don't forget offset into lbn */
13200Sstevel@tonic-gate 			offset_extra = byte_offset % chunk_data_bytes;
13210Sstevel@tonic-gate 
13220Sstevel@tonic-gate 			/* get virtual block number for driver */
13234866Sfrankho 			driver_block =
13244866Sfrankho 			    lbtodb(bof + xarsiz + offset_bytes + offset_extra);
13250Sstevel@tonic-gate 
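			/*
			 * Worked example (editor's note; assumed values):
			 * with 2048-byte LBNs, chunk_data_bytes == 8192,
			 * chunk_lbn_count == 5, bof == 65536, xarsiz == 0
			 * and byte_offset == 12288: which_chunk_lbn == 1,
			 * offset_lbn == 5, offset_bytes == 10240,
			 * offset_extra == 4096, so driver_block ==
			 * lbtodb(65536 + 10240 + 4096) == 79872 / 512 == 156.
			 */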
13260Sstevel@tonic-gate 			if (lastp != searchp) {
13270Sstevel@tonic-gate 				/* this branch taken first time through loop */
13284866Sfrankho 				va = vas[count] =
13294866Sfrankho 				    ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
13300Sstevel@tonic-gate 				/* ppmapin() guarantees not to return NULL */
13310Sstevel@tonic-gate 			} else {
13320Sstevel@tonic-gate 				vas[count] = NULL;
13330Sstevel@tonic-gate 			}
13340Sstevel@tonic-gate 
13350Sstevel@tonic-gate 			bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
13360Sstevel@tonic-gate 			bufs[count].b_offset =
13370Sstevel@tonic-gate 			    (offset_t)(byte_offset - io_off + off);
13380Sstevel@tonic-gate 
13390Sstevel@tonic-gate 			/*
13400Sstevel@tonic-gate 			 * We specifically use the b_lblkno member here
13410Sstevel@tonic-gate 			 * as even in the 32 bit world driver_block can
13420Sstevel@tonic-gate 			 * get very large in line with the ISO9660 spec.
13430Sstevel@tonic-gate 			 */
13440Sstevel@tonic-gate 
13450Sstevel@tonic-gate 			bufs[count].b_lblkno = driver_block;
13460Sstevel@tonic-gate 
13474866Sfrankho 			remaining_bytes =
13484866Sfrankho 			    ((which_chunk_lbn + 1) * chunk_data_bytes)
13494866Sfrankho 			    - byte_offset;
13500Sstevel@tonic-gate 
13510Sstevel@tonic-gate 			/*
13520Sstevel@tonic-gate 			 * remaining_bytes can't be zero, as we derived
13530Sstevel@tonic-gate 			 * which_chunk_lbn directly from byte_offset.
13540Sstevel@tonic-gate 			 */
13551349Speterte 			if ((remaining_bytes + byte_offset) < (off + len)) {
13560Sstevel@tonic-gate 				/* coalesce-read the rest of the chunk */
13570Sstevel@tonic-gate 				bufs[count].b_bcount = remaining_bytes;
13580Sstevel@tonic-gate 			} else {
13590Sstevel@tonic-gate 				/* get the final bits */
13600Sstevel@tonic-gate 				bufs[count].b_bcount = off + len - byte_offset;
13610Sstevel@tonic-gate 			}
13620Sstevel@tonic-gate 
13630Sstevel@tonic-gate 			/*
13640Sstevel@tonic-gate 			 * It would be nice to do multiple pages'
13650Sstevel@tonic-gate 			 * worth at once here when the opportunity
13660Sstevel@tonic-gate 			 * arises, as that has been shown to improve
13670Sstevel@tonic-gate 			 * our wall time.  However, to do that
13680Sstevel@tonic-gate 			 * requires that we use the pageio subsystem,
13690Sstevel@tonic-gate 			 * which doesn't mix well with what we're
13700Sstevel@tonic-gate 			 * already using here.  We can't use pageio
13710Sstevel@tonic-gate 			 * all the time, because that subsystem
13720Sstevel@tonic-gate 			 * assumes that a page is stored in N
13730Sstevel@tonic-gate 			 * contiguous blocks on the device.
13740Sstevel@tonic-gate 			 * Interleaving violates that assumption.
13755312Smg147109 			 *
13765312Smg147109 			 * Update: This is now not so big a problem
13775312Smg147109 			 * because of the I/O scheduler sitting below
13785312Smg147109 			 * that can re-order and coalesce I/O requests.
13790Sstevel@tonic-gate 			 */
13800Sstevel@tonic-gate 
13810Sstevel@tonic-gate 			remainder = PAGESIZE - (byte_offset % PAGESIZE);
13820Sstevel@tonic-gate 			if (bufs[count].b_bcount > remainder) {
13830Sstevel@tonic-gate 				bufs[count].b_bcount = remainder;
13840Sstevel@tonic-gate 			}
13850Sstevel@tonic-gate 
13860Sstevel@tonic-gate 			bufs[count].b_bufsize = bufs[count].b_bcount;
13871349Speterte 			if (((offset_t)byte_offset + bufs[count].b_bcount) >
13884866Sfrankho 			    HS_MAXFILEOFF) {
13891349Speterte 				break;
13901349Speterte 			}
13910Sstevel@tonic-gate 			byte_offset += bufs[count].b_bcount;
13920Sstevel@tonic-gate 
13935312Smg147109 			if (fsp->hqueue == NULL) {
13945312Smg147109 				(void) bdev_strategy(&bufs[count]);
13955312Smg147109 
13965312Smg147109 			} else {
13975312Smg147109 				/*
13985312Smg147109 				 * We are scheduling I/O so we need to enqueue
13995312Smg147109 				 * requests rather than calling bdev_strategy
14005312Smg147109 				 * here. A later invocation of the scheduling
14015312Smg147109 				 * function will take care of doing the actual
14025312Smg147109 				 * I/O as it selects requests from the queue as
14035312Smg147109 				 * per the scheduling logic.
14045312Smg147109 				 */
14055312Smg147109 				struct hio *hsio = kmem_cache_alloc(hio_cache,
14065312Smg147109 				    KM_SLEEP);
14075312Smg147109 
14085312Smg147109 				sema_init(&fio_done[count], 0, NULL,
14095312Smg147109 				    SEMA_DEFAULT, NULL);
14105312Smg147109 				hsio->bp = &bufs[count];
14115312Smg147109 				hsio->sema = &fio_done[count];
14125312Smg147109 				hsio->io_lblkno = bufs[count].b_lblkno;
14135312Smg147109 				hsio->nblocks = howmany(hsio->bp->b_bcount,
14145312Smg147109 				    DEV_BSIZE);
14155312Smg147109 
14165312Smg147109 				/* used for deadline */
14175312Smg147109 				hsio->io_timestamp =
14185312Smg147109 				    drv_hztousec(ddi_get_lbolt());
14195312Smg147109 
14205312Smg147109 				/* for I/O coalescing */
14215312Smg147109 				hsio->contig_chain = NULL;
14225312Smg147109 				hsched_enqueue_io(fsp, hsio, 0);
14235312Smg147109 			}
14240Sstevel@tonic-gate 
14250Sstevel@tonic-gate 			lwp_stat_update(LWP_STAT_INBLK, 1);
14260Sstevel@tonic-gate 			lastp = searchp;
14270Sstevel@tonic-gate 			if ((remainder - bufs[count].b_bcount) < 1) {
14280Sstevel@tonic-gate 				searchp = searchp->p_next;
14290Sstevel@tonic-gate 			}
14300Sstevel@tonic-gate 		}
14310Sstevel@tonic-gate 
14320Sstevel@tonic-gate 		bufsused = count;
14330Sstevel@tonic-gate 		/* Now wait for everything to come in */
14345312Smg147109 		if (fsp->hqueue == NULL) {
14355312Smg147109 			for (count = 0; count < bufsused; count++) {
14365312Smg147109 				if (err == 0) {
14375312Smg147109 					err = biowait(&bufs[count]);
14385312Smg147109 				} else
14395312Smg147109 					(void) biowait(&bufs[count]);
14405312Smg147109 			}
14415312Smg147109 		} else {
14425312Smg147109 			for (count = 0; count < bufsused; count++) {
14435312Smg147109 				struct buf *wbuf;
14445312Smg147109 
14455312Smg147109 				/*
14465312Smg147109 				 * Invoke scheduling function till our buf
14475312Smg147109 				 * is processed. In doing this it might
14485312Smg147109 				 * process bufs enqueued by other threads
14495312Smg147109 				 * process bufs enqueued by other threads,
14505312Smg147109 				 */
14515312Smg147109 				wbuf = &bufs[count];
14525312Smg147109 				DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf);
14535312Smg147109 				while (sema_tryp(&fio_done[count]) == 0) {
14545312Smg147109 					/*
14555312Smg147109 					 * hsched_invoke_strategy will return 1
14565312Smg147109 					 * if the I/O queue is empty. This means
14575312Smg147109 					 * that there is another thread that has
14585312Smg147109 					 * issued our buf and is waiting. So we
14595312Smg147109 					 * just block instead of spinning.
14605312Smg147109 					 */
14615312Smg147109 					if (hsched_invoke_strategy(fsp)) {
14625312Smg147109 						sema_p(&fio_done[count]);
14635312Smg147109 						break;
14645312Smg147109 					}
14655312Smg147109 				}
14665312Smg147109 				sema_destroy(&fio_done[count]);
14675312Smg147109 				DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf);
14685312Smg147109 
14695312Smg147109 				if (err == 0) {
14705312Smg147109 					err = geterror(wbuf);
14715312Smg147109 				}
14725312Smg147109 			}
14735312Smg147109 			kmem_free(fio_done, bufcnt * sizeof (ksema_t));
14740Sstevel@tonic-gate 		}
14750Sstevel@tonic-gate 
14760Sstevel@tonic-gate 		/* Don't leak resources */
14770Sstevel@tonic-gate 		for (count = 0; count < bufcnt; count++) {
14785312Smg147109 			biofini(&bufs[count]);
14790Sstevel@tonic-gate 			if (count < bufsused && vas[count] != NULL) {
14800Sstevel@tonic-gate 				ppmapout(vas[count]);
14810Sstevel@tonic-gate 			}
14820Sstevel@tonic-gate 		}
14830Sstevel@tonic-gate 
14840Sstevel@tonic-gate 		kmem_free(vas, bufcnt * sizeof (caddr_t));
14850Sstevel@tonic-gate 		kmem_free(bufs, bufcnt * sizeof (struct buf));
14860Sstevel@tonic-gate 	}
14870Sstevel@tonic-gate 
14880Sstevel@tonic-gate 	if (err) {
14890Sstevel@tonic-gate 		pvn_read_done(pp, B_ERROR);
14900Sstevel@tonic-gate 		return (err);
14910Sstevel@tonic-gate 	}
14920Sstevel@tonic-gate 
14930Sstevel@tonic-gate 	/*
14940Sstevel@tonic-gate 	 * Lock the requested page, and the one after it if possible.
14950Sstevel@tonic-gate 	 * Don't bother if our caller hasn't given us a place to stash
14960Sstevel@tonic-gate 	 * the page pointers, since otherwise we'd lock pages that would
14970Sstevel@tonic-gate 	 * never get unlocked.
14980Sstevel@tonic-gate 	 */
14990Sstevel@tonic-gate 	if (pagefound) {
15000Sstevel@tonic-gate 		int index;
15010Sstevel@tonic-gate 		ulong_t soff;
15020Sstevel@tonic-gate 
15030Sstevel@tonic-gate 		/*
15040Sstevel@tonic-gate 		 * Make sure it's in memory before we say it's here.
15050Sstevel@tonic-gate 		 */
15060Sstevel@tonic-gate 		if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) {
15070Sstevel@tonic-gate 			hsfs_lostpage++;
15080Sstevel@tonic-gate 			goto reread;
15090Sstevel@tonic-gate 		}
15100Sstevel@tonic-gate 
15110Sstevel@tonic-gate 		pl[0] = pp;
15120Sstevel@tonic-gate 		index = 1;
15135312Smg147109 		atomic_inc_64(&fsp->cache_read_pages);
15140Sstevel@tonic-gate 
15150Sstevel@tonic-gate 		/*
15160Sstevel@tonic-gate 		 * Try to lock the next page, if it exists, without
15170Sstevel@tonic-gate 		 * blocking.
15180Sstevel@tonic-gate 		 */
15190Sstevel@tonic-gate 		plsz -= PAGESIZE;
15200Sstevel@tonic-gate 		/* LINTED (plsz is unsigned) */
15210Sstevel@tonic-gate 		for (soff = off + PAGESIZE; plsz > 0;
15220Sstevel@tonic-gate 		    soff += PAGESIZE, plsz -= PAGESIZE) {
15230Sstevel@tonic-gate 			pp = page_lookup_nowait(vp, (u_offset_t)soff,
15244866Sfrankho 			    SE_SHARED);
15250Sstevel@tonic-gate 			if (pp == NULL)
15260Sstevel@tonic-gate 				break;
15270Sstevel@tonic-gate 			pl[index++] = pp;
15280Sstevel@tonic-gate 		}
15290Sstevel@tonic-gate 		pl[index] = NULL;
15305312Smg147109 
15315312Smg147109 		/*
15325312Smg147109 		 * Schedule a semi-asynchronous readahead if we are
15335312Smg147109 		 * accessing the last cached page for the current
15345312Smg147109 		 * file.
15355312Smg147109 		 *
15365312Smg147109 		 * Doing this here means that readaheads will be
15375312Smg147109 		 * issued only if cache-hits occur. This is an advantage
15385312Smg147109 		 * since cache-hits would mean that readahead is giving
15395312Smg147109 		 * the desired benefit. If cache-hits do not occur there
15405312Smg147109 		 * is no point in reading ahead of time - the system
15415312Smg147109 		 * is loaded anyway.
15425312Smg147109 		 */
15435312Smg147109 		if (fsp->hqueue != NULL &&
15445312Smg147109 		    hp->hs_prev_offset - off == PAGESIZE &&
15455312Smg147109 		    hp->hs_prev_offset < filsiz &&
15465312Smg147109 		    hp->hs_ra_bytes > 0 &&
15475312Smg147109 		    !page_exists(vp, hp->hs_prev_offset)) {
15485312Smg147109 			(void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg,
15495312Smg147109 			    addr + PAGESIZE, hp, fsp, xarsiz, bof,
15505312Smg147109 			    chunk_lbn_count, chunk_data_bytes);
15515312Smg147109 		}
15525312Smg147109 
15530Sstevel@tonic-gate 		return (0);
15540Sstevel@tonic-gate 	}
15550Sstevel@tonic-gate 
15560Sstevel@tonic-gate 	if (pp != NULL) {
15570Sstevel@tonic-gate 		pvn_plist_init(pp, pl, plsz, off, io_len, rw);
15580Sstevel@tonic-gate 	}
15590Sstevel@tonic-gate 
15600Sstevel@tonic-gate 	return (err);
15610Sstevel@tonic-gate }
15620Sstevel@tonic-gate 
15635331Samw /*ARGSUSED*/
15640Sstevel@tonic-gate static int
15650Sstevel@tonic-gate hsfs_getpage(
15660Sstevel@tonic-gate 	struct vnode *vp,
15670Sstevel@tonic-gate 	offset_t off,
15680Sstevel@tonic-gate 	size_t len,
15690Sstevel@tonic-gate 	uint_t *protp,
15700Sstevel@tonic-gate 	struct page *pl[],
15710Sstevel@tonic-gate 	size_t plsz,
15720Sstevel@tonic-gate 	struct seg *seg,
15730Sstevel@tonic-gate 	caddr_t addr,
15740Sstevel@tonic-gate 	enum seg_rw rw,
15755331Samw 	struct cred *cred,
15765331Samw 	caller_context_t *ct)
15770Sstevel@tonic-gate {
15780Sstevel@tonic-gate 	int err;
15790Sstevel@tonic-gate 	uint_t filsiz;
15805312Smg147109 	struct hsfs *fsp;
15815312Smg147109 	struct hsnode *hp;
15825312Smg147109 
15835312Smg147109 	fsp = VFS_TO_HSFS(vp->v_vfsp);
15845312Smg147109 	hp = VTOH(vp);
15850Sstevel@tonic-gate 
15860Sstevel@tonic-gate 	/* does not support write */
15870Sstevel@tonic-gate 	if (rw == S_WRITE) {
15880Sstevel@tonic-gate 		panic("write attempt on READ ONLY HSFS");
15890Sstevel@tonic-gate 		/*NOTREACHED*/
15900Sstevel@tonic-gate 	}
15910Sstevel@tonic-gate 
15920Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP) {
15930Sstevel@tonic-gate 		return (ENOSYS);
15940Sstevel@tonic-gate 	}
15950Sstevel@tonic-gate 
15961349Speterte 	ASSERT(off <= HS_MAXFILEOFF);
15970Sstevel@tonic-gate 
15980Sstevel@tonic-gate 	/*
15990Sstevel@tonic-gate 	 * Determine file data size for EOF check.
16000Sstevel@tonic-gate 	 */
16010Sstevel@tonic-gate 	filsiz = hp->hs_dirent.ext_size;
16020Sstevel@tonic-gate 	if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap)
16030Sstevel@tonic-gate 		return (EFAULT);	/* beyond EOF */
16040Sstevel@tonic-gate 
16055312Smg147109 	/*
16065312Smg147109 	 * Async Read-ahead computation.
16075312Smg147109 	 * This attempts to detect sequential access pattern and
16085312Smg147109 	 * enables reading extra pages ahead of time.
16095312Smg147109 	 */
16105312Smg147109 	if (fsp->hqueue != NULL) {
16115312Smg147109 		/*
16125312Smg147109 		 * This check for sequential access also takes into
16135312Smg147109 		 * account segmap weirdness when reading in chunks
16145312Smg147109 		 * less than the segmap size of 8K.
16155312Smg147109 		 */
16165312Smg147109 		if (hp->hs_prev_offset == off || (off <
16175312Smg147109 		    hp->hs_prev_offset && off + MAX(len, PAGESIZE)
16185312Smg147109 		    >= hp->hs_prev_offset)) {
16195312Smg147109 			if (hp->hs_num_contig <
16205312Smg147109 			    (seq_contig_requests - 1)) {
16215312Smg147109 				hp->hs_num_contig++;
16225312Smg147109 
16235312Smg147109 			} else {
16245312Smg147109 				/*
16255312Smg147109 				 * We increase the readahead quantum up to
16265312Smg147109 				 * a predefined max (max_ra_bytes), which
16275312Smg147109 				 * is a multiple of PAGESIZE.
16285312Smg147109 				 */
16295312Smg147109 				if (hp->hs_ra_bytes <
16305312Smg147109 				    fsp->hqueue->max_ra_bytes) {
16315312Smg147109 					hp->hs_ra_bytes += PAGESIZE;
16325312Smg147109 				}
16335312Smg147109 			}
16345312Smg147109 		} else {
16355312Smg147109 			/*
16365312Smg147109 			 * Not contiguous so reduce read ahead counters.
16375312Smg147109 			 */
16385312Smg147109 			if (hp->hs_ra_bytes > 0)
16395312Smg147109 				hp->hs_ra_bytes -= PAGESIZE;
16405312Smg147109 
16415312Smg147109 			if (hp->hs_ra_bytes <= 0) {
16425312Smg147109 				hp->hs_ra_bytes = 0;
16435312Smg147109 				if (hp->hs_num_contig > 0)
16445312Smg147109 					hp->hs_num_contig--;
16455312Smg147109 			}
16465312Smg147109 		}
16475312Smg147109 		/*
16485312Smg147109 		 * Length must be rounded up to a page boundary,
16495312Smg147109 		 * since we read in units of pages.
16505312Smg147109 		 */
16515312Smg147109 		hp->hs_prev_offset = off + roundup(len, PAGESIZE);
16525312Smg147109 		DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp);
16535312Smg147109 	}
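	/*
	 * Editor's illustration (assuming, for example, seq_contig_requests
	 * == 4): the first three sequential calls only bump hs_num_contig;
	 * each sequential call after that grows hs_ra_bytes by PAGESIZE up
	 * to hqueue->max_ra_bytes, while each non-sequential call shrinks
	 * hs_ra_bytes by PAGESIZE and, once it hits zero, starts
	 * decrementing hs_num_contig again.
	 */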
16540Sstevel@tonic-gate 	if (protp != NULL)
16550Sstevel@tonic-gate 		*protp = PROT_ALL;
16560Sstevel@tonic-gate 
16570Sstevel@tonic-gate 	if (len <= PAGESIZE)
16580Sstevel@tonic-gate 		err = hsfs_getapage(vp, (u_offset_t)off, len, protp, pl, plsz,
16590Sstevel@tonic-gate 		    seg, addr, rw, cred);
16600Sstevel@tonic-gate 	else
16610Sstevel@tonic-gate 		err = pvn_getpages(hsfs_getapage, vp, off, len, protp,
16620Sstevel@tonic-gate 		    pl, plsz, seg, addr, rw, cred);
16630Sstevel@tonic-gate 
16640Sstevel@tonic-gate 	return (err);
16650Sstevel@tonic-gate }
16660Sstevel@tonic-gate 
16670Sstevel@tonic-gate 
16680Sstevel@tonic-gate 
16690Sstevel@tonic-gate /*
16700Sstevel@tonic-gate  * This function should never be called. We need to have it to pass
16710Sstevel@tonic-gate  * it as an argument to other functions.
16720Sstevel@tonic-gate  */
16730Sstevel@tonic-gate /*ARGSUSED*/
16740Sstevel@tonic-gate int
16750Sstevel@tonic-gate hsfs_putapage(
16760Sstevel@tonic-gate 	vnode_t		*vp,
16770Sstevel@tonic-gate 	page_t		*pp,
16780Sstevel@tonic-gate 	u_offset_t	*offp,
16790Sstevel@tonic-gate 	size_t		*lenp,
16800Sstevel@tonic-gate 	int		flags,
16810Sstevel@tonic-gate 	cred_t		*cr)
16820Sstevel@tonic-gate {
16830Sstevel@tonic-gate 	/* should never happen - just destroy it */
16840Sstevel@tonic-gate 	cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page");
16850Sstevel@tonic-gate 	pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags);
16860Sstevel@tonic-gate 	return (0);
16870Sstevel@tonic-gate }
16880Sstevel@tonic-gate 
16890Sstevel@tonic-gate 
16900Sstevel@tonic-gate /*
16910Sstevel@tonic-gate  * The only flags we support are B_INVAL, B_FREE and B_DONTNEED.
16920Sstevel@tonic-gate  * B_INVAL is set by:
16930Sstevel@tonic-gate  *
16940Sstevel@tonic-gate  *	1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag.
16950Sstevel@tonic-gate  *	2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice
16960Sstevel@tonic-gate  *	   which translates to an MC_SYNC with the MS_INVALIDATE flag.
16970Sstevel@tonic-gate  *
16980Sstevel@tonic-gate  * The B_FREE (as well as the B_DONTNEED) flag is set when the
16990Sstevel@tonic-gate  * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked
17000Sstevel@tonic-gate  * from SEGVN to release pages behind a pagefault.
17010Sstevel@tonic-gate  */
17020Sstevel@tonic-gate /*ARGSUSED*/
17030Sstevel@tonic-gate static int
17040Sstevel@tonic-gate hsfs_putpage(
17055331Samw 	struct vnode		*vp,
17065331Samw 	offset_t		off,
17075331Samw 	size_t			len,
17085331Samw 	int			flags,
17095331Samw 	struct cred		*cr,
17105331Samw 	caller_context_t	*ct)
17110Sstevel@tonic-gate {
17120Sstevel@tonic-gate 	int error = 0;
17130Sstevel@tonic-gate 
17140Sstevel@tonic-gate 	if (vp->v_count == 0) {
17150Sstevel@tonic-gate 		panic("hsfs_putpage: bad v_count");
17160Sstevel@tonic-gate 		/*NOTREACHED*/
17170Sstevel@tonic-gate 	}
17180Sstevel@tonic-gate 
17190Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP)
17200Sstevel@tonic-gate 		return (ENOSYS);
17210Sstevel@tonic-gate 
17221349Speterte 	ASSERT(off <= HS_MAXFILEOFF);
17230Sstevel@tonic-gate 
17240Sstevel@tonic-gate 	if (!vn_has_cached_data(vp))	/* no pages mapped */
17250Sstevel@tonic-gate 		return (0);
17260Sstevel@tonic-gate 
17274866Sfrankho 	if (len == 0) {		/* from 'off' to EOF */
17284866Sfrankho 		error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr);
17294866Sfrankho 	} else {
17300Sstevel@tonic-gate 		offset_t end_off = off + len;
17310Sstevel@tonic-gate 		offset_t file_size = VTOH(vp)->hs_dirent.ext_size;
17320Sstevel@tonic-gate 		offset_t io_off;
17330Sstevel@tonic-gate 
17340Sstevel@tonic-gate 		file_size = (file_size + PAGESIZE - 1) & PAGEMASK;
17350Sstevel@tonic-gate 		if (end_off > file_size)
17360Sstevel@tonic-gate 			end_off = file_size;
17370Sstevel@tonic-gate 
17380Sstevel@tonic-gate 		for (io_off = off; io_off < end_off; io_off += PAGESIZE) {
17390Sstevel@tonic-gate 			page_t *pp;
17400Sstevel@tonic-gate 
17410Sstevel@tonic-gate 			/*
17420Sstevel@tonic-gate 			 * We insist on getting the page only if we are
17430Sstevel@tonic-gate 			 * about to invalidate, free or write it and
17440Sstevel@tonic-gate 			 * the B_ASYNC flag is not set.
17450Sstevel@tonic-gate 			 */
17460Sstevel@tonic-gate 			if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) {
17470Sstevel@tonic-gate 				pp = page_lookup(vp, io_off,
17484866Sfrankho 				    (flags & (B_INVAL | B_FREE)) ?
17494866Sfrankho 				    SE_EXCL : SE_SHARED);
17500Sstevel@tonic-gate 			} else {
17510Sstevel@tonic-gate 				pp = page_lookup_nowait(vp, io_off,
17524866Sfrankho 				    (flags & B_FREE) ? SE_EXCL : SE_SHARED);
17530Sstevel@tonic-gate 			}
17540Sstevel@tonic-gate 
17550Sstevel@tonic-gate 			if (pp == NULL)
17560Sstevel@tonic-gate 				continue;
17575312Smg147109 
17580Sstevel@tonic-gate 			/*
17590Sstevel@tonic-gate 			 * Normally pvn_getdirty() should return 0, which
17600Sstevel@tonic-gate 			 * implies that it has done the job for us.
17610Sstevel@tonic-gate 			 * The shouldn't-happen scenario is when it returns 1.
17620Sstevel@tonic-gate 			 * This means that the page has been modified and
17630Sstevel@tonic-gate 			 * needs to be put back.
17640Sstevel@tonic-gate 			 * Since we can't write on a CD, we fake a failed
17650Sstevel@tonic-gate 			 * I/O and force pvn_write_done() to destroy the page.
17660Sstevel@tonic-gate 			 */
17670Sstevel@tonic-gate 			if (pvn_getdirty(pp, flags) == 1) {
17680Sstevel@tonic-gate 				cmn_err(CE_NOTE,
17694866Sfrankho 				    "hsfs_putpage: dirty HSFS page");
17700Sstevel@tonic-gate 				pvn_write_done(pp, flags |
17710Sstevel@tonic-gate 				    B_ERROR | B_WRITE | B_INVAL | B_FORCE);
17720Sstevel@tonic-gate 			}
17730Sstevel@tonic-gate 		}
17740Sstevel@tonic-gate 	}
17750Sstevel@tonic-gate 	return (error);
17760Sstevel@tonic-gate }
17770Sstevel@tonic-gate 
17780Sstevel@tonic-gate 
17790Sstevel@tonic-gate /*ARGSUSED*/
17800Sstevel@tonic-gate static int
17810Sstevel@tonic-gate hsfs_map(
17820Sstevel@tonic-gate 	struct vnode *vp,
17830Sstevel@tonic-gate 	offset_t off,
17840Sstevel@tonic-gate 	struct as *as,
17850Sstevel@tonic-gate 	caddr_t *addrp,
17860Sstevel@tonic-gate 	size_t len,
17870Sstevel@tonic-gate 	uchar_t prot,
17880Sstevel@tonic-gate 	uchar_t maxprot,
17890Sstevel@tonic-gate 	uint_t flags,
17905331Samw 	struct cred *cred,
17915331Samw 	caller_context_t *ct)
17920Sstevel@tonic-gate {
17930Sstevel@tonic-gate 	struct segvn_crargs vn_a;
17940Sstevel@tonic-gate 	int error;
17950Sstevel@tonic-gate 
17960Sstevel@tonic-gate 	/* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */
17970Sstevel@tonic-gate 
17980Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP)
17990Sstevel@tonic-gate 		return (ENOSYS);
18000Sstevel@tonic-gate 
18011349Speterte 	if (off > HS_MAXFILEOFF || off < 0 ||
18021349Speterte 	    (off + len) < 0 || (off + len) > HS_MAXFILEOFF)
1803143Speterte 		return (ENXIO);
18040Sstevel@tonic-gate 
18050Sstevel@tonic-gate 	if (vp->v_type != VREG) {
18060Sstevel@tonic-gate 		return (ENODEV);
18070Sstevel@tonic-gate 	}
18080Sstevel@tonic-gate 
18090Sstevel@tonic-gate 	/*
18100Sstevel@tonic-gate 	 * If file is being locked, disallow mapping.
18110Sstevel@tonic-gate 	 */
18120Sstevel@tonic-gate 	if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode))
18130Sstevel@tonic-gate 		return (EAGAIN);
18140Sstevel@tonic-gate 
18150Sstevel@tonic-gate 	as_rangelock(as);
18166036Smec 	error = choose_addr(as, addrp, len, off, ADDR_VACALIGN, flags);
18176036Smec 	if (error != 0) {
18186036Smec 		as_rangeunlock(as);
18196036Smec 		return (error);
18200Sstevel@tonic-gate 	}
18210Sstevel@tonic-gate 
18220Sstevel@tonic-gate 	vn_a.vp = vp;
18230Sstevel@tonic-gate 	vn_a.offset = off;
18240Sstevel@tonic-gate 	vn_a.type = flags & MAP_TYPE;
18250Sstevel@tonic-gate 	vn_a.prot = prot;
18260Sstevel@tonic-gate 	vn_a.maxprot = maxprot;
18270Sstevel@tonic-gate 	vn_a.flags = flags & ~MAP_TYPE;
18280Sstevel@tonic-gate 	vn_a.cred = cred;
18290Sstevel@tonic-gate 	vn_a.amp = NULL;
18300Sstevel@tonic-gate 	vn_a.szc = 0;
18310Sstevel@tonic-gate 	vn_a.lgrp_mem_policy_flags = 0;
18320Sstevel@tonic-gate 
18330Sstevel@tonic-gate 	error = as_map(as, *addrp, len, segvn_create, &vn_a);
18340Sstevel@tonic-gate 	as_rangeunlock(as);
18350Sstevel@tonic-gate 	return (error);
18360Sstevel@tonic-gate }
18370Sstevel@tonic-gate 
18380Sstevel@tonic-gate /* ARGSUSED */
18390Sstevel@tonic-gate static int
18400Sstevel@tonic-gate hsfs_addmap(
18410Sstevel@tonic-gate 	struct vnode *vp,
18420Sstevel@tonic-gate 	offset_t off,
18430Sstevel@tonic-gate 	struct as *as,
18440Sstevel@tonic-gate 	caddr_t addr,
18450Sstevel@tonic-gate 	size_t len,
18460Sstevel@tonic-gate 	uchar_t prot,
18470Sstevel@tonic-gate 	uchar_t maxprot,
18480Sstevel@tonic-gate 	uint_t flags,
18495331Samw 	struct cred *cr,
18505331Samw 	caller_context_t *ct)
18510Sstevel@tonic-gate {
18520Sstevel@tonic-gate 	struct hsnode *hp;
18530Sstevel@tonic-gate 
18540Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP)
18550Sstevel@tonic-gate 		return (ENOSYS);
18560Sstevel@tonic-gate 
18570Sstevel@tonic-gate 	hp = VTOH(vp);
18580Sstevel@tonic-gate 	mutex_enter(&hp->hs_contents_lock);
18590Sstevel@tonic-gate 	hp->hs_mapcnt += btopr(len);
18600Sstevel@tonic-gate 	mutex_exit(&hp->hs_contents_lock);
18610Sstevel@tonic-gate 	return (0);
18620Sstevel@tonic-gate }
18630Sstevel@tonic-gate 
18640Sstevel@tonic-gate /*ARGSUSED*/
18650Sstevel@tonic-gate static int
18660Sstevel@tonic-gate hsfs_delmap(
18670Sstevel@tonic-gate 	struct vnode *vp,
18680Sstevel@tonic-gate 	offset_t off,
18690Sstevel@tonic-gate 	struct as *as,
18700Sstevel@tonic-gate 	caddr_t addr,
18710Sstevel@tonic-gate 	size_t len,
18720Sstevel@tonic-gate 	uint_t prot,
18730Sstevel@tonic-gate 	uint_t maxprot,
18740Sstevel@tonic-gate 	uint_t flags,
18755331Samw 	struct cred *cr,
18765331Samw 	caller_context_t *ct)
18770Sstevel@tonic-gate {
18780Sstevel@tonic-gate 	struct hsnode *hp;
18790Sstevel@tonic-gate 
18800Sstevel@tonic-gate 	if (vp->v_flag & VNOMAP)
18810Sstevel@tonic-gate 		return (ENOSYS);
18820Sstevel@tonic-gate 
18830Sstevel@tonic-gate 	hp = VTOH(vp);
18840Sstevel@tonic-gate 	mutex_enter(&hp->hs_contents_lock);
18850Sstevel@tonic-gate 	hp->hs_mapcnt -= btopr(len);	/* Count released mappings */
18860Sstevel@tonic-gate 	ASSERT(hp->hs_mapcnt >= 0);
18870Sstevel@tonic-gate 	mutex_exit(&hp->hs_contents_lock);
18880Sstevel@tonic-gate 	return (0);
18890Sstevel@tonic-gate }
18900Sstevel@tonic-gate 
18910Sstevel@tonic-gate /* ARGSUSED */
18920Sstevel@tonic-gate static int
18935331Samw hsfs_seek(
18945331Samw 	struct vnode *vp,
18955331Samw 	offset_t ooff,
18965331Samw 	offset_t *noffp,
18975331Samw 	caller_context_t *ct)
18980Sstevel@tonic-gate {
18990Sstevel@tonic-gate 	return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0);
19000Sstevel@tonic-gate }
19010Sstevel@tonic-gate 
19020Sstevel@tonic-gate /* ARGSUSED */
19030Sstevel@tonic-gate static int
19040Sstevel@tonic-gate hsfs_frlock(
19050Sstevel@tonic-gate 	struct vnode *vp,
19060Sstevel@tonic-gate 	int cmd,
19070Sstevel@tonic-gate 	struct flock64 *bfp,
19080Sstevel@tonic-gate 	int flag,
19090Sstevel@tonic-gate 	offset_t offset,
19100Sstevel@tonic-gate 	struct flk_callback *flk_cbp,
19115331Samw 	cred_t *cr,
19125331Samw 	caller_context_t *ct)
19130Sstevel@tonic-gate {
19140Sstevel@tonic-gate 	struct hsnode *hp = VTOH(vp);
19150Sstevel@tonic-gate 
19160Sstevel@tonic-gate 	/*
19170Sstevel@tonic-gate 	 * If the file is being mapped, disallow fs_frlock.
19180Sstevel@tonic-gate 	 * We are not holding the hs_contents_lock while checking
19190Sstevel@tonic-gate 	 * hs_mapcnt because the current locking strategy drops all
19200Sstevel@tonic-gate 	 * locks before calling fs_frlock.
19210Sstevel@tonic-gate 	 * So, hs_mapcnt could change before we enter fs_frlock making
19220Sstevel@tonic-gate 	 * it meaningless to have held hs_contents_lock in the first place.
19230Sstevel@tonic-gate 	 */
19240Sstevel@tonic-gate 	if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode))
19250Sstevel@tonic-gate 		return (EAGAIN);
19260Sstevel@tonic-gate 
19275331Samw 	return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct));
19280Sstevel@tonic-gate }
19290Sstevel@tonic-gate 
19305312Smg147109 static int
19315312Smg147109 hsched_deadline_compare(const void *x1, const void *x2)
19325312Smg147109 {
19335312Smg147109 	const struct hio *h1 = x1;
19345312Smg147109 	const struct hio *h2 = x2;
19355312Smg147109 
19365312Smg147109 	if (h1->io_timestamp < h2->io_timestamp)
19375312Smg147109 		return (-1);
19385312Smg147109 	if (h1->io_timestamp > h2->io_timestamp)
19395312Smg147109 		return (1);
19405312Smg147109 
19415312Smg147109 	if (h1->io_lblkno < h2->io_lblkno)
19425312Smg147109 		return (-1);
19435312Smg147109 	if (h1->io_lblkno > h2->io_lblkno)
19445312Smg147109 		return (1);
19455312Smg147109 
19465312Smg147109 	if (h1 < h2)
19475312Smg147109 		return (-1);
19485312Smg147109 	if (h1 > h2)
19495312Smg147109 		return (1);
19505312Smg147109 
19515312Smg147109 	return (0);
19525312Smg147109 }
19535312Smg147109 
19545312Smg147109 static int
19555312Smg147109 hsched_offset_compare(const void *x1, const void *x2)
19565312Smg147109 {
19575312Smg147109 	const struct hio *h1 = x1;
19585312Smg147109 	const struct hio *h2 = x2;
19595312Smg147109 
19605312Smg147109 	if (h1->io_lblkno < h2->io_lblkno)
19615312Smg147109 		return (-1);
19625312Smg147109 	if (h1->io_lblkno > h2->io_lblkno)
19635312Smg147109 		return (1);
19645312Smg147109 
19655312Smg147109 	if (h1 < h2)
19665312Smg147109 		return (-1);
19675312Smg147109 	if (h1 > h2)
19685312Smg147109 		return (1);
19695312Smg147109 
19705312Smg147109 	return (0);
19715312Smg147109 }
19725312Smg147109 
19735312Smg147109 void
19745312Smg147109 hsched_init_caches(void)
19755312Smg147109 {
19765312Smg147109 	hio_cache = kmem_cache_create("hsfs_hio_cache",
19775312Smg147109 	    sizeof (struct hio), 0, NULL,
19785312Smg147109 	    NULL, NULL, NULL, NULL, 0);
19795312Smg147109 
19805312Smg147109 	hio_info_cache = kmem_cache_create("hsfs_hio_info_cache",
19815312Smg147109 	    sizeof (struct hio_info), 0, NULL,
19825312Smg147109 	    NULL, NULL, NULL, NULL, 0);
19835312Smg147109 }
19845312Smg147109 
19855312Smg147109 void
19865312Smg147109 hsched_fini_caches(void)
19875312Smg147109 {
19885312Smg147109 	kmem_cache_destroy(hio_cache);
19895312Smg147109 	kmem_cache_destroy(hio_info_cache);
19905312Smg147109 }
19915312Smg147109 
19925312Smg147109 /*
19935312Smg147109  * Initialize I/O scheduling structures. This is called via hsfs_mount
19945312Smg147109  */
19955312Smg147109 void
19965312Smg147109 hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage)
19975312Smg147109 {
19985312Smg147109 	struct hsfs_queue *hqueue = fsp->hqueue;
19995312Smg147109 	struct vnode *vp = fsp->hsfs_devvp;
20005312Smg147109 
20015312Smg147109 	/* TaskQ name of the form: hsched_task_ + stringof(int) */
20025312Smg147109 	char namebuf[23];
20035312Smg147109 	int error, err;
20045312Smg147109 	struct dk_cinfo info;
20055312Smg147109 	ldi_handle_t lh;
20065312Smg147109 	ldi_ident_t li;
20075312Smg147109 
20085312Smg147109 	/*
20095312Smg147109 	 * Default maxtransfer = 16k chunk
20105312Smg147109 	 */
20115312Smg147109 	hqueue->dev_maxtransfer = 16384;
20125312Smg147109 
20135312Smg147109 	/*
20145312Smg147109 	 * Try to fetch the maximum device transfer size. This is used to
20155312Smg147109 	 * ensure that a coalesced block does not exceed the maxtransfer.
20165312Smg147109 	 */
20175312Smg147109 	err  = ldi_ident_from_mod(modlinkage, &li);
20185312Smg147109 	if (err) {
20195312Smg147109 		cmn_err(CE_NOTE, "hsched_init: Querying device failed");
20205312Smg147109 		cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n",
20215312Smg147109 		    err);
20225312Smg147109 		goto set_ra;
20235312Smg147109 	}
20245312Smg147109 
20255312Smg147109 	err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li);
20265312Smg147109 	ldi_ident_release(li);
20275312Smg147109 	if (err) {
20285312Smg147109 		cmn_err(CE_NOTE, "hsched_init: Querying device failed");
20295312Smg147109 		cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err);
20305312Smg147109 		goto set_ra;
20315312Smg147109 	}
20325312Smg147109 
20335312Smg147109 	error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL,
20345312Smg147109 	    CRED(), &err);
20355312Smg147109 	err = ldi_close(lh, FREAD, CRED());
20365312Smg147109 	if (err) {
20375312Smg147109 		cmn_err(CE_NOTE, "hsched_init: Querying device failed");
20385312Smg147109 		cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err);
20395312Smg147109 	}
20405312Smg147109 
20415312Smg147109 	if (error == 0) {
20425312Smg147109 		hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer);
20435312Smg147109 	}
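	/*
	 * Editor's example (hypothetical value): if DKIOCINFO reports
	 * dki_maxtransfer == 256 (in DEV_BSIZE units), dev_maxtransfer
	 * becomes ldbtob(256) == 131072, i.e. coalesced requests are
	 * capped at 128K instead of the 16K default.
	 */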
20445312Smg147109 
20455312Smg147109 set_ra:
20465312Smg147109 	/*
20475312Smg147109 	 * Max size of data to read ahead for sequential access pattern.
20485312Smg147109 	 * Conservative, to avoid letting the underlying CD drive spin
20495312Smg147109 	 * down in case the application is reading slowly.
20505312Smg147109 	 * We read ahead up to a max of 8 pages (PAGESIZE * 8).
20515312Smg147109 	 */
20525312Smg147109 	hqueue->max_ra_bytes = PAGESIZE * 8;
20535312Smg147109 
20545312Smg147109 	mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL);
20555312Smg147109 	mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL);
20565312Smg147109 	avl_create(&(hqueue->read_tree), hsched_offset_compare,
20575312Smg147109 	    sizeof (struct hio), offsetof(struct hio, io_offset_node));
20585312Smg147109 	avl_create(&(hqueue->deadline_tree), hsched_deadline_compare,
20595312Smg147109 	    sizeof (struct hio), offsetof(struct hio, io_deadline_node));
20605312Smg147109 
20615312Smg147109 	(void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid);
20625312Smg147109 	hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads,
20635312Smg147109 	    minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC);
20645312Smg147109 
20655312Smg147109 	hqueue->next = NULL;
20665312Smg147109 	hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP);
20675312Smg147109 }
20685312Smg147109 
20695312Smg147109 void
20705312Smg147109 hsched_fini(struct hsfs_queue *hqueue)
20715312Smg147109 {
20725312Smg147109 	if (hqueue != NULL) {
20735406Smg147109 		/*
20745406Smg147109 		 * Remove the sentinel if there was one.
20755406Smg147109 		 */
20765406Smg147109 		if (hqueue->next != NULL) {
20775406Smg147109 			avl_remove(&hqueue->read_tree, hqueue->next);
20785406Smg147109 			kmem_cache_free(hio_cache, hqueue->next);
20795406Smg147109 		}
20805312Smg147109 		avl_destroy(&(hqueue->read_tree));
20815312Smg147109 		avl_destroy(&(hqueue->deadline_tree));
20825312Smg147109 		mutex_destroy(&(hqueue->hsfs_queue_lock));
20835312Smg147109 		mutex_destroy(&(hqueue->strategy_lock));
20845312Smg147109 
20855312Smg147109 		/*
20865312Smg147109 		 * If there are any existing readahead threads running
20875312Smg147109 		 * taskq_destroy will wait for them to finish.
20885312Smg147109 		 */
20895312Smg147109 		taskq_destroy(hqueue->ra_task);
20905312Smg147109 		kmem_free(hqueue->nbuf, sizeof (struct buf));
20915312Smg147109 	}
20925312Smg147109 }
20935312Smg147109 
20945312Smg147109 /*
20955312Smg147109  * Determine if two I/O requests are adjacent to each other so
20965312Smg147109  * that they can coalesced.
20975312Smg147109  */
20985312Smg147109 #define	IS_ADJACENT(io, nio) \
20995312Smg147109 	(((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \
21005312Smg147109 	(io)->bp->b_edev == (nio)->bp->b_edev)
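/*
 * Editor's example (hypothetical values): an hio at io_lblkno 1000 with
 * nblocks == 16 is adjacent to one starting at io_lblkno 1016 on the same
 * device; if both are 16 blocks long (8K each with DEV_BSIZE == 512) they
 * can be merged into a single 16K transfer.
 */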
21015312Smg147109 
21025312Smg147109 /*
21035312Smg147109  * This performs the actual I/O scheduling logic. We use the Circular
21045312Smg147109  * Look algorithm here. Sort the I/O requests in ascending order of
21055312Smg147109  * logical block number and process them starting with the lowest
21065312Smg147109  * numbered block and progressing towards higher block numbers in the
21075312Smg147109  * queue. Once there are no more higher numbered blocks, start again
21085312Smg147109  * with the lowest one. This is good for CD/DVD as you keep moving
21095312Smg147109  * the head in one direction along the outward spiral track and avoid
21105312Smg147109  * too many seeks as much as possible. The re-ordering also allows
21115312Smg147109  * us to coalesce adjacent requests into one larger request.
21125312Smg147109  * This is thus essentially a 1-way Elevator with front merging.
21135312Smg147109  *
21145312Smg147109  * In addition each read request here has a deadline and will be
21155312Smg147109  * processed out of turn if the deadline (500ms) expires.
21165312Smg147109  *
21175312Smg147109  * This function is necessarily serialized via hqueue->strategy_lock.
21185312Smg147109  * This function sits just below hsfs_getapage and processes all read
21195312Smg147109  * requests originating from that function.
21205312Smg147109  */
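/*
 * Editor's illustration (hypothetical block numbers): if the queue holds
 * requests at blocks 120, 500 and 900 and the sentinel from the previous
 * pass sits at 300, successive invocations service 500, then 900, and only
 * then wrap around to 120.  If the oldest queued request (say the one at
 * 120) has been waiting longer than HSFS_READ_DEADLINE, it is picked
 * immediately instead.
 */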
21215312Smg147109 int
21225312Smg147109 hsched_invoke_strategy(struct hsfs *fsp)
21235312Smg147109 {
21245312Smg147109 	struct hsfs_queue *hqueue;
21255312Smg147109 	struct buf *nbuf;
21265312Smg147109 	struct hio *fio, *nio, *tio, *prev, *last;
21275312Smg147109 	size_t bsize, soffset, offset, data;
21285312Smg147109 	int bioret, bufcount;
21295312Smg147109 	struct vnode *fvp;
21305312Smg147109 	ksema_t *io_done;
21315312Smg147109 	caddr_t iodata;
21325312Smg147109 
21335312Smg147109 	hqueue = fsp->hqueue;
21345312Smg147109 	mutex_enter(&hqueue->strategy_lock);
21355312Smg147109 	mutex_enter(&hqueue->hsfs_queue_lock);
21365312Smg147109 
21375312Smg147109 	/*
21385312Smg147109 	 * Check for Deadline expiration first
21395312Smg147109 	 */
21405312Smg147109 	fio = avl_first(&hqueue->deadline_tree);
21415312Smg147109 
21425312Smg147109 	/*
21435312Smg147109 	 * Paranoid check for empty I/O queue. Both deadline
21445312Smg147109 	 * and read trees contain the same data sorted in different
21455312Smg147109 	 * ways. So empty deadline tree = empty read tree.
21465312Smg147109 	 */
21475312Smg147109 	if (fio == NULL) {
21485312Smg147109 		/*
21495312Smg147109 		 * Remove the sentinel if there was one.
21505312Smg147109 		 */
21515312Smg147109 		if (hqueue->next != NULL) {
21525312Smg147109 			avl_remove(&hqueue->read_tree, hqueue->next);
21535312Smg147109 			kmem_cache_free(hio_cache, hqueue->next);
21545312Smg147109 			hqueue->next = NULL;
21555312Smg147109 		}
21565312Smg147109 		mutex_exit(&hqueue->hsfs_queue_lock);
21575312Smg147109 		mutex_exit(&hqueue->strategy_lock);
21585312Smg147109 		return (1);
21595312Smg147109 	}
21605312Smg147109 
21615312Smg147109 	if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp
21625312Smg147109 	    < HSFS_READ_DEADLINE) {
21635312Smg147109 		/*
21645312Smg147109 		 * Apply standard scheduling logic. This uses the
21655312Smg147109 		 * C-LOOK approach. Process I/O requests in ascending
21665312Smg147109 		 * order of logical block address till no subsequent
21675312Smg147109 		 * higher numbered block request remains. Then start
21685312Smg147109 		 * again from the lowest numbered block in the queue.
21695312Smg147109 		 *
21705312Smg147109 		 * We do this cheaply here by means of a sentinel.
21715312Smg147109 		 * The last processed I/O structure from the previous
21725312Smg147109 		 * invocation of this function is left dangling in the
21735312Smg147109 		 * read_tree so that we can easily scan to the next
21745312Smg147109 		 * higher numbered request and remove the sentinel.
21755312Smg147109 		 */
21765312Smg147109 		fio = NULL;
21775312Smg147109 		if (hqueue->next != NULL) {
21785312Smg147109 			fio = AVL_NEXT(&hqueue->read_tree, hqueue->next);
21795312Smg147109 			avl_remove(&hqueue->read_tree, hqueue->next);
21805312Smg147109 			kmem_cache_free(hio_cache, hqueue->next);
21815312Smg147109 			hqueue->next = NULL;
21825312Smg147109 		}
21835312Smg147109 		if (fio == NULL) {
21845312Smg147109 			fio = avl_first(&hqueue->read_tree);
21855312Smg147109 		}
21865312Smg147109 	} else if (hqueue->next != NULL) {
21875312Smg147109 		DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio);
21885312Smg147109 
21895312Smg147109 		avl_remove(&hqueue->read_tree, hqueue->next);
21905312Smg147109 		kmem_cache_free(hio_cache, hqueue->next);
21915312Smg147109 		hqueue->next = NULL;
21925312Smg147109 	}
21935312Smg147109 
21945312Smg147109 	/*
21955312Smg147109 	 * In addition we try to coalesce contiguous
21965312Smg147109 	 * requests into one bigger request.
21975312Smg147109 	 */
21985312Smg147109 	bufcount = 1;
21995312Smg147109 	bsize = ldbtob(fio->nblocks);
22005312Smg147109 	fvp = fio->bp->b_file;
22015312Smg147109 	nio = AVL_NEXT(&hqueue->read_tree, fio);
22025312Smg147109 	tio = fio;
22035312Smg147109 	while (nio != NULL && IS_ADJACENT(tio, nio) &&
22045312Smg147109 	    bsize < hqueue->dev_maxtransfer) {
22055312Smg147109 		avl_remove(&hqueue->deadline_tree, tio);
22065312Smg147109 		avl_remove(&hqueue->read_tree, tio);
22075312Smg147109 		tio->contig_chain = nio;
22085312Smg147109 		bsize += ldbtob(nio->nblocks);
22095312Smg147109 		prev = tio;
22105312Smg147109 		tio = nio;
22115312Smg147109 
22125312Smg147109 		/*
22135312Smg147109 		 * This check is required to detect the case where
22145312Smg147109 		 * we are merging adjacent buffers belonging to
22155312Smg147109 		 * different files. fvp is used to set the b_file
22165312Smg147109 		 * parameter in the coalesced buf. b_file is used
22175312Smg147109 		 * by DTrace, so we do not want DTrace to attribute
22185312Smg147109 		 * requests for two different files to any one file.
22195312Smg147109 		 */
22205312Smg147109 		if (fvp && tio->bp->b_file != fvp) {
22215312Smg147109 			fvp = NULL;
22225312Smg147109 		}
22235312Smg147109 
22245312Smg147109 		nio = AVL_NEXT(&hqueue->read_tree, nio);
22255312Smg147109 		bufcount++;
22265312Smg147109 	}
22275312Smg147109 
22285312Smg147109 	/*
22295312Smg147109 	 * tio is not removed from the read_tree as it serves as a sentinel
22305312Smg147109 	 * to cheaply allow us to scan to the next higher numbered I/O
22315312Smg147109 	 * request.
22325312Smg147109 	 */
22335312Smg147109 	hqueue->next = tio;
22345312Smg147109 	avl_remove(&hqueue->deadline_tree, tio);
22355312Smg147109 	mutex_exit(&hqueue->hsfs_queue_lock);
22365312Smg147109 	DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount,
22375312Smg147109 	    size_t, bsize);
22385312Smg147109 
22395312Smg147109 	/*
22405312Smg147109 	 * The benefit of coalescing occurs if the savings in I/O outweigh
22415312Smg147109 	 * the cost of doing the additional work below.
22425312Smg147109 	 * It was observed that coalescing 2 buffers results in diminishing
22435312Smg147109 	 * returns, so we do coalescing if we have >2 adjacent bufs.
22445312Smg147109 	 */
22455312Smg147109 	if (bufcount > hsched_coalesce_min) {
22465312Smg147109 		/*
22475312Smg147109 		 * We have coalesced blocks. First allocate mem and buf for
22485312Smg147109 		 * the entire coalesced chunk.
22495312Smg147109 		 * Since we are guaranteed to be single-threaded here, we
22505312Smg147109 		 * pre-allocate one buf at mount time and re-use it every time.
22515312Smg147109 		 * This is a synthesized buf structure that uses a kmem_alloc'ed
22525312Smg147109 		 * chunk, not quite a normal buf attached to pages.
22535312Smg147109 		 */
22545312Smg147109 		fsp->coalesced_bytes += bsize;
22555312Smg147109 		nbuf = hqueue->nbuf;
22565312Smg147109 		bioinit(nbuf);
22575312Smg147109 		nbuf->b_edev = fio->bp->b_edev;
22585312Smg147109 		nbuf->b_dev = fio->bp->b_dev;
22595312Smg147109 		nbuf->b_flags = fio->bp->b_flags;
22605312Smg147109 		nbuf->b_iodone = fio->bp->b_iodone;
22615312Smg147109 		iodata = kmem_alloc(bsize, KM_SLEEP);
22625312Smg147109 		nbuf->b_un.b_addr = iodata;
22635312Smg147109 		nbuf->b_lblkno = fio->bp->b_lblkno;
22645312Smg147109 		nbuf->b_vp = fvp;
22655312Smg147109 		nbuf->b_file = fvp;
22665312Smg147109 		nbuf->b_bcount = bsize;
22675312Smg147109 		nbuf->b_bufsize = bsize;
22685312Smg147109 
22695312Smg147109 		DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int,
22705312Smg147109 		    bufcount, size_t, bsize);
22715312Smg147109 
22725312Smg147109 		/*
22735312Smg147109 		 * Perform I/O for the coalesced block.
22745312Smg147109 		 */
22755312Smg147109 		(void) bdev_strategy(nbuf);
22765312Smg147109 
22775312Smg147109 		/*
22785312Smg147109 		 * Duplicate the last IO node to leave the sentinel alone.
22795312Smg147109 		 * The sentinel is freed in the next invocation of this
22805312Smg147109 		 * function.
22815312Smg147109 		 */
22825312Smg147109 		prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP);
22835312Smg147109 		prev->contig_chain->bp = tio->bp;
22845312Smg147109 		prev->contig_chain->sema = tio->sema;
22855312Smg147109 		tio = prev->contig_chain;
22865312Smg147109 		tio->contig_chain = NULL;
22875312Smg147109 		soffset = ldbtob(fio->bp->b_lblkno);
22885312Smg147109 		nio = fio;
22895312Smg147109 
22905312Smg147109 		bioret = biowait(nbuf);
22915312Smg147109 		data = bsize - nbuf->b_resid;
22925312Smg147109 		biofini(nbuf);
22935312Smg147109 		mutex_exit(&hqueue->strategy_lock);
22945312Smg147109 
22955312Smg147109 		/*
22965312Smg147109 		 * We use the b_resid parameter to detect how much
22975312Smg147109 		 * data was successfully transferred. We signal
22985312Smg147109 		 * success to all the original bufs that were fully
22995312Smg147109 		 * retrieved; the rest, if any, are signaled as
23005312Smg147109 		 * errors.
23015312Smg147109 		 */
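		/*
		 * Editor's example (assumed sizes): if the coalesced
		 * transfer was bsize == 64K and the device reports
		 * b_resid == 16K, then data == 48K.  The constituent bufs
		 * whose b_bcount fits within the first 48K are copied out
		 * and biodone()d successfully below; the remainder are
		 * zero-filled past the valid data and marked with the
		 * biowait() result as their error.
		 */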
23025312Smg147109 		tio = nio;
23035312Smg147109 		DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio,
23045312Smg147109 		    int, bioret, size_t, data);
23055312Smg147109 
23065312Smg147109 		/*
23075312Smg147109 		 * Copy data and signal success to all the bufs
23085312Smg147109 		 * which can be fully satisfied from b_resid.
23095312Smg147109 		 */
23105312Smg147109 		while (nio != NULL && data >= nio->bp->b_bcount) {
23115312Smg147109 			offset = ldbtob(nio->bp->b_lblkno) - soffset;
23125312Smg147109 			bcopy(iodata + offset, nio->bp->b_un.b_addr,
23135312Smg147109 			    nio->bp->b_bcount);
23145312Smg147109 			data -= nio->bp->b_bcount;
23155312Smg147109 			bioerror(nio->bp, 0);
23165312Smg147109 			biodone(nio->bp);
23175312Smg147109 			sema_v(nio->sema);
23185312Smg147109 			tio = nio;
23195312Smg147109 			nio = nio->contig_chain;
23205312Smg147109 			kmem_cache_free(hio_cache, tio);
23215312Smg147109 		}
23225312Smg147109 
23235312Smg147109 		/*
23245312Smg147109 		 * Signal error to all the leftover bufs (if any)
23255312Smg147109 		 * after b_resid data is exhausted.
23265312Smg147109 		 */
23275312Smg147109 		while (nio != NULL) {
23285312Smg147109 			nio->bp->b_resid = nio->bp->b_bcount - data;
23295312Smg147109 			bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid);
23305312Smg147109 			bioerror(nio->bp, bioret);
23315312Smg147109 			biodone(nio->bp);
23325312Smg147109 			sema_v(nio->sema);
23335312Smg147109 			tio = nio;
23345312Smg147109 			nio = nio->contig_chain;
23355312Smg147109 			kmem_cache_free(hio_cache, tio);
23365312Smg147109 			data = 0;
23375312Smg147109 		}
23385312Smg147109 		kmem_free(iodata, bsize);
23395312Smg147109 	} else {
23405312Smg147109 
23415312Smg147109 		nbuf = tio->bp;
23425312Smg147109 		io_done = tio->sema;
23435312Smg147109 		nio = fio;
23445312Smg147109 		last = tio;
23455312Smg147109 
23465312Smg147109 		while (nio != NULL) {
23475312Smg147109 			(void) bdev_strategy(nio->bp);
23485312Smg147109 			nio = nio->contig_chain;
23495312Smg147109 		}
23505312Smg147109 		nio = fio;
23515312Smg147109 		mutex_exit(&hqueue->strategy_lock);
23525312Smg147109 
23535312Smg147109 		while (nio != NULL) {
23545312Smg147109 			if (nio == last) {
23555312Smg147109 				(void) biowait(nbuf);
23565312Smg147109 				sema_v(io_done);
23575312Smg147109 				break;
23585312Smg147109 				/* sentinel (last) is not freed here; see above. */
23595312Smg147109 			} else {
23605312Smg147109 				(void) biowait(nio->bp);
23615312Smg147109 				sema_v(nio->sema);
23625312Smg147109 			}
23635312Smg147109 			tio = nio;
23645312Smg147109 			nio = nio->contig_chain;
23655312Smg147109 			kmem_cache_free(hio_cache, tio);
23665312Smg147109 		}
23675312Smg147109 	}
23685312Smg147109 	return (0);
23695312Smg147109 }
23705312Smg147109 
23715312Smg147109 /*
23725312Smg147109  * Insert an I/O request in the I/O scheduler's pipeline
23735312Smg147109  * Using AVL tree makes it easy to reorder the I/O request
23745312Smg147109  * based on logical block number.
23755312Smg147109  */
23765312Smg147109 static void
23775312Smg147109 hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra)
23785312Smg147109 {
23795312Smg147109 	struct hsfs_queue *hqueue = fsp->hqueue;
23805312Smg147109 
23815312Smg147109 	mutex_enter(&hqueue->hsfs_queue_lock);
23825312Smg147109 
23835312Smg147109 	fsp->physical_read_bytes += hsio->bp->b_bcount;
23845312Smg147109 	if (ra)
23855312Smg147109 		fsp->readahead_bytes += hsio->bp->b_bcount;
23865312Smg147109 
23875312Smg147109 	avl_add(&hqueue->deadline_tree, hsio);
23885312Smg147109 	avl_add(&hqueue->read_tree, hsio);
23895312Smg147109 
23905312Smg147109 	DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio,
23915312Smg147109 	    struct hsfs_queue *, hqueue, int, ra);
23925312Smg147109 
23935312Smg147109 	mutex_exit(&hqueue->hsfs_queue_lock);
23945312Smg147109 }
23955312Smg147109 
23962900Sfrankho /* ARGSUSED */
23972900Sfrankho static int
23985331Samw hsfs_pathconf(struct vnode *vp,
23995331Samw 	int cmd,
24005331Samw 	ulong_t *valp,
24015331Samw 	struct cred *cr,
24025331Samw 	caller_context_t *ct)
24032900Sfrankho {
24042900Sfrankho 	struct hsfs	*fsp;
24052900Sfrankho 
24062900Sfrankho 	int		error = 0;
24072900Sfrankho 
24082900Sfrankho 	switch (cmd) {
24092900Sfrankho 
24102900Sfrankho 	case _PC_NAME_MAX:
24112900Sfrankho 		fsp = VFS_TO_HSFS(vp->v_vfsp);
24122900Sfrankho 		*valp = fsp->hsfs_namemax;
24132900Sfrankho 		break;
24142900Sfrankho 
24152900Sfrankho 	case _PC_FILESIZEBITS:
24162900Sfrankho 		*valp = 33;	/* Without multi extent support: 4 GB - 2k */
24172900Sfrankho 		break;
24182900Sfrankho 
2419*10440SRoger.Faulkner@Sun.COM 	case _PC_TIMESTAMP_RESOLUTION:
2420*10440SRoger.Faulkner@Sun.COM 		/*
2421*10440SRoger.Faulkner@Sun.COM 		 * HSFS keeps, at best, 1/100 second timestamp resolution.
2422*10440SRoger.Faulkner@Sun.COM 		 */
2423*10440SRoger.Faulkner@Sun.COM 		*valp = 10000000L;
2424*10440SRoger.Faulkner@Sun.COM 		break;
2425*10440SRoger.Faulkner@Sun.COM 
24262900Sfrankho 	default:
24275331Samw 		error = fs_pathconf(vp, cmd, valp, cr, ct);
2428*10440SRoger.Faulkner@Sun.COM 		break;
24292900Sfrankho 	}
24302900Sfrankho 
24312900Sfrankho 	return (error);
24322900Sfrankho }
24332900Sfrankho 
24342900Sfrankho 
24352900Sfrankho 
24360Sstevel@tonic-gate const fs_operation_def_t hsfs_vnodeops_template[] = {
24373898Srsb 	VOPNAME_OPEN,		{ .vop_open = hsfs_open },
24383898Srsb 	VOPNAME_CLOSE,		{ .vop_close = hsfs_close },
24393898Srsb 	VOPNAME_READ,		{ .vop_read = hsfs_read },
24403898Srsb 	VOPNAME_GETATTR,	{ .vop_getattr = hsfs_getattr },
24413898Srsb 	VOPNAME_ACCESS,		{ .vop_access = hsfs_access },
24423898Srsb 	VOPNAME_LOOKUP,		{ .vop_lookup = hsfs_lookup },
24433898Srsb 	VOPNAME_READDIR,	{ .vop_readdir = hsfs_readdir },
24443898Srsb 	VOPNAME_READLINK,	{ .vop_readlink = hsfs_readlink },
24453898Srsb 	VOPNAME_FSYNC,		{ .vop_fsync = hsfs_fsync },
24463898Srsb 	VOPNAME_INACTIVE,	{ .vop_inactive = hsfs_inactive },
24473898Srsb 	VOPNAME_FID,		{ .vop_fid = hsfs_fid },
24483898Srsb 	VOPNAME_SEEK,		{ .vop_seek = hsfs_seek },
24493898Srsb 	VOPNAME_FRLOCK,		{ .vop_frlock = hsfs_frlock },
24503898Srsb 	VOPNAME_GETPAGE,	{ .vop_getpage = hsfs_getpage },
24513898Srsb 	VOPNAME_PUTPAGE,	{ .vop_putpage = hsfs_putpage },
24523898Srsb 	VOPNAME_MAP,		{ .vop_map = hsfs_map },
24533898Srsb 	VOPNAME_ADDMAP,		{ .vop_addmap = hsfs_addmap },
24543898Srsb 	VOPNAME_DELMAP,		{ .vop_delmap = hsfs_delmap },
24553898Srsb 	VOPNAME_PATHCONF,	{ .vop_pathconf = hsfs_pathconf },
24563898Srsb 	NULL,			NULL
24570Sstevel@tonic-gate };
24580Sstevel@tonic-gate 
24590Sstevel@tonic-gate struct vnodeops *hsfs_vnodeops;
2460