/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#pragma ident	"%Z%%M%	%I%	%E% SMI"

/*
 * Vnode operations for the High Sierra filesystem
 */

#include <sys/types.h>
#include <sys/t_lock.h>
#include <sys/param.h>
#include <sys/time.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/resource.h>
#include <sys/signal.h>
#include <sys/cred.h>
#include <sys/user.h>
#include <sys/buf.h>
#include <sys/vfs.h>
#include <sys/vfs_opreg.h>
#include <sys/stat.h>
#include <sys/vnode.h>
#include <sys/mode.h>
#include <sys/proc.h>
#include <sys/disp.h>
#include <sys/file.h>
#include <sys/fcntl.h>
#include <sys/flock.h>
#include <sys/kmem.h>
#include <sys/uio.h>
#include <sys/conf.h>
#include <sys/errno.h>
#include <sys/mman.h>
#include <sys/pathname.h>
#include <sys/debug.h>
#include <sys/vmsystm.h>
#include <sys/cmn_err.h>
#include <sys/fbuf.h>
#include <sys/dirent.h>
#include <sys/errno.h>
#include <sys/dkio.h>
#include <sys/cmn_err.h>
#include <sys/atomic.h>

#include <vm/hat.h>
#include <vm/page.h>
#include <vm/pvn.h>
#include <vm/as.h>
#include <vm/seg.h>
#include <vm/seg_map.h>
#include <vm/seg_kmem.h>
#include <vm/seg_vn.h>
#include <vm/rm.h>
#include <vm/page.h>
#include <sys/swap.h>
#include <sys/avl.h>
#include <sys/sunldi.h>
#include <sys/ddi.h>
#include <sys/sunddi.h>
#include <sys/sdt.h>

/*
 * For struct modlinkage
 */
#include <sys/modctl.h>

#include <sys/fs/hsfs_spec.h>
#include <sys/fs/hsfs_node.h>
#include <sys/fs/hsfs_impl.h>
#include <sys/fs/hsfs_susp.h>
#include <sys/fs/hsfs_rrip.h>

#include <fs/fs_subr.h>

/* # of contiguous requests to detect sequential access pattern */
static int seq_contig_requests = 2;

/*
 * This is the max number of taskq threads that will be created
 * if required. Since we are using a Dynamic TaskQ by default only
 * one thread is created initially.
 *
 * NOTE: In the usual hsfs use case this per fs instance number
 * of taskq threads should not place any undue load on a system.
 * Even on an unusual system with say 100 CDROM drives, 800 threads
 * will not be created unless all the drives are loaded and all
 * of them are saturated with I/O at the same time! If there is at
 * all a complaint of system load due to such an unusual case it
 * should be easy enough to change to one per-machine Dynamic TaskQ
 * for all hsfs mounts with a nthreads of say 32.
 */
static int hsfs_taskq_nthreads = 8;	/* # of taskq threads per fs */

/* Min count of adjacent bufs that will avoid buf coalescing */
static int hsched_coalesce_min = 2;

/*
 * Kmem caches for heavily used small allocations. Using these kmem
 * caches provides a factor of 3 reduction in system time and greatly
 * aids overall throughput esp. on SPARC.
 */
struct kmem_cache *hio_cache;
struct kmem_cache *hio_info_cache;

/*
 * This tunable allows us to ignore inode numbers from rrip-1.12.
 * In this case, we fall back to our default inode algorithm.
 */
extern int use_rrip_inodes;

/*
 * Free behind logic from UFS to tame our thirst for
 * the page cache.
 * See usr/src/uts/common/fs/ufs/ufs_vnops.c for more
 * explanation.
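 *
 * In short: when a read is detected as sequential (hs_prev_offset
 * matches the current offset and read-ahead is active), the segmap
 * slot is released with SM_FREE | SM_ASYNC, and with SM_DONTNEED as
 * well once the offset passes the smallfile2 threshold computed in
 * hsfs_read(), so streaming one large file does not consume the
 * whole page cache.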
 */
static int freebehind = 1;
static int smallfile = 0;
static int cache_read_ahead = 0;
static u_offset_t smallfile64 = 32 * 1024;
#define	SMALLFILE1_D 1000
#define	SMALLFILE2_D 10
static u_offset_t smallfile1 = 32 * 1024;
static u_offset_t smallfile2 = 32 * 1024;
static clock_t smallfile_update = 0;	/* when to recompute */
static uint_t smallfile1_d = SMALLFILE1_D;
static uint_t smallfile2_d = SMALLFILE2_D;

static int hsched_deadline_compare(const void *x1, const void *x2);
static int hsched_offset_compare(const void *x1, const void *x2);
static void hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra);
int hsched_invoke_strategy(struct hsfs *fsp);

/* ARGSUSED */
static int
hsfs_fsync(vnode_t *cp,
	int syncflag,
	cred_t *cred,
	caller_context_t *ct)
{
	return (0);
}


/*ARGSUSED*/
static int
hsfs_read(struct vnode *vp,
	struct uio *uiop,
	int ioflag,
	struct cred *cred,
	struct caller_context *ct)
{
	caddr_t base;
	offset_t diff;
	int error;
	struct hsnode *hp;
	uint_t filesize;
	int dofree;

	hp = VTOH(vp);
	/*
	 * if vp is of type VDIR, make sure dirent
	 * is filled up with all info (because of ptbl)
	 */
	if (vp->v_type == VDIR) {
		if (hp->hs_dirent.ext_size == 0)
			hs_filldirent(vp, &hp->hs_dirent);
	}
	filesize = hp->hs_dirent.ext_size;

	/* Sanity checks. */
	if (uiop->uio_resid == 0 ||		/* No data wanted. */
	    uiop->uio_loffset > HS_MAXFILEOFF ||	/* Offset too big. */
	    uiop->uio_loffset >= filesize)	/* Past EOF. */
		return (0);

	do {
		/*
		 * We want to ask for only the "right" amount of data.
		 * In this case that means:-
		 *
		 * We can't get data from beyond our EOF. If asked,
		 * we will give a short read.
		 *
		 * segmap_getmapflt returns buffers of MAXBSIZE bytes.
		 * These buffers are always MAXBSIZE aligned.
		 * If our starting offset is not MAXBSIZE aligned,
		 * we can only ask for less than MAXBSIZE bytes.
		 *
		 * If our requested offset and length are such that
		 * they belong in different MAXBSIZE aligned slots
		 * then we'll be making more than one call on
		 * segmap_getmapflt.
		 *
		 * This diagram shows the variables we use and their
		 * relationships.
		 *
		 * |<-----MAXBSIZE----->|
		 * +--------------------------...+
		 * |.....mapon->|<--n-->|....*...|EOF
		 * +--------------------------...+
		 * uio_loffset->|
		 * uio_resid....|<---------->|
		 * diff.........|<-------------->|
		 *
		 * So, in this case our offset is not aligned
		 * and our request takes us outside of the
		 * MAXBSIZE window. We will break this up into
		 * two segmap_getmapflt calls.
		 */
		size_t nbytes;
		offset_t mapon;
		size_t n;
		uint_t flags;

		mapon = uiop->uio_loffset & MAXBOFFSET;
		diff = filesize - uiop->uio_loffset;
		nbytes = (size_t)MIN(MAXBSIZE - mapon, uiop->uio_resid);
		n = MIN(diff, nbytes);
		if (n <= 0) {
			/* EOF or request satisfied. */
			return (0);
		}

		/*
		 * Freebehind computation taken from:
		 * usr/src/uts/common/fs/ufs/ufs_vnops.c
		 */
		if (drv_hztousec(ddi_get_lbolt()) >= smallfile_update) {
			uint64_t percpufreeb;
			if (smallfile1_d == 0) smallfile1_d = SMALLFILE1_D;
			if (smallfile2_d == 0) smallfile2_d = SMALLFILE2_D;
			percpufreeb = ptob((uint64_t)freemem) / ncpus_online;
			smallfile1 = percpufreeb / smallfile1_d;
			smallfile2 = percpufreeb / smallfile2_d;
			smallfile1 = MAX(smallfile1, smallfile);
			smallfile1 = MAX(smallfile1, smallfile64);
			smallfile2 = MAX(smallfile1, smallfile2);
			smallfile_update = drv_hztousec(ddi_get_lbolt())
			    + 1000000;
		}

		dofree = freebehind &&
		    hp->hs_prev_offset == uiop->uio_loffset &&
		    hp->hs_ra_bytes > 0;

		base = segmap_getmapflt(segkmap, vp,
		    (u_offset_t)uiop->uio_loffset, n, 1, S_READ);

		error = uiomove(base + mapon, n, UIO_READ, uiop);

		if (error == 0) {
			/*
			 * if read a whole block, or read to eof,
			 * won't need this buffer again soon.
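			 * (SM_DONTNEED is a hint that lets segmap
			 * recycle the mapping slot right away instead
			 * of keeping it around for reuse.)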
			 */
			if (n + mapon == MAXBSIZE ||
			    uiop->uio_loffset == filesize)
				flags = SM_DONTNEED;
			else
				flags = 0;

			if (dofree) {
				flags = SM_FREE | SM_ASYNC;
				if ((cache_read_ahead == 0) &&
				    uiop->uio_loffset > smallfile2)
					flags |= SM_DONTNEED;
			}

			error = segmap_release(segkmap, base, flags);
		} else
			(void) segmap_release(segkmap, base, 0);
	} while (error == 0 && uiop->uio_resid > 0);

	return (error);
}

/*ARGSUSED2*/
static int
hsfs_getattr(
	struct vnode *vp,
	struct vattr *vap,
	int flags,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;
	struct vfs *vfsp;
	struct hsfs *fsp;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	vfsp = vp->v_vfsp;

	if ((hp->hs_dirent.ext_size == 0) && (vp->v_type == VDIR)) {
		hs_filldirent(vp, &hp->hs_dirent);
	}
	vap->va_type = IFTOVT(hp->hs_dirent.mode);
	vap->va_mode = hp->hs_dirent.mode;
	vap->va_uid = hp->hs_dirent.uid;
	vap->va_gid = hp->hs_dirent.gid;

	vap->va_fsid = vfsp->vfs_dev;
	vap->va_nodeid = (ino64_t)hp->hs_nodeid;
	vap->va_nlink = hp->hs_dirent.nlink;
	vap->va_size = (offset_t)hp->hs_dirent.ext_size;

	vap->va_atime.tv_sec = hp->hs_dirent.adate.tv_sec;
	vap->va_atime.tv_nsec = hp->hs_dirent.adate.tv_usec*1000;
	vap->va_mtime.tv_sec = hp->hs_dirent.mdate.tv_sec;
	vap->va_mtime.tv_nsec = hp->hs_dirent.mdate.tv_usec*1000;
	vap->va_ctime.tv_sec = hp->hs_dirent.cdate.tv_sec;
	vap->va_ctime.tv_nsec = hp->hs_dirent.cdate.tv_usec*1000;
	if (vp->v_type == VCHR || vp->v_type == VBLK)
		vap->va_rdev = hp->hs_dirent.r_dev;
	else
		vap->va_rdev = 0;
	vap->va_blksize = vfsp->vfs_bsize;
	/* no. of blocks = no. of data blocks + no. of xar blocks */
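	/*
	 * xar_len is in logical blocks, so shift it by lbn_shift to
	 * convert it to bytes before adding it to the data size and
	 * rounding the sum up to DEV_BSIZE (512-byte) blocks.
	 */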
	vap->va_nblocks = (fsblkcnt64_t)howmany(vap->va_size + (u_longlong_t)
	    (hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift), DEV_BSIZE);
	vap->va_seq = hp->hs_seq;
	return (0);
}

/*ARGSUSED*/
static int
hsfs_readlink(struct vnode *vp,
	struct uio *uiop,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;

	if (vp->v_type != VLNK)
		return (EINVAL);

	hp = VTOH(vp);

	if (hp->hs_dirent.sym_link == (char *)NULL)
		return (ENOENT);

	return (uiomove(hp->hs_dirent.sym_link,
	    (size_t)MIN(hp->hs_dirent.ext_size,
	    uiop->uio_resid), UIO_READ, uiop));
}

/*ARGSUSED*/
static void
hsfs_inactive(struct vnode *vp,
	struct cred *cred,
	caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfs *fsp;

	int nopage;

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	/*
	 * Note: acquiring and holding v_lock for quite a while
	 * here serializes on the vnode; this is unfortunate, but
	 * likely not to overly impact performance, as the underlying
	 * device (CDROM drive) is quite slow.
	 */
	rw_enter(&fsp->hsfs_hash_lock, RW_WRITER);
	mutex_enter(&hp->hs_contents_lock);
	mutex_enter(&vp->v_lock);

	if (vp->v_count < 1) {
		panic("hsfs_inactive: v_count < 1");
		/*NOTREACHED*/
	}

	if (vp->v_count > 1 || (hp->hs_flags & HREF) == 0) {
		vp->v_count--;	/* release hold from vn_rele */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		rw_exit(&fsp->hsfs_hash_lock);
		return;
	}
	vp->v_count--;	/* release hold from vn_rele */
	if (vp->v_count == 0) {
		/*
		 * Free the hsnode.
		 * If there are no pages associated with the
		 * hsnode, give it back to the kmem_cache,
		 * else put at the end of this file system's
		 * internal free list.
		 */
		nopage = !vn_has_cached_data(vp);
		hp->hs_flags = 0;
		/*
		 * exit these locks now, since hs_freenode may
		 * kmem_free the hsnode and embedded vnode
		 */
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
		hs_freenode(vp, fsp, nopage);
	} else {
		mutex_exit(&vp->v_lock);
		mutex_exit(&hp->hs_contents_lock);
	}
	rw_exit(&fsp->hsfs_hash_lock);
}


/*ARGSUSED*/
static int
hsfs_lookup(
	struct vnode *dvp,
	char *nm,
	struct vnode **vpp,
	struct pathname *pnp,
	int flags,
	struct vnode *rdir,
	struct cred *cred,
	caller_context_t *ct,
	int *direntflags,
	pathname_t *realpnp)
{
	int error;
	int namelen = (int)strlen(nm);

	if (*nm == '\0') {
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	/*
	 * If we're looking for ourself, life is simple.
	 */
	if (namelen == 1 && *nm == '.') {
		if (error = hs_access(dvp, (mode_t)VEXEC, cred))
			return (error);
		VN_HOLD(dvp);
		*vpp = dvp;
		return (0);
	}

	return (hs_dirlook(dvp, nm, namelen, vpp, cred));
}


/*ARGSUSED*/
static int
hsfs_readdir(
	struct vnode	*vp,
	struct uio	*uiop,
	struct cred	*cred,
	int		*eofp,
	caller_context_t *ct,
	int		flags)
{
	struct hsnode	*dhp;
	struct hsfs	*fsp;
	struct hs_direntry hd;
	struct dirent64	*nd;
	int		error;
	uint_t		offset;		/* real offset in directory */
	uint_t		dirsiz;		/* real size of directory */
	uchar_t		*blkp;
	int		hdlen;		/* length of hs directory entry */
	long		ndlen;		/* length of dirent entry */
	int		bytes_wanted;
	size_t		bufsize;	/* size of dirent buffer */
	char		*outbuf;	/* ptr to dirent buffer */
	char		*dname;
	int		dnamelen;
	size_t		dname_size;
	struct fbuf	*fbp;
	uint_t		last_offset;	/* last index into current dir block */
	ino64_t		dirino;		/* temporary storage before storing in dirent */
	off_t		diroff;

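	/*
	 * The loop below walks the directory extent one MAXBSIZE-aligned
	 * block at a time via fbread(), converting each valid on-disk
	 * directory record into a dirent64 in outbuf, and stops when
	 * either the directory or the caller's buffer is exhausted.
	 */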
	dhp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	if (dhp->hs_dirent.ext_size == 0)
		hs_filldirent(vp, &dhp->hs_dirent);
	dirsiz = dhp->hs_dirent.ext_size;
	if (uiop->uio_loffset >= dirsiz) {	/* at or beyond EOF */
		if (eofp)
			*eofp = 1;
		return (0);
	}
	ASSERT(uiop->uio_loffset <= HS_MAXFILEOFF);
	offset = uiop->uio_loffset;

	dname_size = fsp->hsfs_namemax + 1;	/* 1 for the ending NUL */
	dname = kmem_alloc(dname_size, KM_SLEEP);
	bufsize = uiop->uio_resid + sizeof (struct dirent64);

	outbuf = kmem_alloc(bufsize, KM_SLEEP);
	nd = (struct dirent64 *)outbuf;

	while (offset < dirsiz) {
		bytes_wanted = MIN(MAXBSIZE, dirsiz - (offset & MAXBMASK));

		error = fbread(vp, (offset_t)(offset & MAXBMASK),
		    (unsigned int)bytes_wanted, S_READ, &fbp);
		if (error)
			goto done;

		blkp = (uchar_t *)fbp->fb_addr;
		last_offset = (offset & MAXBMASK) + fbp->fb_count;

#define	rel_offset(offset) ((offset) & MAXBOFFSET)	/* index into blkp */

		while (offset < last_offset) {
			/*
			 * Very similar validation code is found in
			 * process_dirblock(), hsfs_node.c.
			 * For an explanation, see there.
			 * It may make sense for the future to
			 * "consolidate" the code in hs_parsedir(),
			 * process_dirblock() and hsfs_readdir() into
			 * a single utility function.
			 */
			hdlen = (int)((uchar_t)
			    HDE_DIR_LEN(&blkp[rel_offset(offset)]));
			if (hdlen < HDE_ROOT_DIR_REC_SIZE ||
			    offset + hdlen > last_offset) {
				/*
				 * advance to next sector boundary
				 */
				offset = roundup(offset + 1, HS_SECTOR_SIZE);
				if (hdlen)
					hs_log_bogus_disk_warning(fsp,
					    HSFS_ERR_TRAILING_JUNK, 0);

				continue;
			}

			bzero(&hd, sizeof (hd));

			/*
			 * Just ignore invalid directory entries.
			 * XXX - maybe hs_parsedir() will detect EXISTENCE bit
			 */
			if (!hs_parsedir(fsp, &blkp[rel_offset(offset)],
			    &hd, dname, &dnamelen, last_offset - offset)) {
				/*
				 * Determine if there is enough room
				 */
				ndlen = (long)DIRENT64_RECLEN((dnamelen));

				if ((ndlen + ((char *)nd - outbuf)) >
				    uiop->uio_resid) {
					fbrelse(fbp, S_READ);
					goto done; /* output buffer full */
				}

				diroff = offset + hdlen;
				/*
				 * If the media carries rrip-v1.12 or newer,
				 * and we trust the inodes from the rrip data
				 * (use_rrip_inodes != 0), use that data. If the
				 * media has been created by a recent mkisofs
				 * version, we may trust all numbers in the
				 * starting extent number; otherwise, we cannot
				 * do this for zero sized files and symlinks,
				 * because if we did we'd end up mapping all of
				 * them to the same node. We use HS_DUMMY_INO
				 * in this case and make sure that we will not
				 * map all files to the same meta data.
				 */
				if (hd.inode != 0 && use_rrip_inodes) {
					dirino = hd.inode;
				} else if ((hd.ext_size == 0 ||
				    hd.sym_link != (char *)NULL) &&
				    (fsp->hsfs_flags & HSFSMNT_INODE) == 0) {
					dirino = HS_DUMMY_INO;
				} else {
					dirino = hd.ext_lbn;
				}

				/* strncpy(9f) will zero uninitialized bytes */

				ASSERT(strlen(dname) + 1 <=
				    DIRENT64_NAMELEN(ndlen));
				(void) strncpy(nd->d_name, dname,
				    DIRENT64_NAMELEN(ndlen));
				nd->d_reclen = (ushort_t)ndlen;
				nd->d_off = (offset_t)diroff;
				nd->d_ino = dirino;
				nd = (struct dirent64 *)((char *)nd + ndlen);

				/*
				 * free up space allocated for symlink
				 */
				if (hd.sym_link != (char *)NULL) {
					kmem_free(hd.sym_link,
					    (size_t)(hd.ext_size+1));
					hd.sym_link = (char *)NULL;
				}
			}
			offset += hdlen;
		}
		fbrelse(fbp, S_READ);
	}

	/*
	 * Got here for one of the following reasons:
	 * 1) outbuf is full (error == 0)
	 * 2) end of directory reached (error == 0)
	 * 3) error reading directory sector (error != 0)
	 * 4) directory entry crosses sector boundary (error == 0)
	 *
	 * If any directory entries have been copied, don't report
	 * case 4. Instead, return the valid directory entries.
	 *
	 * If no entries have been copied, report the error.
	 * If case 4, this will be indistinguishable from EOF.
	 */
done:
	ndlen = ((char *)nd - outbuf);
	if (ndlen != 0) {
		error = uiomove(outbuf, (size_t)ndlen, UIO_READ, uiop);
		uiop->uio_loffset = offset;
	}
	kmem_free(dname, dname_size);
	kmem_free(outbuf, bufsize);
	if (eofp && error == 0)
		*eofp = (uiop->uio_loffset >= dirsiz);
	return (error);
}

/*ARGSUSED2*/
static int
hsfs_fid(struct vnode *vp, struct fid *fidp, caller_context_t *ct)
{
	struct hsnode *hp;
	struct hsfid *fid;

	if (fidp->fid_len < (sizeof (*fid) - sizeof (fid->hf_len))) {
		fidp->fid_len = sizeof (*fid) - sizeof (fid->hf_len);
		return (ENOSPC);
	}

	fid = (struct hsfid *)fidp;
	fid->hf_len = sizeof (*fid) - sizeof (fid->hf_len);
	hp = VTOH(vp);
	mutex_enter(&hp->hs_contents_lock);
	fid->hf_dir_lbn = hp->hs_dir_lbn;
	fid->hf_dir_off = (ushort_t)hp->hs_dir_off;
	fid->hf_ino = hp->hs_nodeid;
	mutex_exit(&hp->hs_contents_lock);
	return (0);
}

/*ARGSUSED*/
static int
hsfs_open(struct vnode **vpp,
	int flag,
	struct cred *cred,
	caller_context_t *ct)
{
	return (0);
}

/*ARGSUSED*/
static int
hsfs_close(
	struct vnode *vp,
	int flag,
	int count,
	offset_t offset,
	struct cred *cred,
	caller_context_t *ct)
{
	(void) cleanlocks(vp, ttoproc(curthread)->p_pid, 0);
	cleanshares(vp, ttoproc(curthread)->p_pid);
	return (0);
}

/*ARGSUSED2*/
static int
hsfs_access(struct vnode *vp,
	int mode,
	int flags,
	cred_t *cred,
	caller_context_t *ct)
{
	return (hs_access(vp, (mode_t)mode, cred));
}

/*
 * the seek time of a CD-ROM is very slow, and data transfer
 * rate is even worse (max. 150K per sec).  The design
 * decision is to reduce access to cd-rom as much as possible,
 * and to transfer a sizable block (read-ahead) of data at a time.
 * UFS style of read ahead one block at a time is not appropriate,
 * and is not supported
 */

/*
 * KLUSTSIZE should be a multiple of PAGESIZE and <= MAXPHYS.
 */
#define	KLUSTSIZE	(56 * 1024)
/* we don't support read ahead */
int hsfs_lostpage;	/* no. of times we lost original page */

/*
 * Used to prevent biodone() from releasing buf resources that
 * we didn't allocate in quite the usual way.
 */
/*ARGSUSED*/
int
hsfs_iodone(struct buf *bp)
{
	sema_v(&bp->b_io);
	return (0);
}

/*
 * The taskq thread that invokes the scheduling function to ensure
 * that all readaheads are complete and cleans up the associated
 * memory and releases the page lock.
 */
void
hsfs_ra_task(void *arg)
{
	struct hio_info *info = arg;
	uint_t count;
	struct buf *wbuf;

	ASSERT(info->pp != NULL);

	for (count = 0; count < info->bufsused; count++) {
		wbuf = &(info->bufs[count]);

		DTRACE_PROBE1(hsfs_io_wait_ra, struct buf *, wbuf);
		while (sema_tryp(&(info->sema[count])) == 0) {
			if (hsched_invoke_strategy(info->fsp)) {
				sema_p(&(info->sema[count]));
				break;
			}
		}
		sema_destroy(&(info->sema[count]));
		DTRACE_PROBE1(hsfs_io_done_ra, struct buf *, wbuf);
		biofini(&(info->bufs[count]));
	}
	for (count = 0; count < info->bufsused; count++) {
		if (info->vas[count] != NULL) {
			ppmapout(info->vas[count]);
		}
	}
	kmem_free(info->vas, info->bufcnt * sizeof (caddr_t));
	kmem_free(info->bufs, info->bufcnt * sizeof (struct buf));
	kmem_free(info->sema, info->bufcnt * sizeof (ksema_t));

	pvn_read_done(info->pp, 0);
	kmem_cache_free(hio_info_cache, info);
}

/*
 * Submit asynchronous readahead requests to the I/O scheduler
 * depending on the number of pages to read ahead. These requests
 * are asynchronous to the calling thread but I/O requests issued
 * subsequently by other threads with higher LBNs must wait for
 * these readaheads to complete since we have a single ordered
 * I/O pipeline. Thus these readaheads are semi-asynchronous.
 * A TaskQ handles waiting for the readaheads to complete.
 *
 * This function is mostly a copy of hsfs_getapage but somewhat
 * simpler. A readahead request is aborted if page allocation
 * fails.
 */
/*ARGSUSED*/
static int
hsfs_getpage_ra(
	struct vnode *vp,
	u_offset_t off,
	struct seg *seg,
	caddr_t addr,
	struct hsnode *hp,
	struct hsfs *fsp,
	int xarsiz,
	offset_t bof,
	int chunk_lbn_count,
	int chunk_data_bytes)
{
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t	*fio_done;
	struct hio_info *info;
	size_t	len;

	ASSERT(fsp->hqueue != NULL);

	if (addr >= seg->s_base + seg->s_size) {
		return (-1);
	}

	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	if (off >= filsiz)
		return (0);

	extension = 0;
	pp = NULL;

	extension += hp->hs_ra_bytes;

	/*
	 * Some CD writers (e.g. Kodak Photo CD writers)
	 * create CDs in TAO mode and reserve tracks that
	 * are not completely written. Some sectors remain
	 * unreadable for this reason and give I/O errors.
	 * Also, there's no point in reading sectors
	 * we'll never look at.  So, if we're asked to go
	 * beyond the end of a file, truncate to the length
	 * of that file.
	 *
	 * Additionally, this behaviour is required by section
	 * 6.4.5 of ISO 9660:1988(E).
	 */
	len = MIN(extension ? extension : PAGESIZE, filsiz - off);

	/* A little paranoia */
	if (len <= 0)
		return (-1);

	/*
	 * After all that, make sure we're asking for things in units
	 * that bdev_strategy() will understand (see bug 4202551).
	 */
	len = roundup(len, DEV_BSIZE);

	pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
	    &io_len_tmp, off, len, 1);

	if (pp == NULL) {
		hp->hs_num_contig = 0;
		hp->hs_ra_bytes = 0;
		hp->hs_prev_offset = 0;
		return (-1);
	}

	io_off = (uint_t)io_off_tmp;
	io_len = (uint_t)io_len_tmp;

	/* check for truncation */
	/*
	 * xxx Clean up and return EIO instead?
	 * xxx Ought to go to u_offset_t for everything, but we
	 * xxx call lots of things that want uint_t arguments.
	 */
	ASSERT(io_off == io_off_tmp);

	/*
	 * get enough buffers for worst-case scenario
	 * (i.e., no coalescing possible).
	 */
	bufcnt = (len + secsize - 1) / secsize;
	bufs = kmem_alloc(bufcnt * sizeof (struct buf), KM_SLEEP);
	vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

	/*
	 * Allocate an array of semaphores since we are doing I/O
	 * scheduling.
	 */
	fio_done = kmem_alloc(bufcnt * sizeof (ksema_t), KM_SLEEP);

	/*
	 * If our filesize is not an integer multiple of PAGESIZE,
	 * we zero that part of the last page that's between EOF and
	 * the PAGESIZE boundary.
	 */
	xlen = io_len & PAGEOFFSET;
	if (xlen != 0)
		pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

	DTRACE_PROBE2(hsfs_readahead, struct vnode *, vp, uint_t, io_len);

	va = NULL;
	lastp = NULL;
	searchp = pp;
	io_end = io_off + io_len;
	for (count = 0, byte_offset = io_off;
	    byte_offset < io_end;
	    count++) {
		ASSERT(count < bufcnt);

		bioinit(&bufs[count]);
		bufs[count].b_edev = devvp->v_rdev;
		bufs[count].b_dev = cmpdev(devvp->v_rdev);
		bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
		bufs[count].b_iodone = hsfs_iodone;
		bufs[count].b_vp = vp;
		bufs[count].b_file = vp;

		/* Compute disk address for interleaving. */
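		/*
		 * byte_offset counts only data bytes, so first find
		 * which data chunk it falls in, scale that chunk index
		 * back up by the full data+skip stride
		 * (chunk_lbn_count) to get the on-disk LBN, then add
		 * the leftover offset within the chunk plus the file's
		 * starting byte address and any XAR bytes to arrive at
		 * the device block handed to the driver.
		 */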

		/* considered without skips */
		which_chunk_lbn = byte_offset / chunk_data_bytes;

		/* factor in skips */
		offset_lbn = which_chunk_lbn * chunk_lbn_count;

		/* convert to physical byte offset for lbn */
		offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

		/* don't forget offset into lbn */
		offset_extra = byte_offset % chunk_data_bytes;

		/* get virtual block number for driver */
		driver_block = lbtodb(bof + xarsiz
		    + offset_bytes + offset_extra);

		if (lastp != searchp) {
			/* this branch taken first time through loop */
			va = vas[count] = ppmapin(searchp, PROT_WRITE,
			    (caddr_t)-1);
			/* ppmapin() guarantees not to return NULL */
		} else {
			vas[count] = NULL;
		}

		bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
		bufs[count].b_offset =
		    (offset_t)(byte_offset - io_off + off);

		/*
		 * We specifically use the b_lblkno member here
		 * as even in the 32 bit world driver_block can
		 * get very large in line with the ISO9660 spec.
		 */

		bufs[count].b_lblkno = driver_block;

		remaining_bytes = ((which_chunk_lbn + 1) * chunk_data_bytes)
		    - byte_offset;

		/*
		 * remaining_bytes can't be zero, as we derived
		 * which_chunk_lbn directly from byte_offset.
		 */
		if ((remaining_bytes + byte_offset) < (off + len)) {
			/* coalesce-read the rest of the chunk */
			bufs[count].b_bcount = remaining_bytes;
		} else {
			/* get the final bits */
			bufs[count].b_bcount = off + len - byte_offset;
		}

		remainder = PAGESIZE - (byte_offset % PAGESIZE);
		if (bufs[count].b_bcount > remainder) {
			bufs[count].b_bcount = remainder;
		}

		bufs[count].b_bufsize = bufs[count].b_bcount;
		if (((offset_t)byte_offset + bufs[count].b_bcount) >
		    HS_MAXFILEOFF) {
			break;
		}
		byte_offset += bufs[count].b_bcount;

		/*
		 * We are scheduling I/O so we need to enqueue
		 * requests rather than calling bdev_strategy
		 * here. A later invocation of the scheduling
		 * function will take care of doing the actual
		 * I/O as it selects requests from the queue as
		 * per the scheduling logic.
		 */
		struct hio *hsio = kmem_cache_alloc(hio_cache,
		    KM_SLEEP);

		sema_init(&fio_done[count], 0, NULL,
		    SEMA_DEFAULT, NULL);
		hsio->bp = &bufs[count];
		hsio->sema = &fio_done[count];
		hsio->io_lblkno = bufs[count].b_lblkno;
		hsio->nblocks = howmany(hsio->bp->b_bcount,
		    DEV_BSIZE);

		/* used for deadline */
		hsio->io_timestamp = drv_hztousec(ddi_get_lbolt());

		/* for I/O coalescing */
		hsio->contig_chain = NULL;
		hsched_enqueue_io(fsp, hsio, 1);

		lwp_stat_update(LWP_STAT_INBLK, 1);
		lastp = searchp;
		if ((remainder - bufs[count].b_bcount) < 1) {
			searchp = searchp->p_next;
		}
	}

	bufsused = count;
	info = kmem_cache_alloc(hio_info_cache, KM_SLEEP);
	info->bufs = bufs;
	info->vas = vas;
	info->sema = fio_done;
	info->bufsused = bufsused;
	info->bufcnt = bufcnt;
	info->fsp = fsp;
	info->pp = pp;

	(void) taskq_dispatch(fsp->hqueue->ra_task,
	    hsfs_ra_task, info, KM_SLEEP);
	/*
	 * The I/O locked pages are unlocked in our taskq thread.
	 */
	return (0);
}

/*
 * Each file may have a different interleaving on disk. This makes
 * things somewhat interesting. The gist is that there are some
 * number of contiguous data sectors, followed by some other number
 * of contiguous skip sectors. The sum of those two sets of sectors
 * defines the interleave size. Unfortunately, it means that we generally
 * can't simply read N sectors starting at a given offset to satisfy
 * any given request.
 *
 * What we do is get the relevant memory pages via pvn_read_kluster(),
 * then stride through the interleaves, setting up a buf for each
 * sector that needs to be brought in. Instead of kmem_alloc'ing
 * space for the sectors, though, we just point at the appropriate
 * spot in the relevant page for each of them. This saves us a bunch
 * of copying.
 *
 * NOTICE: The code below in hsfs_getapage is mostly the same as the code
 * in hsfs_getpage_ra above (with some omissions). If you are
 * making any change to this function, please also look at
 * hsfs_getpage_ra.
 */
/*ARGSUSED*/
static int
hsfs_getapage(
	struct vnode *vp,
	u_offset_t off,
	size_t len,
	uint_t *protp,
	struct page *pl[],
	size_t plsz,
	struct seg *seg,
	caddr_t addr,
	enum seg_rw rw,
	struct cred *cred)
{
	struct hsnode *hp;
	struct hsfs *fsp;
	int	err;
	struct buf *bufs;
	caddr_t *vas;
	caddr_t va;
	struct page *pp, *searchp, *lastp;
	page_t	*pagefound;
	offset_t	bof;
	struct vnode *devvp;
	ulong_t	byte_offset;
	size_t	io_len_tmp;
	uint_t	io_off, io_len;
	uint_t	xlen;
	uint_t	filsiz;
	uint_t	secsize;
	uint_t	bufcnt;
	uint_t	bufsused;
	uint_t	count;
	uint_t	io_end;
	uint_t	which_chunk_lbn;
	uint_t	offset_lbn;
	uint_t	offset_extra;
	offset_t	offset_bytes;
	uint_t	remaining_bytes;
	uint_t	extension;
	int	remainder;	/* must be signed */
	int	chunk_lbn_count;
	int	chunk_data_bytes;
	int	xarsiz;
	diskaddr_t driver_block;
	u_offset_t io_off_tmp;
	ksema_t	*fio_done;
	int	calcdone;

	/*
	 * We don't support asynchronous operation at the moment, so
	 * just pretend we did it. If the pages are ever actually
	 * needed, they'll get brought in then.
	 */
	if (pl == NULL)
		return (0);

	hp = VTOH(vp);
	fsp = VFS_TO_HSFS(vp->v_vfsp);
	devvp = fsp->hsfs_devvp;
	secsize = fsp->hsfs_vol.lbn_size;  /* bytes per logical block */

	/* file data size */
	filsiz = hp->hs_dirent.ext_size;

	/* disk addr for start of file */
	bof = LBN_TO_BYTE((offset_t)hp->hs_dirent.ext_lbn, vp->v_vfsp);

	/* xarsiz byte must be skipped for data */
	xarsiz = hp->hs_dirent.xar_len << fsp->hsfs_vol.lbn_shift;

	/* how many logical blocks in an interleave (data+skip) */
	chunk_lbn_count = hp->hs_dirent.intlf_sz + hp->hs_dirent.intlf_sk;

	if (chunk_lbn_count == 0) {
		chunk_lbn_count = 1;
	}

	/*
	 * Convert interleaving size into bytes.  The zero case
	 * (no interleaving) optimization is handled as a side-
	 * effect of the read-ahead logic.
	 */
	if (hp->hs_dirent.intlf_sz == 0) {
		chunk_data_bytes = LBN_TO_BYTE(1, vp->v_vfsp);
		/*
		 * Optimization: If our pagesize is a multiple of LBN
		 * bytes, we can avoid breaking up a page into individual
		 * lbn-sized requests.
		 */
		if (PAGESIZE % chunk_data_bytes == 0) {
			chunk_lbn_count = BYTE_TO_LBN(PAGESIZE, vp->v_vfsp);
			chunk_data_bytes = PAGESIZE;
		}
	} else {
		chunk_data_bytes =
		    LBN_TO_BYTE(hp->hs_dirent.intlf_sz, vp->v_vfsp);
	}

reread:
	err = 0;
	pagefound = 0;
	calcdone = 0;

	/*
	 * Do some read-ahead. This mostly saves us a bit of
	 * system cpu time more than anything else when doing
	 * sequential reads. At some point, could do the
	 * read-ahead asynchronously which might gain us something
	 * on wall time, but it seems unlikely....
	 *
	 * We do the easy case here, which is to read through
	 * the end of the chunk, minus whatever's at the end that
	 * won't exactly fill a page.
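	 *
	 * For example (illustrative numbers only): with an 8K PAGESIZE,
	 * a 56K chunk_data_bytes, off = 20K and len = 4K, the request
	 * ends in chunk 0, so extension = 56K - 20K = 36K, which is then
	 * trimmed to 32K to keep it a whole number of pages.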
	 */
	if (hp->hs_ra_bytes > 0 && chunk_data_bytes != PAGESIZE) {
		which_chunk_lbn = (off + len) / chunk_data_bytes;
		extension = ((which_chunk_lbn + 1) * chunk_data_bytes) - off;
		extension -= (extension % PAGESIZE);
	} else {
		extension = roundup(len, PAGESIZE);
	}

	atomic_inc_64(&fsp->total_pages_requested);

	pp = NULL;
again:
	/* search for page in buffer */
	if ((pagefound = page_exists(vp, off)) == 0) {
		/*
		 * Need to really do disk IO to get the page.
		 */
		if (!calcdone) {
			extension += hp->hs_ra_bytes;

			/*
			 * Some cd writers don't write sectors that aren't
			 * used. Also, there's no point in reading sectors
			 * we'll never look at.  So, if we're asked to go
			 * beyond the end of a file, truncate to the length
			 * of that file.
			 *
			 * Additionally, this behaviour is required by section
			 * 6.4.5 of ISO 9660:1988(E).
			 */
			len = MIN(extension ? extension : PAGESIZE,
			    filsiz - off);

			/* A little paranoia. */
			ASSERT(len > 0);

			/*
			 * After all that, make sure we're asking for things
			 * in units that bdev_strategy() will understand
			 * (see bug 4202551).
			 */
			len = roundup(len, DEV_BSIZE);
			calcdone = 1;
		}

		pp = pvn_read_kluster(vp, off, seg, addr, &io_off_tmp,
		    &io_len_tmp, off, len, 0);

		if (pp == NULL) {
			/*
			 * Pressure on memory, roll back readahead
			 */
			hp->hs_num_contig = 0;
			hp->hs_ra_bytes = 0;
			hp->hs_prev_offset = 0;
			goto again;
		}

		io_off = (uint_t)io_off_tmp;
		io_len = (uint_t)io_len_tmp;

		/* check for truncation */
		/*
		 * xxx Clean up and return EIO instead?
		 * xxx Ought to go to u_offset_t for everything, but we
		 * xxx call lots of things that want uint_t arguments.
		 */
		ASSERT(io_off == io_off_tmp);

		/*
		 * get enough buffers for worst-case scenario
		 * (i.e., no coalescing possible).
		 */
		bufcnt = (len + secsize - 1) / secsize;
		bufs = kmem_zalloc(bufcnt * sizeof (struct buf), KM_SLEEP);
		vas = kmem_alloc(bufcnt * sizeof (caddr_t), KM_SLEEP);

		/*
		 * Allocate an array of semaphores if we are doing I/O
		 * scheduling.
		 */
		if (fsp->hqueue != NULL)
			fio_done = kmem_alloc(bufcnt * sizeof (ksema_t),
			    KM_SLEEP);
		for (count = 0; count < bufcnt; count++) {
			bioinit(&bufs[count]);
			bufs[count].b_edev = devvp->v_rdev;
			bufs[count].b_dev = cmpdev(devvp->v_rdev);
			bufs[count].b_flags = B_NOCACHE|B_BUSY|B_READ;
			bufs[count].b_iodone = hsfs_iodone;
			bufs[count].b_vp = vp;
			bufs[count].b_file = vp;
		}

		/*
		 * If our filesize is not an integer multiple of PAGESIZE,
		 * we zero that part of the last page that's between EOF and
		 * the PAGESIZE boundary.
		 */
		xlen = io_len & PAGEOFFSET;
		if (xlen != 0)
			pagezero(pp->p_prev, xlen, PAGESIZE - xlen);

		va = NULL;
		lastp = NULL;
		searchp = pp;
		io_end = io_off + io_len;
		for (count = 0, byte_offset = io_off;
		    byte_offset < io_end; count++) {
			ASSERT(count < bufcnt);

			/* Compute disk address for interleaving. */

			/* considered without skips */
			which_chunk_lbn = byte_offset / chunk_data_bytes;

			/* factor in skips */
			offset_lbn = which_chunk_lbn * chunk_lbn_count;

			/* convert to physical byte offset for lbn */
			offset_bytes = LBN_TO_BYTE(offset_lbn, vp->v_vfsp);

			/* don't forget offset into lbn */
			offset_extra = byte_offset % chunk_data_bytes;

			/* get virtual block number for driver */
			driver_block =
			    lbtodb(bof + xarsiz + offset_bytes + offset_extra);

			if (lastp != searchp) {
				/* this branch taken first time through loop */
				va = vas[count] =
				    ppmapin(searchp, PROT_WRITE, (caddr_t)-1);
				/* ppmapin() guarantees not to return NULL */
			} else {
				vas[count] = NULL;
			}

			bufs[count].b_un.b_addr = va + byte_offset % PAGESIZE;
			bufs[count].b_offset =
			    (offset_t)(byte_offset - io_off + off);

			/*
			 * We specifically use the b_lblkno member here
			 * as even in the 32 bit world driver_block can
			 * get very large in line with the ISO9660 spec.
13440Sstevel@tonic-gate */ 13450Sstevel@tonic-gate 13460Sstevel@tonic-gate bufs[count].b_lblkno = driver_block; 13470Sstevel@tonic-gate 13484866Sfrankho remaining_bytes = 13494866Sfrankho ((which_chunk_lbn + 1) * chunk_data_bytes) 13504866Sfrankho - byte_offset; 13510Sstevel@tonic-gate 13520Sstevel@tonic-gate /* 13530Sstevel@tonic-gate * remaining_bytes can't be zero, as we derived 13540Sstevel@tonic-gate * which_chunk_lbn directly from byte_offset. 13550Sstevel@tonic-gate */ 13561349Speterte if ((remaining_bytes + byte_offset) < (off + len)) { 13570Sstevel@tonic-gate /* coalesce-read the rest of the chunk */ 13580Sstevel@tonic-gate bufs[count].b_bcount = remaining_bytes; 13590Sstevel@tonic-gate } else { 13600Sstevel@tonic-gate /* get the final bits */ 13610Sstevel@tonic-gate bufs[count].b_bcount = off + len - byte_offset; 13620Sstevel@tonic-gate } 13630Sstevel@tonic-gate 13640Sstevel@tonic-gate /* 13650Sstevel@tonic-gate * It would be nice to do multiple pages' 13660Sstevel@tonic-gate * worth at once here when the opportunity 13670Sstevel@tonic-gate * arises, as that has been shown to improve 13680Sstevel@tonic-gate * our wall time. However, to do that 13690Sstevel@tonic-gate * requires that we use the pageio subsystem, 13700Sstevel@tonic-gate * which doesn't mix well with what we're 13710Sstevel@tonic-gate * already using here. We can't use pageio 13720Sstevel@tonic-gate * all the time, because that subsystem 13730Sstevel@tonic-gate * assumes that a page is stored in N 13740Sstevel@tonic-gate * contiguous blocks on the device. 13750Sstevel@tonic-gate * Interleaving violates that assumption. 13765312Smg147109 * 13775312Smg147109 * Update: This is now not so big a problem 13785312Smg147109 * because of the I/O scheduler sitting below 13795312Smg147109 * that can re-order and coalesce I/O requests. 13800Sstevel@tonic-gate */ 13810Sstevel@tonic-gate 13820Sstevel@tonic-gate remainder = PAGESIZE - (byte_offset % PAGESIZE); 13830Sstevel@tonic-gate if (bufs[count].b_bcount > remainder) { 13840Sstevel@tonic-gate bufs[count].b_bcount = remainder; 13850Sstevel@tonic-gate } 13860Sstevel@tonic-gate 13870Sstevel@tonic-gate bufs[count].b_bufsize = bufs[count].b_bcount; 13881349Speterte if (((offset_t)byte_offset + bufs[count].b_bcount) > 13894866Sfrankho HS_MAXFILEOFF) { 13901349Speterte break; 13911349Speterte } 13920Sstevel@tonic-gate byte_offset += bufs[count].b_bcount; 13930Sstevel@tonic-gate 13945312Smg147109 if (fsp->hqueue == NULL) { 13955312Smg147109 (void) bdev_strategy(&bufs[count]); 13965312Smg147109 13975312Smg147109 } else { 13985312Smg147109 /* 13995312Smg147109 * We are scheduling I/O so we need to enqueue 14005312Smg147109 * requests rather than calling bdev_strategy 14015312Smg147109 * here. A later invocation of the scheduling 14025312Smg147109 * function will take care of doing the actual 14035312Smg147109 * I/O as it selects requests from the queue as 14045312Smg147109 * per the scheduling logic. 
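 * Each request is wrapped in a struct hio that records the buf,
 * its completion semaphore, the logical block number and block
 * count, and an enqueue timestamp used for deadline handling;
 * a later call to hsched_invoke_strategy() dequeues and issues it.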
14055312Smg147109 */ 14065312Smg147109 struct hio *hsio = kmem_cache_alloc(hio_cache, 14075312Smg147109 KM_SLEEP); 14085312Smg147109 14095312Smg147109 sema_init(&fio_done[count], 0, NULL, 14105312Smg147109 SEMA_DEFAULT, NULL); 14115312Smg147109 hsio->bp = &bufs[count]; 14125312Smg147109 hsio->sema = &fio_done[count]; 14135312Smg147109 hsio->io_lblkno = bufs[count].b_lblkno; 14145312Smg147109 hsio->nblocks = howmany(hsio->bp->b_bcount, 14155312Smg147109 DEV_BSIZE); 14165312Smg147109 14175312Smg147109 /* used for deadline */ 14185312Smg147109 hsio->io_timestamp = 14195312Smg147109 drv_hztousec(ddi_get_lbolt()); 14205312Smg147109 14215312Smg147109 /* for I/O coalescing */ 14225312Smg147109 hsio->contig_chain = NULL; 14235312Smg147109 hsched_enqueue_io(fsp, hsio, 0); 14245312Smg147109 } 14250Sstevel@tonic-gate 14260Sstevel@tonic-gate lwp_stat_update(LWP_STAT_INBLK, 1); 14270Sstevel@tonic-gate lastp = searchp; 14280Sstevel@tonic-gate if ((remainder - bufs[count].b_bcount) < 1) { 14290Sstevel@tonic-gate searchp = searchp->p_next; 14300Sstevel@tonic-gate } 14310Sstevel@tonic-gate } 14320Sstevel@tonic-gate 14330Sstevel@tonic-gate bufsused = count; 14340Sstevel@tonic-gate /* Now wait for everything to come in */ 14355312Smg147109 if (fsp->hqueue == NULL) { 14365312Smg147109 for (count = 0; count < bufsused; count++) { 14375312Smg147109 if (err == 0) { 14385312Smg147109 err = biowait(&bufs[count]); 14395312Smg147109 } else 14405312Smg147109 (void) biowait(&bufs[count]); 14415312Smg147109 } 14425312Smg147109 } else { 14435312Smg147109 for (count = 0; count < bufsused; count++) { 14445312Smg147109 struct buf *wbuf; 14455312Smg147109 14465312Smg147109 /* 14475312Smg147109 * Invoke scheduling function till our buf 14485312Smg147109 * is processed. In doing this it might 14495312Smg147109 * process bufs enqueued by other threads 14505312Smg147109 * which is good. 14515312Smg147109 */ 14525312Smg147109 wbuf = &bufs[count]; 14535312Smg147109 DTRACE_PROBE1(hsfs_io_wait, struct buf *, wbuf); 14545312Smg147109 while (sema_tryp(&fio_done[count]) == 0) { 14555312Smg147109 /* 14565312Smg147109 * hsched_invoke_strategy will return 1 14575312Smg147109 * if the I/O queue is empty. This means 14585312Smg147109 * that there is another thread who has 14595312Smg147109 * issued our buf and is waiting. So we 14605312Smg147109 * just block instead of spinning. 
14615312Smg147109 */ 14625312Smg147109 if (hsched_invoke_strategy(fsp)) { 14635312Smg147109 sema_p(&fio_done[count]); 14645312Smg147109 break; 14655312Smg147109 } 14665312Smg147109 } 14675312Smg147109 sema_destroy(&fio_done[count]); 14685312Smg147109 DTRACE_PROBE1(hsfs_io_done, struct buf *, wbuf); 14695312Smg147109 14705312Smg147109 if (err == 0) { 14715312Smg147109 err = geterror(wbuf); 14725312Smg147109 } 14735312Smg147109 } 14745312Smg147109 kmem_free(fio_done, bufcnt * sizeof (ksema_t)); 14750Sstevel@tonic-gate } 14760Sstevel@tonic-gate 14770Sstevel@tonic-gate /* Don't leak resources */ 14780Sstevel@tonic-gate for (count = 0; count < bufcnt; count++) { 14795312Smg147109 biofini(&bufs[count]); 14800Sstevel@tonic-gate if (count < bufsused && vas[count] != NULL) { 14810Sstevel@tonic-gate ppmapout(vas[count]); 14820Sstevel@tonic-gate } 14830Sstevel@tonic-gate } 14840Sstevel@tonic-gate 14850Sstevel@tonic-gate kmem_free(vas, bufcnt * sizeof (caddr_t)); 14860Sstevel@tonic-gate kmem_free(bufs, bufcnt * sizeof (struct buf)); 14870Sstevel@tonic-gate } 14880Sstevel@tonic-gate 14890Sstevel@tonic-gate if (err) { 14900Sstevel@tonic-gate pvn_read_done(pp, B_ERROR); 14910Sstevel@tonic-gate return (err); 14920Sstevel@tonic-gate } 14930Sstevel@tonic-gate 14940Sstevel@tonic-gate /* 14950Sstevel@tonic-gate * Lock the requested page, and the one after it if possible. 14960Sstevel@tonic-gate * Don't bother if our caller hasn't given us a place to stash 14970Sstevel@tonic-gate * the page pointers, since otherwise we'd lock pages that would 14980Sstevel@tonic-gate * never get unlocked. 14990Sstevel@tonic-gate */ 15000Sstevel@tonic-gate if (pagefound) { 15010Sstevel@tonic-gate int index; 15020Sstevel@tonic-gate ulong_t soff; 15030Sstevel@tonic-gate 15040Sstevel@tonic-gate /* 15050Sstevel@tonic-gate * Make sure it's in memory before we say it's here. 15060Sstevel@tonic-gate */ 15070Sstevel@tonic-gate if ((pp = page_lookup(vp, off, SE_SHARED)) == NULL) { 15080Sstevel@tonic-gate hsfs_lostpage++; 15090Sstevel@tonic-gate goto reread; 15100Sstevel@tonic-gate } 15110Sstevel@tonic-gate 15120Sstevel@tonic-gate pl[0] = pp; 15130Sstevel@tonic-gate index = 1; 15145312Smg147109 atomic_inc_64(&fsp->cache_read_pages); 15150Sstevel@tonic-gate 15160Sstevel@tonic-gate /* 15170Sstevel@tonic-gate * Try to lock the next page, if it exists, without 15180Sstevel@tonic-gate * blocking. 15190Sstevel@tonic-gate */ 15200Sstevel@tonic-gate plsz -= PAGESIZE; 15210Sstevel@tonic-gate /* LINTED (plsz is unsigned) */ 15220Sstevel@tonic-gate for (soff = off + PAGESIZE; plsz > 0; 15230Sstevel@tonic-gate soff += PAGESIZE, plsz -= PAGESIZE) { 15240Sstevel@tonic-gate pp = page_lookup_nowait(vp, (u_offset_t)soff, 15254866Sfrankho SE_SHARED); 15260Sstevel@tonic-gate if (pp == NULL) 15270Sstevel@tonic-gate break; 15280Sstevel@tonic-gate pl[index++] = pp; 15290Sstevel@tonic-gate } 15300Sstevel@tonic-gate pl[index] = NULL; 15315312Smg147109 15325312Smg147109 /* 15335312Smg147109 * Schedule a semi-asynchronous readahead if we are 15345312Smg147109 * accessing the last cached page for the current 15355312Smg147109 * file. 15365312Smg147109 * 15375312Smg147109 * Doing this here means that readaheads will be 15385312Smg147109 * issued only if cache-hits occur. This is an advantage 15395312Smg147109 * since cache-hits would mean that readahead is giving 15405312Smg147109 * the desired benefit. If cache-hits do not occur there 15415312Smg147109 * is no point in reading ahead of time - the system 15425312Smg147109 * is loaded anyway. 
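 * The check below fires only when the page being returned is
 * the last one before hs_prev_offset (i.e. the caller has reached
 * the edge of the data read so far), the next offset still lies
 * within the file, a non-zero readahead quantum has been earned,
 * and the page at hs_prev_offset is not already cached.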
15435312Smg147109 */ 15445312Smg147109 if (fsp->hqueue != NULL && 15455312Smg147109 hp->hs_prev_offset - off == PAGESIZE && 15465312Smg147109 hp->hs_prev_offset < filsiz && 15475312Smg147109 hp->hs_ra_bytes > 0 && 15485312Smg147109 !page_exists(vp, hp->hs_prev_offset)) { 15495312Smg147109 (void) hsfs_getpage_ra(vp, hp->hs_prev_offset, seg, 15505312Smg147109 addr + PAGESIZE, hp, fsp, xarsiz, bof, 15515312Smg147109 chunk_lbn_count, chunk_data_bytes); 15525312Smg147109 } 15535312Smg147109 15540Sstevel@tonic-gate return (0); 15550Sstevel@tonic-gate } 15560Sstevel@tonic-gate 15570Sstevel@tonic-gate if (pp != NULL) { 15580Sstevel@tonic-gate pvn_plist_init(pp, pl, plsz, off, io_len, rw); 15590Sstevel@tonic-gate } 15600Sstevel@tonic-gate 15610Sstevel@tonic-gate return (err); 15620Sstevel@tonic-gate } 15630Sstevel@tonic-gate 15645331Samw /*ARGSUSED*/ 15650Sstevel@tonic-gate static int 15660Sstevel@tonic-gate hsfs_getpage( 15670Sstevel@tonic-gate struct vnode *vp, 15680Sstevel@tonic-gate offset_t off, 15690Sstevel@tonic-gate size_t len, 15700Sstevel@tonic-gate uint_t *protp, 15710Sstevel@tonic-gate struct page *pl[], 15720Sstevel@tonic-gate size_t plsz, 15730Sstevel@tonic-gate struct seg *seg, 15740Sstevel@tonic-gate caddr_t addr, 15750Sstevel@tonic-gate enum seg_rw rw, 15765331Samw struct cred *cred, 15775331Samw caller_context_t *ct) 15780Sstevel@tonic-gate { 15790Sstevel@tonic-gate int err; 15800Sstevel@tonic-gate uint_t filsiz; 15815312Smg147109 struct hsfs *fsp; 15825312Smg147109 struct hsnode *hp; 15835312Smg147109 15845312Smg147109 fsp = VFS_TO_HSFS(vp->v_vfsp); 15855312Smg147109 hp = VTOH(vp); 15860Sstevel@tonic-gate 15870Sstevel@tonic-gate /* does not support write */ 15880Sstevel@tonic-gate if (rw == S_WRITE) { 15890Sstevel@tonic-gate panic("write attempt on READ ONLY HSFS"); 15900Sstevel@tonic-gate /*NOTREACHED*/ 15910Sstevel@tonic-gate } 15920Sstevel@tonic-gate 15930Sstevel@tonic-gate if (vp->v_flag & VNOMAP) { 15940Sstevel@tonic-gate return (ENOSYS); 15950Sstevel@tonic-gate } 15960Sstevel@tonic-gate 15971349Speterte ASSERT(off <= HS_MAXFILEOFF); 15980Sstevel@tonic-gate 15990Sstevel@tonic-gate /* 16000Sstevel@tonic-gate * Determine file data size for EOF check. 16010Sstevel@tonic-gate */ 16020Sstevel@tonic-gate filsiz = hp->hs_dirent.ext_size; 16030Sstevel@tonic-gate if ((off + len) > (offset_t)(filsiz + PAGEOFFSET) && seg != segkmap) 16040Sstevel@tonic-gate return (EFAULT); /* beyond EOF */ 16050Sstevel@tonic-gate 16065312Smg147109 /* 16075312Smg147109 * Async Read-ahead computation. 16085312Smg147109 * This attempts to detect sequential access pattern and 16095312Smg147109 * enables reading extra pages ahead of time. 16105312Smg147109 */ 16115312Smg147109 if (fsp->hqueue != NULL) { 16125312Smg147109 /* 16135312Smg147109 * This check for sequential access also takes into 16145312Smg147109 * account segmap weirdness when reading in chunks 16155312Smg147109 * less than the segmap size of 8K. 16165312Smg147109 */ 16175312Smg147109 if (hp->hs_prev_offset == off || (off < 16185312Smg147109 hp->hs_prev_offset && off + MAX(len, PAGESIZE) 16195312Smg147109 >= hp->hs_prev_offset)) { 16205312Smg147109 if (hp->hs_num_contig < 16215312Smg147109 (seq_contig_requests - 1)) { 16225312Smg147109 hp->hs_num_contig++; 16235312Smg147109 16245312Smg147109 } else { 16255312Smg147109 /* 16265312Smg147109 * We increase readahead quantum till 16275312Smg147109 * a predefined max. max_readahead_bytes 16285312Smg147109 * is a multiple of PAGESIZE. 
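 * (The cap is hqueue->max_ra_bytes.) Once seq_contig_requests
 * consecutive sequential requests have been seen, hs_ra_bytes
 * grows by one page per further sequential request until it
 * reaches that cap; non-sequential requests shrink it again one
 * page at a time.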
16295312Smg147109 */ 16305312Smg147109 if (hp->hs_ra_bytes < 16315312Smg147109 fsp->hqueue->max_ra_bytes) { 16325312Smg147109 hp->hs_ra_bytes += PAGESIZE; 16335312Smg147109 } 16345312Smg147109 } 16355312Smg147109 } else { 16365312Smg147109 /* 16375312Smg147109 * Not contiguous, so reduce the readahead counters. 16385312Smg147109 */ 16395312Smg147109 if (hp->hs_ra_bytes > 0) 16405312Smg147109 hp->hs_ra_bytes -= PAGESIZE; 16415312Smg147109 16425312Smg147109 if (hp->hs_ra_bytes <= 0) { 16435312Smg147109 hp->hs_ra_bytes = 0; 16445312Smg147109 if (hp->hs_num_contig > 0) 16455312Smg147109 hp->hs_num_contig--; 16465312Smg147109 } 16475312Smg147109 } 16485312Smg147109 /* 16495312Smg147109 * Length must be rounded up to a page boundary, 16505312Smg147109 * since we read in units of pages. 16515312Smg147109 */ 16525312Smg147109 hp->hs_prev_offset = off + roundup(len, PAGESIZE); 16535312Smg147109 DTRACE_PROBE1(hsfs_compute_ra, struct hsnode *, hp); 16545312Smg147109 } 16550Sstevel@tonic-gate if (protp != NULL) 16560Sstevel@tonic-gate *protp = PROT_ALL; 16570Sstevel@tonic-gate 16580Sstevel@tonic-gate if (len <= PAGESIZE) 16590Sstevel@tonic-gate err = hsfs_getapage(vp, (u_offset_t)off, len, protp, pl, plsz, 16600Sstevel@tonic-gate seg, addr, rw, cred); 16610Sstevel@tonic-gate else 16620Sstevel@tonic-gate err = pvn_getpages(hsfs_getapage, vp, off, len, protp, 16630Sstevel@tonic-gate pl, plsz, seg, addr, rw, cred); 16640Sstevel@tonic-gate 16650Sstevel@tonic-gate return (err); 16660Sstevel@tonic-gate } 16670Sstevel@tonic-gate 16680Sstevel@tonic-gate 16690Sstevel@tonic-gate 16700Sstevel@tonic-gate /* 16710Sstevel@tonic-gate * This function should never be called. We need to have it to pass 16720Sstevel@tonic-gate * it as an argument to other functions. 16730Sstevel@tonic-gate */ 16740Sstevel@tonic-gate /*ARGSUSED*/ 16750Sstevel@tonic-gate int 16760Sstevel@tonic-gate hsfs_putapage( 16770Sstevel@tonic-gate vnode_t *vp, 16780Sstevel@tonic-gate page_t *pp, 16790Sstevel@tonic-gate u_offset_t *offp, 16800Sstevel@tonic-gate size_t *lenp, 16810Sstevel@tonic-gate int flags, 16820Sstevel@tonic-gate cred_t *cr) 16830Sstevel@tonic-gate { 16840Sstevel@tonic-gate /* should never happen - just destroy it */ 16850Sstevel@tonic-gate cmn_err(CE_NOTE, "hsfs_putapage: dirty HSFS page"); 16860Sstevel@tonic-gate pvn_write_done(pp, B_ERROR | B_WRITE | B_INVAL | B_FORCE | flags); 16870Sstevel@tonic-gate return (0); 16880Sstevel@tonic-gate } 16890Sstevel@tonic-gate 16900Sstevel@tonic-gate 16910Sstevel@tonic-gate /* 16920Sstevel@tonic-gate * The only flags we support are B_INVAL, B_FREE and B_DONTNEED. 16930Sstevel@tonic-gate * B_INVAL is set by: 16940Sstevel@tonic-gate * 16950Sstevel@tonic-gate * 1) the MC_SYNC command of memcntl(2) to support the MS_INVALIDATE flag. 16960Sstevel@tonic-gate * 2) the MC_ADVISE command of memcntl(2) with the MADV_DONTNEED advice 16970Sstevel@tonic-gate * which translates to an MC_SYNC with the MS_INVALIDATE flag. 16980Sstevel@tonic-gate * 16990Sstevel@tonic-gate * The B_FREE (as well as the B_DONTNEED) flag is set when the 17000Sstevel@tonic-gate * MADV_SEQUENTIAL advice has been used. VOP_PUTPAGE is invoked 17010Sstevel@tonic-gate * from SEGVN to release pages behind a pagefault.
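 * Since HSFS is read-only, pages handed to hsfs_putpage should
 * never actually be dirty; the routine below only frees or
 * invalidates them, and any page that does turn up dirty is
 * destroyed via a faked failed I/O (see hsfs_putapage above).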
17020Sstevel@tonic-gate */ 17030Sstevel@tonic-gate /*ARGSUSED*/ 17040Sstevel@tonic-gate static int 17050Sstevel@tonic-gate hsfs_putpage( 17065331Samw struct vnode *vp, 17075331Samw offset_t off, 17085331Samw size_t len, 17095331Samw int flags, 17105331Samw struct cred *cr, 17115331Samw caller_context_t *ct) 17120Sstevel@tonic-gate { 17130Sstevel@tonic-gate int error = 0; 17140Sstevel@tonic-gate 17150Sstevel@tonic-gate if (vp->v_count == 0) { 17160Sstevel@tonic-gate panic("hsfs_putpage: bad v_count"); 17170Sstevel@tonic-gate /*NOTREACHED*/ 17180Sstevel@tonic-gate } 17190Sstevel@tonic-gate 17200Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 17210Sstevel@tonic-gate return (ENOSYS); 17220Sstevel@tonic-gate 17231349Speterte ASSERT(off <= HS_MAXFILEOFF); 17240Sstevel@tonic-gate 17250Sstevel@tonic-gate if (!vn_has_cached_data(vp)) /* no pages mapped */ 17260Sstevel@tonic-gate return (0); 17270Sstevel@tonic-gate 17284866Sfrankho if (len == 0) { /* from 'off' to EOF */ 17294866Sfrankho error = pvn_vplist_dirty(vp, off, hsfs_putapage, flags, cr); 17304866Sfrankho } else { 17310Sstevel@tonic-gate offset_t end_off = off + len; 17320Sstevel@tonic-gate offset_t file_size = VTOH(vp)->hs_dirent.ext_size; 17330Sstevel@tonic-gate offset_t io_off; 17340Sstevel@tonic-gate 17350Sstevel@tonic-gate file_size = (file_size + PAGESIZE - 1) & PAGEMASK; 17360Sstevel@tonic-gate if (end_off > file_size) 17370Sstevel@tonic-gate end_off = file_size; 17380Sstevel@tonic-gate 17390Sstevel@tonic-gate for (io_off = off; io_off < end_off; io_off += PAGESIZE) { 17400Sstevel@tonic-gate page_t *pp; 17410Sstevel@tonic-gate 17420Sstevel@tonic-gate /* 17430Sstevel@tonic-gate * We insist on getting the page only if we are 17440Sstevel@tonic-gate * about to invalidate, free or write it and 17450Sstevel@tonic-gate * the B_ASYNC flag is not set. 17460Sstevel@tonic-gate */ 17470Sstevel@tonic-gate if ((flags & B_INVAL) || ((flags & B_ASYNC) == 0)) { 17480Sstevel@tonic-gate pp = page_lookup(vp, io_off, 17494866Sfrankho (flags & (B_INVAL | B_FREE)) ? 17504866Sfrankho SE_EXCL : SE_SHARED); 17510Sstevel@tonic-gate } else { 17520Sstevel@tonic-gate pp = page_lookup_nowait(vp, io_off, 17534866Sfrankho (flags & B_FREE) ? SE_EXCL : SE_SHARED); 17540Sstevel@tonic-gate } 17550Sstevel@tonic-gate 17560Sstevel@tonic-gate if (pp == NULL) 17570Sstevel@tonic-gate continue; 17585312Smg147109 17590Sstevel@tonic-gate /* 17600Sstevel@tonic-gate * Normally pvn_getdirty() should return 0, which 17610Sstevel@tonic-gate * implies that it has done the job for us. 17620Sstevel@tonic-gate * The shouldn't-happen scenario is when it returns 1. 17630Sstevel@tonic-gate * This means that the page has been modified and 17640Sstevel@tonic-gate * needs to be put back. 17650Sstevel@tonic-gate * Since we can't write on a CD, we fake a failed 17660Sstevel@tonic-gate * I/O and force pvn_write_done() to destroy the page.
17670Sstevel@tonic-gate */ 17680Sstevel@tonic-gate if (pvn_getdirty(pp, flags) == 1) { 17690Sstevel@tonic-gate cmn_err(CE_NOTE, 17704866Sfrankho "hsfs_putpage: dirty HSFS page"); 17710Sstevel@tonic-gate pvn_write_done(pp, flags | 17720Sstevel@tonic-gate B_ERROR | B_WRITE | B_INVAL | B_FORCE); 17730Sstevel@tonic-gate } 17740Sstevel@tonic-gate } 17750Sstevel@tonic-gate } 17760Sstevel@tonic-gate return (error); 17770Sstevel@tonic-gate } 17780Sstevel@tonic-gate 17790Sstevel@tonic-gate 17800Sstevel@tonic-gate /*ARGSUSED*/ 17810Sstevel@tonic-gate static int 17820Sstevel@tonic-gate hsfs_map( 17830Sstevel@tonic-gate struct vnode *vp, 17840Sstevel@tonic-gate offset_t off, 17850Sstevel@tonic-gate struct as *as, 17860Sstevel@tonic-gate caddr_t *addrp, 17870Sstevel@tonic-gate size_t len, 17880Sstevel@tonic-gate uchar_t prot, 17890Sstevel@tonic-gate uchar_t maxprot, 17900Sstevel@tonic-gate uint_t flags, 17915331Samw struct cred *cred, 17925331Samw caller_context_t *ct) 17930Sstevel@tonic-gate { 17940Sstevel@tonic-gate struct segvn_crargs vn_a; 17950Sstevel@tonic-gate int error; 17960Sstevel@tonic-gate 17970Sstevel@tonic-gate /* VFS_RECORD(vp->v_vfsp, VS_MAP, VS_CALL); */ 17980Sstevel@tonic-gate 17990Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 18000Sstevel@tonic-gate return (ENOSYS); 18010Sstevel@tonic-gate 18021349Speterte if (off > HS_MAXFILEOFF || off < 0 || 18031349Speterte (off + len) < 0 || (off + len) > HS_MAXFILEOFF) 1804143Speterte return (ENXIO); 18050Sstevel@tonic-gate 18060Sstevel@tonic-gate if (vp->v_type != VREG) { 18070Sstevel@tonic-gate return (ENODEV); 18080Sstevel@tonic-gate } 18090Sstevel@tonic-gate 18100Sstevel@tonic-gate /* 18110Sstevel@tonic-gate * If file is being locked, disallow mapping. 18120Sstevel@tonic-gate */ 18130Sstevel@tonic-gate if (vn_has_mandatory_locks(vp, VTOH(vp)->hs_dirent.mode)) 18140Sstevel@tonic-gate return (EAGAIN); 18150Sstevel@tonic-gate 18160Sstevel@tonic-gate as_rangelock(as); 18170Sstevel@tonic-gate 18180Sstevel@tonic-gate if ((flags & MAP_FIXED) == 0) { 18190Sstevel@tonic-gate map_addr(addrp, len, off, 1, flags); 18200Sstevel@tonic-gate if (*addrp == NULL) { 18210Sstevel@tonic-gate as_rangeunlock(as); 18220Sstevel@tonic-gate return (ENOMEM); 18230Sstevel@tonic-gate } 18240Sstevel@tonic-gate } else { 18250Sstevel@tonic-gate /* 18260Sstevel@tonic-gate * User specified address - blow away any previous mappings 18270Sstevel@tonic-gate */ 18280Sstevel@tonic-gate (void) as_unmap(as, *addrp, len); 18290Sstevel@tonic-gate } 18300Sstevel@tonic-gate 18310Sstevel@tonic-gate vn_a.vp = vp; 18320Sstevel@tonic-gate vn_a.offset = off; 18330Sstevel@tonic-gate vn_a.type = flags & MAP_TYPE; 18340Sstevel@tonic-gate vn_a.prot = prot; 18350Sstevel@tonic-gate vn_a.maxprot = maxprot; 18360Sstevel@tonic-gate vn_a.flags = flags & ~MAP_TYPE; 18370Sstevel@tonic-gate vn_a.cred = cred; 18380Sstevel@tonic-gate vn_a.amp = NULL; 18390Sstevel@tonic-gate vn_a.szc = 0; 18400Sstevel@tonic-gate vn_a.lgrp_mem_policy_flags = 0; 18410Sstevel@tonic-gate 18420Sstevel@tonic-gate error = as_map(as, *addrp, len, segvn_create, &vn_a); 18430Sstevel@tonic-gate as_rangeunlock(as); 18440Sstevel@tonic-gate return (error); 18450Sstevel@tonic-gate } 18460Sstevel@tonic-gate 18470Sstevel@tonic-gate /* ARGSUSED */ 18480Sstevel@tonic-gate static int 18490Sstevel@tonic-gate hsfs_addmap( 18500Sstevel@tonic-gate struct vnode *vp, 18510Sstevel@tonic-gate offset_t off, 18520Sstevel@tonic-gate struct as *as, 18530Sstevel@tonic-gate caddr_t addr, 18540Sstevel@tonic-gate size_t len, 18550Sstevel@tonic-gate uchar_t 
prot, 18560Sstevel@tonic-gate uchar_t maxprot, 18570Sstevel@tonic-gate uint_t flags, 18585331Samw struct cred *cr, 18595331Samw caller_context_t *ct) 18600Sstevel@tonic-gate { 18610Sstevel@tonic-gate struct hsnode *hp; 18620Sstevel@tonic-gate 18630Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 18640Sstevel@tonic-gate return (ENOSYS); 18650Sstevel@tonic-gate 18660Sstevel@tonic-gate hp = VTOH(vp); 18670Sstevel@tonic-gate mutex_enter(&hp->hs_contents_lock); 18680Sstevel@tonic-gate hp->hs_mapcnt += btopr(len); 18690Sstevel@tonic-gate mutex_exit(&hp->hs_contents_lock); 18700Sstevel@tonic-gate return (0); 18710Sstevel@tonic-gate } 18720Sstevel@tonic-gate 18730Sstevel@tonic-gate /*ARGSUSED*/ 18740Sstevel@tonic-gate static int 18750Sstevel@tonic-gate hsfs_delmap( 18760Sstevel@tonic-gate struct vnode *vp, 18770Sstevel@tonic-gate offset_t off, 18780Sstevel@tonic-gate struct as *as, 18790Sstevel@tonic-gate caddr_t addr, 18800Sstevel@tonic-gate size_t len, 18810Sstevel@tonic-gate uint_t prot, 18820Sstevel@tonic-gate uint_t maxprot, 18830Sstevel@tonic-gate uint_t flags, 18845331Samw struct cred *cr, 18855331Samw caller_context_t *ct) 18860Sstevel@tonic-gate { 18870Sstevel@tonic-gate struct hsnode *hp; 18880Sstevel@tonic-gate 18890Sstevel@tonic-gate if (vp->v_flag & VNOMAP) 18900Sstevel@tonic-gate return (ENOSYS); 18910Sstevel@tonic-gate 18920Sstevel@tonic-gate hp = VTOH(vp); 18930Sstevel@tonic-gate mutex_enter(&hp->hs_contents_lock); 18940Sstevel@tonic-gate hp->hs_mapcnt -= btopr(len); /* Count released mappings */ 18950Sstevel@tonic-gate ASSERT(hp->hs_mapcnt >= 0); 18960Sstevel@tonic-gate mutex_exit(&hp->hs_contents_lock); 18970Sstevel@tonic-gate return (0); 18980Sstevel@tonic-gate } 18990Sstevel@tonic-gate 19000Sstevel@tonic-gate /* ARGSUSED */ 19010Sstevel@tonic-gate static int 19025331Samw hsfs_seek( 19035331Samw struct vnode *vp, 19045331Samw offset_t ooff, 19055331Samw offset_t *noffp, 19065331Samw caller_context_t *ct) 19070Sstevel@tonic-gate { 19080Sstevel@tonic-gate return ((*noffp < 0 || *noffp > MAXOFFSET_T) ? EINVAL : 0); 19090Sstevel@tonic-gate } 19100Sstevel@tonic-gate 19110Sstevel@tonic-gate /* ARGSUSED */ 19120Sstevel@tonic-gate static int 19130Sstevel@tonic-gate hsfs_frlock( 19140Sstevel@tonic-gate struct vnode *vp, 19150Sstevel@tonic-gate int cmd, 19160Sstevel@tonic-gate struct flock64 *bfp, 19170Sstevel@tonic-gate int flag, 19180Sstevel@tonic-gate offset_t offset, 19190Sstevel@tonic-gate struct flk_callback *flk_cbp, 19205331Samw cred_t *cr, 19215331Samw caller_context_t *ct) 19220Sstevel@tonic-gate { 19230Sstevel@tonic-gate struct hsnode *hp = VTOH(vp); 19240Sstevel@tonic-gate 19250Sstevel@tonic-gate /* 19260Sstevel@tonic-gate * If the file is being mapped, disallow fs_frlock. 19270Sstevel@tonic-gate * We are not holding the hs_contents_lock while checking 19280Sstevel@tonic-gate * hs_mapcnt because the current locking strategy drops all 19290Sstevel@tonic-gate * locks before calling fs_frlock. 19300Sstevel@tonic-gate * So, hs_mapcnt could change before we enter fs_frlock making 19310Sstevel@tonic-gate * it meaningless to have held hs_contents_lock in the first place. 
19320Sstevel@tonic-gate */ 19330Sstevel@tonic-gate if (hp->hs_mapcnt > 0 && MANDLOCK(vp, hp->hs_dirent.mode)) 19340Sstevel@tonic-gate return (EAGAIN); 19350Sstevel@tonic-gate 19365331Samw return (fs_frlock(vp, cmd, bfp, flag, offset, flk_cbp, cr, ct)); 19370Sstevel@tonic-gate } 19380Sstevel@tonic-gate 19395312Smg147109 static int 19405312Smg147109 hsched_deadline_compare(const void *x1, const void *x2) 19415312Smg147109 { 19425312Smg147109 const struct hio *h1 = x1; 19435312Smg147109 const struct hio *h2 = x2; 19445312Smg147109 19455312Smg147109 if (h1->io_timestamp < h2->io_timestamp) 19465312Smg147109 return (-1); 19475312Smg147109 if (h1->io_timestamp > h2->io_timestamp) 19485312Smg147109 return (1); 19495312Smg147109 19505312Smg147109 if (h1->io_lblkno < h2->io_lblkno) 19515312Smg147109 return (-1); 19525312Smg147109 if (h1->io_lblkno > h2->io_lblkno) 19535312Smg147109 return (1); 19545312Smg147109 19555312Smg147109 if (h1 < h2) 19565312Smg147109 return (-1); 19575312Smg147109 if (h1 > h2) 19585312Smg147109 return (1); 19595312Smg147109 19605312Smg147109 return (0); 19615312Smg147109 } 19625312Smg147109 19635312Smg147109 static int 19645312Smg147109 hsched_offset_compare(const void *x1, const void *x2) 19655312Smg147109 { 19665312Smg147109 const struct hio *h1 = x1; 19675312Smg147109 const struct hio *h2 = x2; 19685312Smg147109 19695312Smg147109 if (h1->io_lblkno < h2->io_lblkno) 19705312Smg147109 return (-1); 19715312Smg147109 if (h1->io_lblkno > h2->io_lblkno) 19725312Smg147109 return (1); 19735312Smg147109 19745312Smg147109 if (h1 < h2) 19755312Smg147109 return (-1); 19765312Smg147109 if (h1 > h2) 19775312Smg147109 return (1); 19785312Smg147109 19795312Smg147109 return (0); 19805312Smg147109 } 19815312Smg147109 19825312Smg147109 void 19835312Smg147109 hsched_init_caches(void) 19845312Smg147109 { 19855312Smg147109 hio_cache = kmem_cache_create("hsfs_hio_cache", 19865312Smg147109 sizeof (struct hio), 0, NULL, 19875312Smg147109 NULL, NULL, NULL, NULL, 0); 19885312Smg147109 19895312Smg147109 hio_info_cache = kmem_cache_create("hsfs_hio_info_cache", 19905312Smg147109 sizeof (struct hio_info), 0, NULL, 19915312Smg147109 NULL, NULL, NULL, NULL, 0); 19925312Smg147109 } 19935312Smg147109 19945312Smg147109 void 19955312Smg147109 hsched_fini_caches(void) 19965312Smg147109 { 19975312Smg147109 kmem_cache_destroy(hio_cache); 19985312Smg147109 kmem_cache_destroy(hio_info_cache); 19995312Smg147109 } 20005312Smg147109 20015312Smg147109 /* 20025312Smg147109 * Initialize I/O scheduling structures. This is called via hsfs_mount 20035312Smg147109 */ 20045312Smg147109 void 20055312Smg147109 hsched_init(struct hsfs *fsp, int fsid, struct modlinkage *modlinkage) 20065312Smg147109 { 20075312Smg147109 struct hsfs_queue *hqueue = fsp->hqueue; 20085312Smg147109 struct vnode *vp = fsp->hsfs_devvp; 20095312Smg147109 20105312Smg147109 /* TaskQ name of the form: hsched_task_ + stringof(int) */ 20115312Smg147109 char namebuf[23]; 20125312Smg147109 int error, err; 20135312Smg147109 struct dk_cinfo info; 20145312Smg147109 ldi_handle_t lh; 20155312Smg147109 ldi_ident_t li; 20165312Smg147109 20175312Smg147109 /* 20185312Smg147109 * Default maxtransfer = 16k chunk 20195312Smg147109 */ 20205312Smg147109 hqueue->dev_maxtransfer = 16384; 20215312Smg147109 20225312Smg147109 /* 20235312Smg147109 * Try to fetch the maximum device transfer size. This is used to 20245312Smg147109 * ensure that a coalesced block does not exceed the maxtransfer. 
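 * The sequence below obtains an LDI identity from the module
 * linkage, opens the underlying device by dev_t, issues the
 * DKIOCINFO ioctl and converts dki_maxtransfer (device blocks)
 * to bytes with ldbtob(); if any step fails we keep the 16k
 * default set above.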
20255312Smg147109 */ 20265312Smg147109 err = ldi_ident_from_mod(modlinkage, &li); 20275312Smg147109 if (err) { 20285312Smg147109 cmn_err(CE_NOTE, "hsched_init: Querying device failed"); 20295312Smg147109 cmn_err(CE_NOTE, "hsched_init: ldi_ident_from_mod err=%d\n", 20305312Smg147109 err); 20315312Smg147109 goto set_ra; 20325312Smg147109 } 20335312Smg147109 20345312Smg147109 err = ldi_open_by_dev(&(vp->v_rdev), OTYP_CHR, FREAD, CRED(), &lh, li); 20355312Smg147109 ldi_ident_release(li); 20365312Smg147109 if (err) { 20375312Smg147109 cmn_err(CE_NOTE, "hsched_init: Querying device failed"); 20385312Smg147109 cmn_err(CE_NOTE, "hsched_init: ldi_open err=%d\n", err); 20395312Smg147109 goto set_ra; 20405312Smg147109 } 20415312Smg147109 20425312Smg147109 error = ldi_ioctl(lh, DKIOCINFO, (intptr_t)&info, FKIOCTL, 20435312Smg147109 CRED(), &err); 20445312Smg147109 err = ldi_close(lh, FREAD, CRED()); 20455312Smg147109 if (err) { 20465312Smg147109 cmn_err(CE_NOTE, "hsched_init: Querying device failed"); 20475312Smg147109 cmn_err(CE_NOTE, "hsched_init: ldi_close err=%d\n", err); 20485312Smg147109 } 20495312Smg147109 20505312Smg147109 if (error == 0) { 20515312Smg147109 hqueue->dev_maxtransfer = ldbtob(info.dki_maxtransfer); 20525312Smg147109 } 20535312Smg147109 20545312Smg147109 set_ra: 20555312Smg147109 /* 20565312Smg147109 * Max size of data to read ahead for a sequential access pattern. 20575312Smg147109 * Conservative, to avoid letting the underlying CD drive spin 20585312Smg147109 * down in case the application is reading slowly. 20595312Smg147109 * We read ahead up to a max of 8 pages. 20605312Smg147109 */ 20615312Smg147109 hqueue->max_ra_bytes = PAGESIZE * 8; 20625312Smg147109 20635312Smg147109 mutex_init(&(hqueue->hsfs_queue_lock), NULL, MUTEX_DEFAULT, NULL); 20645312Smg147109 mutex_init(&(hqueue->strategy_lock), NULL, MUTEX_DEFAULT, NULL); 20655312Smg147109 avl_create(&(hqueue->read_tree), hsched_offset_compare, 20665312Smg147109 sizeof (struct hio), offsetof(struct hio, io_offset_node)); 20675312Smg147109 avl_create(&(hqueue->deadline_tree), hsched_deadline_compare, 20685312Smg147109 sizeof (struct hio), offsetof(struct hio, io_deadline_node)); 20695312Smg147109 20705312Smg147109 (void) snprintf(namebuf, sizeof (namebuf), "hsched_task_%d", fsid); 20715312Smg147109 hqueue->ra_task = taskq_create(namebuf, hsfs_taskq_nthreads, 20725312Smg147109 minclsyspri + 2, 1, 104857600 / PAGESIZE, TASKQ_DYNAMIC); 20735312Smg147109 20745312Smg147109 hqueue->next = NULL; 20755312Smg147109 hqueue->nbuf = kmem_zalloc(sizeof (struct buf), KM_SLEEP); 20765312Smg147109 } 20775312Smg147109 20785312Smg147109 void 20795312Smg147109 hsched_fini(struct hsfs_queue *hqueue) 20805312Smg147109 { 20815312Smg147109 if (hqueue != NULL) { 2082*5406Smg147109 /* 2083*5406Smg147109 * Remove the sentinel if there was one. 2084*5406Smg147109 */ 2085*5406Smg147109 if (hqueue->next != NULL) { 2086*5406Smg147109 avl_remove(&hqueue->read_tree, hqueue->next); 2087*5406Smg147109 kmem_cache_free(hio_cache, hqueue->next); 2088*5406Smg147109 } 20895312Smg147109 avl_destroy(&(hqueue->read_tree)); 20905312Smg147109 avl_destroy(&(hqueue->deadline_tree)); 20915312Smg147109 mutex_destroy(&(hqueue->hsfs_queue_lock)); 20925312Smg147109 mutex_destroy(&(hqueue->strategy_lock)); 20935312Smg147109 20945312Smg147109 /* 20955312Smg147109 * If there are any existing readahead threads running, 20965312Smg147109 * taskq_destroy will wait for them to finish.
20975312Smg147109 */ 20985312Smg147109 taskq_destroy(hqueue->ra_task); 20995312Smg147109 kmem_free(hqueue->nbuf, sizeof (struct buf)); 21005312Smg147109 } 21015312Smg147109 } 21025312Smg147109 21035312Smg147109 /* 21045312Smg147109 * Determine if two I/O requests are adjacent to each other so 21055312Smg147109 * that they can be coalesced. 21065312Smg147109 */ 21075312Smg147109 #define IS_ADJACENT(io, nio) \ 21085312Smg147109 (((io)->io_lblkno + (io)->nblocks == (nio)->io_lblkno) && \ 21095312Smg147109 (io)->bp->b_edev == (nio)->bp->b_edev) 21105312Smg147109 21115312Smg147109 /* 21125312Smg147109 * This performs the actual I/O scheduling logic. We use the Circular 21135312Smg147109 * LOOK (C-LOOK) algorithm here. Sort the I/O requests in ascending order of 21145312Smg147109 * logical block number and process them starting with the lowest 21155312Smg147109 * numbered block and progressing towards higher block numbers in the 21165312Smg147109 * queue. Once there are no more higher numbered blocks, start again 21175312Smg147109 * with the lowest one. This is good for CD/DVD as you keep moving 21185312Smg147109 * the head in one direction along the outward spiral track and avoid 21195312Smg147109 * seeks as much as possible. The re-ordering also allows 21205312Smg147109 * us to coalesce adjacent requests into one larger request. 21215312Smg147109 * This is thus essentially a 1-way Elevator with front merging. 21225312Smg147109 * 21235312Smg147109 * In addition, each read request here has a deadline and will be 21245312Smg147109 * processed out of turn if the deadline (500ms) expires. 21255312Smg147109 * 21265312Smg147109 * This function is necessarily serialized via hqueue->strategy_lock. 21275312Smg147109 * This function sits just below hsfs_getapage and processes all read 21285312Smg147109 * requests originating from that function. 21295312Smg147109 */ 21305312Smg147109 int 21315312Smg147109 hsched_invoke_strategy(struct hsfs *fsp) 21325312Smg147109 { 21335312Smg147109 struct hsfs_queue *hqueue; 21345312Smg147109 struct buf *nbuf; 21355312Smg147109 struct hio *fio, *nio, *tio, *prev, *last; 21365312Smg147109 size_t bsize, soffset, offset, data; 21375312Smg147109 int bioret, bufcount; 21385312Smg147109 struct vnode *fvp; 21395312Smg147109 ksema_t *io_done; 21405312Smg147109 caddr_t iodata; 21415312Smg147109 21425312Smg147109 hqueue = fsp->hqueue; 21435312Smg147109 mutex_enter(&hqueue->strategy_lock); 21445312Smg147109 mutex_enter(&hqueue->hsfs_queue_lock); 21455312Smg147109 21465312Smg147109 /* 21475312Smg147109 * Check for Deadline expiration first 21485312Smg147109 */ 21495312Smg147109 fio = avl_first(&hqueue->deadline_tree); 21505312Smg147109 21515312Smg147109 /* 21525312Smg147109 * Paranoid check for empty I/O queue. Both deadline 21535312Smg147109 * and read trees contain the same data sorted in different 21545312Smg147109 * ways. So empty deadline tree = empty read tree. 21555312Smg147109 */ 21565312Smg147109 if (fio == NULL) { 21575312Smg147109 /* 21585312Smg147109 * Remove the sentinel if there was one.
21595312Smg147109 */ 21605312Smg147109 if (hqueue->next != NULL) { 21615312Smg147109 avl_remove(&hqueue->read_tree, hqueue->next); 21625312Smg147109 kmem_cache_free(hio_cache, hqueue->next); 21635312Smg147109 hqueue->next = NULL; 21645312Smg147109 } 21655312Smg147109 mutex_exit(&hqueue->hsfs_queue_lock); 21665312Smg147109 mutex_exit(&hqueue->strategy_lock); 21675312Smg147109 return (1); 21685312Smg147109 } 21695312Smg147109 21705312Smg147109 if (drv_hztousec(ddi_get_lbolt()) - fio->io_timestamp 21715312Smg147109 < HSFS_READ_DEADLINE) { 21725312Smg147109 /* 21735312Smg147109 * Apply standard scheduling logic. This uses the 21745312Smg147109 * C-LOOK approach. Process I/O requests in ascending 21755312Smg147109 * order of logical block address until no subsequent 21765312Smg147109 * higher numbered block request remains. Then start 21775312Smg147109 * again from the lowest numbered block in the queue. 21785312Smg147109 * 21795312Smg147109 * We do this cheaply here by means of a sentinel. 21805312Smg147109 * The last processed I/O structure from the previous 21815312Smg147109 * invocation of this function is left dangling in the 21825312Smg147109 * read_tree so that we can easily scan to the next 21835312Smg147109 * higher numbered request and remove the sentinel. 21845312Smg147109 */ 21855312Smg147109 fio = NULL; 21865312Smg147109 if (hqueue->next != NULL) { 21875312Smg147109 fio = AVL_NEXT(&hqueue->read_tree, hqueue->next); 21885312Smg147109 avl_remove(&hqueue->read_tree, hqueue->next); 21895312Smg147109 kmem_cache_free(hio_cache, hqueue->next); 21905312Smg147109 hqueue->next = NULL; 21915312Smg147109 } 21925312Smg147109 if (fio == NULL) { 21935312Smg147109 fio = avl_first(&hqueue->read_tree); 21945312Smg147109 } 21955312Smg147109 } else if (hqueue->next != NULL) { 21965312Smg147109 DTRACE_PROBE1(hsfs_deadline_expiry, struct hio *, fio); 21975312Smg147109 21985312Smg147109 avl_remove(&hqueue->read_tree, hqueue->next); 21995312Smg147109 kmem_cache_free(hio_cache, hqueue->next); 22005312Smg147109 hqueue->next = NULL; 22015312Smg147109 } 22025312Smg147109 22035312Smg147109 /* 22045312Smg147109 * In addition, we try to coalesce contiguous 22055312Smg147109 * requests into one bigger request. 22065312Smg147109 */ 22075312Smg147109 bufcount = 1; 22085312Smg147109 bsize = ldbtob(fio->nblocks); 22095312Smg147109 fvp = fio->bp->b_file; 22105312Smg147109 nio = AVL_NEXT(&hqueue->read_tree, fio); 22115312Smg147109 tio = fio; 22125312Smg147109 while (nio != NULL && IS_ADJACENT(tio, nio) && 22135312Smg147109 bsize < hqueue->dev_maxtransfer) { 22145312Smg147109 avl_remove(&hqueue->deadline_tree, tio); 22155312Smg147109 avl_remove(&hqueue->read_tree, tio); 22165312Smg147109 tio->contig_chain = nio; 22175312Smg147109 bsize += ldbtob(nio->nblocks); 22185312Smg147109 prev = tio; 22195312Smg147109 tio = nio; 22205312Smg147109 22215312Smg147109 /* 22225312Smg147109 * This check is required to detect the case where 22235312Smg147109 * we are merging adjacent buffers belonging to 22245312Smg147109 * different files. fvp is used to set the b_file 22255312Smg147109 * parameter in the coalesced buf. b_file is used 22265312Smg147109 * by DTrace, so we do not want DTrace to attribute 22275312Smg147109 * requests for two different files to any one file.
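 * When that happens fvp is cleared, and the coalesced buf
 * below is then built with a NULL b_file/b_vp.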
22285312Smg147109 */ 22295312Smg147109 if (fvp && tio->bp->b_file != fvp) { 22305312Smg147109 fvp = NULL; 22315312Smg147109 } 22325312Smg147109 22335312Smg147109 nio = AVL_NEXT(&hqueue->read_tree, nio); 22345312Smg147109 bufcount++; 22355312Smg147109 } 22365312Smg147109 22375312Smg147109 /* 22385312Smg147109 * tio is not removed from the read_tree as it serves as a sentinel 22395312Smg147109 * to cheaply allow us to scan to the next higher numbered I/O 22405312Smg147109 * request. 22415312Smg147109 */ 22425312Smg147109 hqueue->next = tio; 22435312Smg147109 avl_remove(&hqueue->deadline_tree, tio); 22445312Smg147109 mutex_exit(&hqueue->hsfs_queue_lock); 22455312Smg147109 DTRACE_PROBE3(hsfs_io_dequeued, struct hio *, fio, int, bufcount, 22465312Smg147109 size_t, bsize); 22475312Smg147109 22485312Smg147109 /* 22495312Smg147109 * The benefit of coalescing occurs if the savings in I/O outweigh 22505312Smg147109 * the cost of doing the additional work below. 22515312Smg147109 * It was observed that coalescing 2 buffers results in diminishing 22525312Smg147109 * returns, so we do coalescing if we have >2 adjacent bufs. 22535312Smg147109 */ 22545312Smg147109 if (bufcount > hsched_coalesce_min) { 22555312Smg147109 /* 22565312Smg147109 * We have coalesced blocks. First allocate mem and buf for 22575312Smg147109 * the entire coalesced chunk. 22585312Smg147109 * Since we are guaranteed single-threaded here, we pre-allocate 22595312Smg147109 * one buf at mount time and that is re-used every time. This 22605312Smg147109 * is a synthesized buf structure that uses a kmem_alloc'ed chunk. 22615312Smg147109 * Not quite a normal buf attached to pages. 22625312Smg147109 */ 22635312Smg147109 fsp->coalesced_bytes += bsize; 22645312Smg147109 nbuf = hqueue->nbuf; 22655312Smg147109 bioinit(nbuf); 22665312Smg147109 nbuf->b_edev = fio->bp->b_edev; 22675312Smg147109 nbuf->b_dev = fio->bp->b_dev; 22685312Smg147109 nbuf->b_flags = fio->bp->b_flags; 22695312Smg147109 nbuf->b_iodone = fio->bp->b_iodone; 22705312Smg147109 iodata = kmem_alloc(bsize, KM_SLEEP); 22715312Smg147109 nbuf->b_un.b_addr = iodata; 22725312Smg147109 nbuf->b_lblkno = fio->bp->b_lblkno; 22735312Smg147109 nbuf->b_vp = fvp; 22745312Smg147109 nbuf->b_file = fvp; 22755312Smg147109 nbuf->b_bcount = bsize; 22765312Smg147109 nbuf->b_bufsize = bsize; 22775312Smg147109 nbuf->b_resid = bsize; 22785312Smg147109 22795312Smg147109 DTRACE_PROBE3(hsfs_coalesced_io_start, struct hio *, fio, int, 22805312Smg147109 bufcount, size_t, bsize); 22815312Smg147109 22825312Smg147109 /* 22835312Smg147109 * Perform I/O for the coalesced block. 22845312Smg147109 */ 22855312Smg147109 (void) bdev_strategy(nbuf); 22865312Smg147109 22875312Smg147109 /* 22885312Smg147109 * Duplicate the last I/O node to leave the sentinel alone. 22895312Smg147109 * The sentinel is freed in the next invocation of this 22905312Smg147109 * function.
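 * (tio itself stays in the read_tree as hqueue->next, so a
 * private copy of its bp and sema is chained onto prev for the
 * completion processing below.)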
22915312Smg147109 */ 22925312Smg147109 prev->contig_chain = kmem_cache_alloc(hio_cache, KM_SLEEP); 22935312Smg147109 prev->contig_chain->bp = tio->bp; 22945312Smg147109 prev->contig_chain->sema = tio->sema; 22955312Smg147109 tio = prev->contig_chain; 22965312Smg147109 tio->contig_chain = NULL; 22975312Smg147109 soffset = ldbtob(fio->bp->b_lblkno); 22985312Smg147109 nio = fio; 22995312Smg147109 23005312Smg147109 bioret = biowait(nbuf); 23015312Smg147109 data = bsize - nbuf->b_resid; 23025312Smg147109 biofini(nbuf); 23035312Smg147109 mutex_exit(&hqueue->strategy_lock); 23045312Smg147109 23055312Smg147109 /* 23065312Smg147109 * We use the b_resid parameter to detect how much 23075312Smg147109 * data was successfully transferred. We will signal 23085312Smg147109 * success to all the original bufs that were fully 23095312Smg147109 * retrieved; the rest, if any, are signaled as 23105312Smg147109 * errors. 23115312Smg147109 */ 23125312Smg147109 tio = nio; 23135312Smg147109 DTRACE_PROBE3(hsfs_coalesced_io_done, struct hio *, nio, 23145312Smg147109 int, bioret, size_t, data); 23155312Smg147109 23165312Smg147109 /* 23175312Smg147109 * Copy data and signal success to all the bufs 23185312Smg147109 * which can be fully satisfied from b_resid. 23195312Smg147109 */ 23205312Smg147109 while (nio != NULL && data >= nio->bp->b_bcount) { 23215312Smg147109 offset = ldbtob(nio->bp->b_lblkno) - soffset; 23225312Smg147109 bcopy(iodata + offset, nio->bp->b_un.b_addr, 23235312Smg147109 nio->bp->b_bcount); 23245312Smg147109 data -= nio->bp->b_bcount; 23255312Smg147109 bioerror(nio->bp, 0); 23265312Smg147109 biodone(nio->bp); 23275312Smg147109 sema_v(nio->sema); 23285312Smg147109 tio = nio; 23295312Smg147109 nio = nio->contig_chain; 23305312Smg147109 kmem_cache_free(hio_cache, tio); 23315312Smg147109 } 23325312Smg147109 23335312Smg147109 /* 23345312Smg147109 * Signal error to all the leftover bufs (if any) 23355312Smg147109 * after b_resid data is exhausted. 23365312Smg147109 */ 23375312Smg147109 while (nio != NULL) { 23385312Smg147109 nio->bp->b_resid = nio->bp->b_bcount - data; 23395312Smg147109 bzero(nio->bp->b_un.b_addr + data, nio->bp->b_resid); 23405312Smg147109 bioerror(nio->bp, bioret); 23415312Smg147109 biodone(nio->bp); 23425312Smg147109 sema_v(nio->sema); 23435312Smg147109 tio = nio; 23445312Smg147109 nio = nio->contig_chain; 23455312Smg147109 kmem_cache_free(hio_cache, tio); 23465312Smg147109 data = 0; 23475312Smg147109 } 23485312Smg147109 kmem_free(iodata, bsize); 23495312Smg147109 } else { 23505312Smg147109 23515312Smg147109 nbuf = tio->bp; 23525312Smg147109 io_done = tio->sema; 23535312Smg147109 nio = fio; 23545312Smg147109 last = tio; 23555312Smg147109 23565312Smg147109 while (nio != NULL) { 23575312Smg147109 (void) bdev_strategy(nio->bp); 23585312Smg147109 nio = nio->contig_chain; 23595312Smg147109 } 23605312Smg147109 nio = fio; 23615312Smg147109 mutex_exit(&hqueue->strategy_lock); 23625312Smg147109 23635312Smg147109 while (nio != NULL) { 23645312Smg147109 if (nio == last) { 23655312Smg147109 (void) biowait(nbuf); 23665312Smg147109 sema_v(io_done); 23675312Smg147109 break; 23685312Smg147109 /* the sentinel 'last' is not freed. See above.
*/ 23695312Smg147109 } else { 23705312Smg147109 (void) biowait(nio->bp); 23715312Smg147109 sema_v(nio->sema); 23725312Smg147109 } 23735312Smg147109 tio = nio; 23745312Smg147109 nio = nio->contig_chain; 23755312Smg147109 kmem_cache_free(hio_cache, tio); 23765312Smg147109 } 23775312Smg147109 } 23785312Smg147109 return (0); 23795312Smg147109 } 23805312Smg147109 23815312Smg147109 /* 23825312Smg147109 * Insert an I/O request into the I/O scheduler's pipeline. 23835312Smg147109 * Using an AVL tree makes it easy to reorder the I/O requests 23845312Smg147109 * based on logical block number. 23855312Smg147109 */ 23865312Smg147109 static void 23875312Smg147109 hsched_enqueue_io(struct hsfs *fsp, struct hio *hsio, int ra) 23885312Smg147109 { 23895312Smg147109 struct hsfs_queue *hqueue = fsp->hqueue; 23905312Smg147109 23915312Smg147109 mutex_enter(&hqueue->hsfs_queue_lock); 23925312Smg147109 23935312Smg147109 fsp->physical_read_bytes += hsio->bp->b_bcount; 23945312Smg147109 if (ra) 23955312Smg147109 fsp->readahead_bytes += hsio->bp->b_bcount; 23965312Smg147109 23975312Smg147109 avl_add(&hqueue->deadline_tree, hsio); 23985312Smg147109 avl_add(&hqueue->read_tree, hsio); 23995312Smg147109 24005312Smg147109 DTRACE_PROBE3(hsfs_io_enqueued, struct hio *, hsio, 24015312Smg147109 struct hsfs_queue *, hqueue, int, ra); 24025312Smg147109 24035312Smg147109 mutex_exit(&hqueue->hsfs_queue_lock); 24045312Smg147109 } 24055312Smg147109 24062900Sfrankho /* ARGSUSED */ 24072900Sfrankho static int 24085331Samw hsfs_pathconf(struct vnode *vp, 24095331Samw int cmd, 24105331Samw ulong_t *valp, 24115331Samw struct cred *cr, 24125331Samw caller_context_t *ct) 24132900Sfrankho { 24142900Sfrankho struct hsfs *fsp; 24152900Sfrankho 24162900Sfrankho int error = 0; 24172900Sfrankho 24182900Sfrankho switch (cmd) { 24192900Sfrankho 24202900Sfrankho case _PC_NAME_MAX: 24212900Sfrankho fsp = VFS_TO_HSFS(vp->v_vfsp); 24222900Sfrankho *valp = fsp->hsfs_namemax; 24232900Sfrankho break; 24242900Sfrankho 24252900Sfrankho case _PC_FILESIZEBITS: 24262900Sfrankho *valp = 33; /* Without multi extent support: 4 GB - 2k */ 24272900Sfrankho break; 24282900Sfrankho 24292900Sfrankho default: 24305331Samw error = fs_pathconf(vp, cmd, valp, cr, ct); 24312900Sfrankho } 24322900Sfrankho 24332900Sfrankho return (error); 24342900Sfrankho } 24352900Sfrankho 24362900Sfrankho 24372900Sfrankho 24380Sstevel@tonic-gate const fs_operation_def_t hsfs_vnodeops_template[] = { 24393898Srsb VOPNAME_OPEN, { .vop_open = hsfs_open }, 24403898Srsb VOPNAME_CLOSE, { .vop_close = hsfs_close }, 24413898Srsb VOPNAME_READ, { .vop_read = hsfs_read }, 24423898Srsb VOPNAME_GETATTR, { .vop_getattr = hsfs_getattr }, 24433898Srsb VOPNAME_ACCESS, { .vop_access = hsfs_access }, 24443898Srsb VOPNAME_LOOKUP, { .vop_lookup = hsfs_lookup }, 24453898Srsb VOPNAME_READDIR, { .vop_readdir = hsfs_readdir }, 24463898Srsb VOPNAME_READLINK, { .vop_readlink = hsfs_readlink }, 24473898Srsb VOPNAME_FSYNC, { .vop_fsync = hsfs_fsync }, 24483898Srsb VOPNAME_INACTIVE, { .vop_inactive = hsfs_inactive }, 24493898Srsb VOPNAME_FID, { .vop_fid = hsfs_fid }, 24503898Srsb VOPNAME_SEEK, { .vop_seek = hsfs_seek }, 24513898Srsb VOPNAME_FRLOCK, { .vop_frlock = hsfs_frlock }, 24523898Srsb VOPNAME_GETPAGE, { .vop_getpage = hsfs_getpage }, 24533898Srsb VOPNAME_PUTPAGE, { .vop_putpage = hsfs_putpage }, 24543898Srsb VOPNAME_MAP, { .vop_map = hsfs_map }, 24553898Srsb VOPNAME_ADDMAP, { .vop_addmap = hsfs_addmap }, 24563898Srsb VOPNAME_DELMAP, { .vop_delmap = hsfs_delmap }, 24573898Srsb VOPNAME_PATHCONF, { .vop_pathconf = 
hsfs_pathconf }, 24583898Srsb NULL, NULL 24590Sstevel@tonic-gate }; 24600Sstevel@tonic-gate 24610Sstevel@tonic-gate struct vnodeops *hsfs_vnodeops; 2462