xref: /onnv-gate/usr/src/uts/common/fs/ufs/ufs_directio.c (revision 4662:9c48274ded8b)
10Sstevel@tonic-gate /*
20Sstevel@tonic-gate  * CDDL HEADER START
30Sstevel@tonic-gate  *
40Sstevel@tonic-gate  * The contents of this file are subject to the terms of the
5*4662Sfrankho  * Common Development and Distribution License (the "License").
6*4662Sfrankho  * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate  *
80Sstevel@tonic-gate  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate  * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate  * See the License for the specific language governing permissions
110Sstevel@tonic-gate  * and limitations under the License.
120Sstevel@tonic-gate  *
130Sstevel@tonic-gate  * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate  * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate  * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate  * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate  *
190Sstevel@tonic-gate  * CDDL HEADER END
200Sstevel@tonic-gate  */
210Sstevel@tonic-gate /*
22*4662Sfrankho  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
230Sstevel@tonic-gate  * Use is subject to license terms.
240Sstevel@tonic-gate  */
250Sstevel@tonic-gate 
260Sstevel@tonic-gate /* Copyright (c) 1983, 1984, 1985, 1986, 1987, 1988, 1989 AT&T */
270Sstevel@tonic-gate /* All Rights Reserved */
280Sstevel@tonic-gate 
290Sstevel@tonic-gate /*
300Sstevel@tonic-gate  * Portions of this source code were derived from Berkeley 4.3 BSD
310Sstevel@tonic-gate  * under license from the Regents of the University of California.
320Sstevel@tonic-gate  */
330Sstevel@tonic-gate 
340Sstevel@tonic-gate #pragma ident	"%Z%%M%	%I%	%E% SMI"
350Sstevel@tonic-gate 
360Sstevel@tonic-gate #include <sys/types.h>
370Sstevel@tonic-gate #include <sys/t_lock.h>
380Sstevel@tonic-gate #include <sys/param.h>
390Sstevel@tonic-gate #include <sys/time.h>
400Sstevel@tonic-gate #include <sys/systm.h>
410Sstevel@tonic-gate #include <sys/sysmacros.h>
420Sstevel@tonic-gate #include <sys/resource.h>
430Sstevel@tonic-gate #include <sys/signal.h>
440Sstevel@tonic-gate #include <sys/cred.h>
450Sstevel@tonic-gate #include <sys/user.h>
460Sstevel@tonic-gate #include <sys/buf.h>
470Sstevel@tonic-gate #include <sys/vfs.h>
480Sstevel@tonic-gate #include <sys/vnode.h>
490Sstevel@tonic-gate #include <sys/proc.h>
500Sstevel@tonic-gate #include <sys/disp.h>
510Sstevel@tonic-gate #include <sys/file.h>
520Sstevel@tonic-gate #include <sys/fcntl.h>
530Sstevel@tonic-gate #include <sys/flock.h>
540Sstevel@tonic-gate #include <sys/kmem.h>
550Sstevel@tonic-gate #include <sys/uio.h>
560Sstevel@tonic-gate #include <sys/dnlc.h>
570Sstevel@tonic-gate #include <sys/conf.h>
580Sstevel@tonic-gate #include <sys/mman.h>
590Sstevel@tonic-gate #include <sys/pathname.h>
600Sstevel@tonic-gate #include <sys/debug.h>
610Sstevel@tonic-gate #include <sys/vmsystm.h>
620Sstevel@tonic-gate #include <sys/cmn_err.h>
630Sstevel@tonic-gate #include <sys/filio.h>
640Sstevel@tonic-gate #include <sys/atomic.h>
650Sstevel@tonic-gate 
660Sstevel@tonic-gate #include <sys/fssnap_if.h>
670Sstevel@tonic-gate #include <sys/fs/ufs_fs.h>
680Sstevel@tonic-gate #include <sys/fs/ufs_lockfs.h>
690Sstevel@tonic-gate #include <sys/fs/ufs_filio.h>
700Sstevel@tonic-gate #include <sys/fs/ufs_inode.h>
710Sstevel@tonic-gate #include <sys/fs/ufs_fsdir.h>
720Sstevel@tonic-gate #include <sys/fs/ufs_quota.h>
730Sstevel@tonic-gate #include <sys/fs/ufs_trans.h>
740Sstevel@tonic-gate #include <sys/fs/ufs_panic.h>
750Sstevel@tonic-gate #include <sys/dirent.h>		/* must be AFTER <sys/fs/fsdir.h>! */
760Sstevel@tonic-gate #include <sys/errno.h>
770Sstevel@tonic-gate 
780Sstevel@tonic-gate #include <sys/filio.h>		/* _FIOIO */
790Sstevel@tonic-gate 
800Sstevel@tonic-gate #include <vm/hat.h>
810Sstevel@tonic-gate #include <vm/page.h>
820Sstevel@tonic-gate #include <vm/pvn.h>
830Sstevel@tonic-gate #include <vm/as.h>
840Sstevel@tonic-gate #include <vm/seg.h>
850Sstevel@tonic-gate #include <vm/seg_map.h>
860Sstevel@tonic-gate #include <vm/seg_vn.h>
870Sstevel@tonic-gate #include <vm/seg_kmem.h>
880Sstevel@tonic-gate #include <vm/rm.h>
890Sstevel@tonic-gate #include <sys/swap.h>
900Sstevel@tonic-gate #include <sys/epm.h>
910Sstevel@tonic-gate 
920Sstevel@tonic-gate #include <fs/fs_subr.h>
930Sstevel@tonic-gate 
/* private buffer of zeroes; allocated in ufs_directio_init() (kzero is broken) */
static void	*ufs_directio_zero_buf;
/* size in bytes of ufs_directio_zero_buf */
static int	ufs_directio_zero_len	= 8192;

int	ufs_directio_enabled = 1;	/* feature is enabled */
980Sstevel@tonic-gate 
/*
 * for kstats reader
 */
struct ufs_directio_kstats {
	kstat_named_t	logical_reads;	/* directio read requests */
	kstat_named_t	phys_reads;	/* physical reads issued (see directio_start) */
	kstat_named_t	hole_reads;	/* presumably reads of file holes; updated elsewhere */
	kstat_named_t	nread;		/* total bytes read via directio */
	kstat_named_t	logical_writes;	/* directio write requests (see ufs_directio_write) */
	kstat_named_t	phys_writes;	/* physical writes issued (see directio_start) */
	kstat_named_t	nwritten;	/* total bytes written via directio */
	kstat_named_t	nflushes;	/* page-cache flushes done before direct I/O */
} ufs_directio_kstats = {
	{ "logical_reads",	KSTAT_DATA_UINT64 },
	{ "phys_reads",		KSTAT_DATA_UINT64 },
	{ "hole_reads",		KSTAT_DATA_UINT64 },
	{ "nread",		KSTAT_DATA_UINT64 },
	{ "logical_writes",	KSTAT_DATA_UINT64 },
	{ "phys_writes",	KSTAT_DATA_UINT64 },
	{ "nwritten",		KSTAT_DATA_UINT64 },
	{ "nflushes",		KSTAT_DATA_UINT64 },
};

/* virtual kstat exporting the counters above; installed in ufs_directio_init() */
kstat_t	*ufs_directio_kstatsp;
1230Sstevel@tonic-gate 
/*
 * use kmem_cache_create for direct-physio buffers. This has shown
 * a better cache distribution compared to buffers on the
 * stack. It also avoids semaphore construction/deconstruction
 * per request
 */
struct directio_buf {
	struct directio_buf	*next;	/* list of in-flight requests, reverse issue order */
	char		*addr;		/* base address of this transfer */
	size_t		nbytes;		/* length of this transfer in bytes */
	struct buf	buf;		/* embedded buf handed to the device strategy */
};
/* cache of directio_buf headers; created in directio_bufs_init() */
static struct kmem_cache *directio_buf_cache;
1370Sstevel@tonic-gate 
1380Sstevel@tonic-gate 
1390Sstevel@tonic-gate /* ARGSUSED */
1400Sstevel@tonic-gate static int
1410Sstevel@tonic-gate directio_buf_constructor(void *dbp, void *cdrarg, int kmflags)
1420Sstevel@tonic-gate {
1430Sstevel@tonic-gate 	bioinit((struct buf *)&((struct directio_buf *)dbp)->buf);
1440Sstevel@tonic-gate 	return (0);
1450Sstevel@tonic-gate }
1460Sstevel@tonic-gate 
1470Sstevel@tonic-gate /* ARGSUSED */
1480Sstevel@tonic-gate static void
1490Sstevel@tonic-gate directio_buf_destructor(void *dbp, void *cdrarg)
1500Sstevel@tonic-gate {
1510Sstevel@tonic-gate 	biofini((struct buf *)&((struct directio_buf *)dbp)->buf);
1520Sstevel@tonic-gate }
1530Sstevel@tonic-gate 
1540Sstevel@tonic-gate void
1550Sstevel@tonic-gate directio_bufs_init(void)
1560Sstevel@tonic-gate {
1570Sstevel@tonic-gate 	directio_buf_cache = kmem_cache_create("directio_buf_cache",
158*4662Sfrankho 	    sizeof (struct directio_buf), 0,
159*4662Sfrankho 	    directio_buf_constructor, directio_buf_destructor,
160*4662Sfrankho 	    NULL, NULL, NULL, 0);
1610Sstevel@tonic-gate }
1620Sstevel@tonic-gate 
1630Sstevel@tonic-gate void
1640Sstevel@tonic-gate ufs_directio_init(void)
1650Sstevel@tonic-gate {
1660Sstevel@tonic-gate 	/*
1670Sstevel@tonic-gate 	 * kstats
1680Sstevel@tonic-gate 	 */
1691108Srshoaib 	ufs_directio_kstatsp = kstat_create("ufs", 0,
1701108Srshoaib 	    "directio", "ufs", KSTAT_TYPE_NAMED,
1711108Srshoaib 	    sizeof (ufs_directio_kstats) / sizeof (kstat_named_t),
1721108Srshoaib 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
1730Sstevel@tonic-gate 	if (ufs_directio_kstatsp) {
1740Sstevel@tonic-gate 		ufs_directio_kstatsp->ks_data = (void *)&ufs_directio_kstats;
1750Sstevel@tonic-gate 		kstat_install(ufs_directio_kstatsp);
1760Sstevel@tonic-gate 	}
1770Sstevel@tonic-gate 	/*
1780Sstevel@tonic-gate 	 * kzero is broken so we have to use a private buf of zeroes
1790Sstevel@tonic-gate 	 */
1800Sstevel@tonic-gate 	ufs_directio_zero_buf = kmem_zalloc(ufs_directio_zero_len, KM_SLEEP);
1810Sstevel@tonic-gate 	directio_bufs_init();
1820Sstevel@tonic-gate }
1830Sstevel@tonic-gate 
1840Sstevel@tonic-gate /*
1850Sstevel@tonic-gate  * Wait for the first direct IO operation to finish
1860Sstevel@tonic-gate  */
1870Sstevel@tonic-gate static int
1880Sstevel@tonic-gate directio_wait_one(struct directio_buf *dbp, long *bytes_iop)
1890Sstevel@tonic-gate {
1900Sstevel@tonic-gate 	buf_t	*bp;
1910Sstevel@tonic-gate 	int	error;
1920Sstevel@tonic-gate 
1930Sstevel@tonic-gate 	/*
1940Sstevel@tonic-gate 	 * Wait for IO to finish
1950Sstevel@tonic-gate 	 */
1960Sstevel@tonic-gate 	bp = &dbp->buf;
1970Sstevel@tonic-gate 	error = biowait(bp);
1980Sstevel@tonic-gate 
1990Sstevel@tonic-gate 	/*
2000Sstevel@tonic-gate 	 * bytes_io will be used to figure out a resid
2010Sstevel@tonic-gate 	 * for the caller. The resid is approximated by reporting
2020Sstevel@tonic-gate 	 * the bytes following the first failed IO as the residual.
2030Sstevel@tonic-gate 	 *
2040Sstevel@tonic-gate 	 * I am cautious about using b_resid because I
2050Sstevel@tonic-gate 	 * am not sure how well the disk drivers maintain it.
2060Sstevel@tonic-gate 	 */
2070Sstevel@tonic-gate 	if (error)
2080Sstevel@tonic-gate 		if (bp->b_resid)
2090Sstevel@tonic-gate 			*bytes_iop = bp->b_bcount - bp->b_resid;
2100Sstevel@tonic-gate 		else
2110Sstevel@tonic-gate 			*bytes_iop = 0;
2120Sstevel@tonic-gate 	else
2130Sstevel@tonic-gate 		*bytes_iop += bp->b_bcount;
2140Sstevel@tonic-gate 	/*
2150Sstevel@tonic-gate 	 * Release direct IO resources
2160Sstevel@tonic-gate 	 */
2170Sstevel@tonic-gate 	bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
2180Sstevel@tonic-gate 	kmem_cache_free(directio_buf_cache, dbp);
2190Sstevel@tonic-gate 	return (error);
2200Sstevel@tonic-gate }
2210Sstevel@tonic-gate 
2220Sstevel@tonic-gate /*
2230Sstevel@tonic-gate  * Wait for all of the direct IO operations to finish
2240Sstevel@tonic-gate  */
2250Sstevel@tonic-gate 
2260Sstevel@tonic-gate uint32_t	ufs_directio_drop_kpri = 0;	/* enable kpri hack */
2270Sstevel@tonic-gate 
2280Sstevel@tonic-gate static int
2290Sstevel@tonic-gate directio_wait(struct directio_buf *tail, long *bytes_iop)
2300Sstevel@tonic-gate {
2310Sstevel@tonic-gate 	int	error = 0, newerror;
2320Sstevel@tonic-gate 	struct directio_buf	*dbp;
2330Sstevel@tonic-gate 	uint_t	kpri_req_save;
2340Sstevel@tonic-gate 
2350Sstevel@tonic-gate 	/*
2360Sstevel@tonic-gate 	 * The linked list of directio buf structures is maintained
2370Sstevel@tonic-gate 	 * in reverse order (tail->last request->penultimate request->...)
2380Sstevel@tonic-gate 	 */
2390Sstevel@tonic-gate 	/*
2400Sstevel@tonic-gate 	 * This is the k_pri_req hack. Large numbers of threads
2410Sstevel@tonic-gate 	 * sleeping with kernel priority will cause scheduler thrashing
2420Sstevel@tonic-gate 	 * on an MP machine. This can be seen running Oracle using
2430Sstevel@tonic-gate 	 * directio to ufs files. Sleep at normal priority here to
2440Sstevel@tonic-gate 	 * more closely mimic physio to a device partition. This
2450Sstevel@tonic-gate 	 * workaround is disabled by default as a niced thread could
2460Sstevel@tonic-gate 	 * be starved from running while holding i_rwlock and i_contents.
2470Sstevel@tonic-gate 	 */
2480Sstevel@tonic-gate 	if (ufs_directio_drop_kpri) {
2490Sstevel@tonic-gate 		kpri_req_save = curthread->t_kpri_req;
2500Sstevel@tonic-gate 		curthread->t_kpri_req = 0;
2510Sstevel@tonic-gate 	}
2520Sstevel@tonic-gate 	while ((dbp = tail) != NULL) {
2530Sstevel@tonic-gate 		tail = dbp->next;
2540Sstevel@tonic-gate 		newerror = directio_wait_one(dbp, bytes_iop);
2550Sstevel@tonic-gate 		if (error == 0)
2560Sstevel@tonic-gate 			error = newerror;
2570Sstevel@tonic-gate 	}
2580Sstevel@tonic-gate 	if (ufs_directio_drop_kpri)
2590Sstevel@tonic-gate 		curthread->t_kpri_req = kpri_req_save;
2600Sstevel@tonic-gate 	return (error);
2610Sstevel@tonic-gate }
2620Sstevel@tonic-gate /*
2630Sstevel@tonic-gate  * Initiate direct IO request
2640Sstevel@tonic-gate  */
2650Sstevel@tonic-gate static void
2660Sstevel@tonic-gate directio_start(struct ufsvfs *ufsvfsp, dev_t dev, size_t nbytes,
2670Sstevel@tonic-gate 	offset_t offset, char *addr, enum seg_rw rw, struct proc *procp,
2680Sstevel@tonic-gate 	struct directio_buf **tailp, page_t **pplist)
2690Sstevel@tonic-gate {
2700Sstevel@tonic-gate 	buf_t *bp;
2710Sstevel@tonic-gate 	struct directio_buf *dbp;
2720Sstevel@tonic-gate 
2730Sstevel@tonic-gate 	/*
2740Sstevel@tonic-gate 	 * Allocate a directio buf header
2750Sstevel@tonic-gate 	 *   Note - list is maintained in reverse order.
2760Sstevel@tonic-gate 	 *   directio_wait_one() depends on this fact when
2770Sstevel@tonic-gate 	 *   adjusting the ``bytes_io'' param. bytes_io
2780Sstevel@tonic-gate 	 *   is used to compute a residual in the case of error.
2790Sstevel@tonic-gate 	 */
2800Sstevel@tonic-gate 	dbp = kmem_cache_alloc(directio_buf_cache, KM_SLEEP);
2810Sstevel@tonic-gate 	dbp->next = *tailp;
2820Sstevel@tonic-gate 	*tailp = dbp;
2830Sstevel@tonic-gate 
2840Sstevel@tonic-gate 	/*
2850Sstevel@tonic-gate 	 * Initialize buf header
2860Sstevel@tonic-gate 	 */
2870Sstevel@tonic-gate 	dbp->addr = addr;
2880Sstevel@tonic-gate 	dbp->nbytes = nbytes;
2890Sstevel@tonic-gate 	bp = &dbp->buf;
2900Sstevel@tonic-gate 	bp->b_edev = dev;
2910Sstevel@tonic-gate 	bp->b_lblkno = btodt(offset);
2920Sstevel@tonic-gate 	bp->b_bcount = nbytes;
2930Sstevel@tonic-gate 	bp->b_un.b_addr = addr;
2940Sstevel@tonic-gate 	bp->b_proc = procp;
2950Sstevel@tonic-gate 
2960Sstevel@tonic-gate 	/*
2970Sstevel@tonic-gate 	 * Note that S_WRITE implies B_READ and vice versa: a read(2)
2980Sstevel@tonic-gate 	 * will B_READ data from the filesystem and S_WRITE it into
2990Sstevel@tonic-gate 	 * the user's buffer; a write(2) will S_READ data from the
3000Sstevel@tonic-gate 	 * user's buffer and B_WRITE it to the filesystem.
3010Sstevel@tonic-gate 	 */
3020Sstevel@tonic-gate 	if (rw == S_WRITE) {
3030Sstevel@tonic-gate 		bp->b_flags = B_BUSY | B_PHYS | B_READ;
3041108Srshoaib 		ufs_directio_kstats.phys_reads.value.ui64++;
3051108Srshoaib 		ufs_directio_kstats.nread.value.ui64 += nbytes;
3060Sstevel@tonic-gate 	} else {
3070Sstevel@tonic-gate 		bp->b_flags = B_BUSY | B_PHYS | B_WRITE;
3081108Srshoaib 		ufs_directio_kstats.phys_writes.value.ui64++;
3091108Srshoaib 		ufs_directio_kstats.nwritten.value.ui64 += nbytes;
3100Sstevel@tonic-gate 	}
3110Sstevel@tonic-gate 	bp->b_shadow = pplist;
3120Sstevel@tonic-gate 	if (pplist != NULL)
3130Sstevel@tonic-gate 		bp->b_flags |= B_SHADOW;
3140Sstevel@tonic-gate 
3150Sstevel@tonic-gate 	/*
3160Sstevel@tonic-gate 	 * Issue I/O request.
3170Sstevel@tonic-gate 	 */
3180Sstevel@tonic-gate 	ufsvfsp->vfs_iotstamp = lbolt;
3190Sstevel@tonic-gate 	if (ufsvfsp->vfs_snapshot)
3200Sstevel@tonic-gate 		fssnap_strategy(&ufsvfsp->vfs_snapshot, bp);
3210Sstevel@tonic-gate 	else
3220Sstevel@tonic-gate 		(void) bdev_strategy(bp);
3230Sstevel@tonic-gate 
3240Sstevel@tonic-gate 	if (rw == S_WRITE)
3250Sstevel@tonic-gate 		lwp_stat_update(LWP_STAT_OUBLK, 1);
3260Sstevel@tonic-gate 	else
3270Sstevel@tonic-gate 		lwp_stat_update(LWP_STAT_INBLK, 1);
3280Sstevel@tonic-gate 
3290Sstevel@tonic-gate }
3300Sstevel@tonic-gate 
uint32_t	ufs_shared_writes;	/* writes done w/ lock shared */
uint32_t	ufs_cur_writes;		/* # concurrent writes */
uint32_t	ufs_maxcur_writes;	/* high water concurrent writes */
uint32_t	ufs_posix_hits;		/* writes done /w lock excl. */

/*
 * Force POSIX synchronous data integrity on all writes for testing.
 */
uint32_t	ufs_force_posix_sdi = 0;
3400Sstevel@tonic-gate 
3410Sstevel@tonic-gate /*
3420Sstevel@tonic-gate  * Direct Write
3430Sstevel@tonic-gate  */
3440Sstevel@tonic-gate 
3450Sstevel@tonic-gate int
3460Sstevel@tonic-gate ufs_directio_write(struct inode *ip, uio_t *arg_uio, int ioflag, int rewrite,
3470Sstevel@tonic-gate 	cred_t *cr, int *statusp)
3480Sstevel@tonic-gate {
3490Sstevel@tonic-gate 	long		resid, bytes_written;
3500Sstevel@tonic-gate 	u_offset_t	size, uoff;
3510Sstevel@tonic-gate 	uio_t		*uio = arg_uio;
3520Sstevel@tonic-gate 	rlim64_t	limit = uio->uio_llimit;
3530Sstevel@tonic-gate 	int		on, n, error, newerror, len, has_holes;
3540Sstevel@tonic-gate 	daddr_t		bn;
3550Sstevel@tonic-gate 	size_t		nbytes;
3560Sstevel@tonic-gate 	struct fs	*fs;
3570Sstevel@tonic-gate 	vnode_t		*vp;
3580Sstevel@tonic-gate 	iovec_t		*iov;
3590Sstevel@tonic-gate 	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
3600Sstevel@tonic-gate 	struct proc	*procp;
3610Sstevel@tonic-gate 	struct as	*as;
3620Sstevel@tonic-gate 	struct directio_buf	*tail;
3630Sstevel@tonic-gate 	int		exclusive, ncur, bmap_peek;
3640Sstevel@tonic-gate 	uio_t		copy_uio;
3650Sstevel@tonic-gate 	iovec_t		copy_iov;
3660Sstevel@tonic-gate 	char		*copy_base;
3670Sstevel@tonic-gate 	long		copy_resid;
3680Sstevel@tonic-gate 
3690Sstevel@tonic-gate 	/*
3700Sstevel@tonic-gate 	 * assume that directio isn't possible (normal case)
3710Sstevel@tonic-gate 	 */
3720Sstevel@tonic-gate 	*statusp = DIRECTIO_FAILURE;
3730Sstevel@tonic-gate 
3740Sstevel@tonic-gate 	/*
3750Sstevel@tonic-gate 	 * Don't go direct
3760Sstevel@tonic-gate 	 */
3770Sstevel@tonic-gate 	if (ufs_directio_enabled == 0)
3780Sstevel@tonic-gate 		return (0);
3790Sstevel@tonic-gate 
3800Sstevel@tonic-gate 	/*
3810Sstevel@tonic-gate 	 * mapped file; nevermind
3820Sstevel@tonic-gate 	 */
3830Sstevel@tonic-gate 	if (ip->i_mapcnt)
3840Sstevel@tonic-gate 		return (0);
3850Sstevel@tonic-gate 
3860Sstevel@tonic-gate 	/*
3870Sstevel@tonic-gate 	 * CAN WE DO DIRECT IO?
3880Sstevel@tonic-gate 	 */
3890Sstevel@tonic-gate 	uoff = uio->uio_loffset;
3900Sstevel@tonic-gate 	resid = uio->uio_resid;
3910Sstevel@tonic-gate 
3920Sstevel@tonic-gate 	/*
3930Sstevel@tonic-gate 	 * beyond limit
3940Sstevel@tonic-gate 	 */
3950Sstevel@tonic-gate 	if (uoff + resid > limit)
3960Sstevel@tonic-gate 		return (0);
3970Sstevel@tonic-gate 
3980Sstevel@tonic-gate 	/*
3990Sstevel@tonic-gate 	 * must be sector aligned
4000Sstevel@tonic-gate 	 */
4010Sstevel@tonic-gate 	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
4020Sstevel@tonic-gate 		return (0);
4030Sstevel@tonic-gate 
4040Sstevel@tonic-gate 	/*
4050Sstevel@tonic-gate 	 * SHOULD WE DO DIRECT IO?
4060Sstevel@tonic-gate 	 */
4070Sstevel@tonic-gate 	size = ip->i_size;
4080Sstevel@tonic-gate 	has_holes = -1;
4090Sstevel@tonic-gate 
4100Sstevel@tonic-gate 	/*
4110Sstevel@tonic-gate 	 * only on regular files; no metadata
4120Sstevel@tonic-gate 	 */
4130Sstevel@tonic-gate 	if (((ip->i_mode & IFMT) != IFREG) || ip->i_ufsvfs->vfs_qinod == ip)
4140Sstevel@tonic-gate 		return (0);
4150Sstevel@tonic-gate 
4160Sstevel@tonic-gate 	/*
4170Sstevel@tonic-gate 	 * Synchronous, allocating writes run very slow in Direct-Mode
4180Sstevel@tonic-gate 	 * 	XXX - can be fixed with bmap_write changes for large writes!!!
4190Sstevel@tonic-gate 	 *	XXX - can be fixed for updates to "almost-full" files
4200Sstevel@tonic-gate 	 *	XXX - WARNING - system hangs if bmap_write() has to
4210Sstevel@tonic-gate 	 * 			allocate lots of pages since pageout
4220Sstevel@tonic-gate 	 * 			suspends on locked inode
4230Sstevel@tonic-gate 	 */
4240Sstevel@tonic-gate 	if (!rewrite && (ip->i_flag & ISYNC)) {
4250Sstevel@tonic-gate 		if ((uoff + resid) > size)
4260Sstevel@tonic-gate 			return (0);
4270Sstevel@tonic-gate 		has_holes = bmap_has_holes(ip);
4280Sstevel@tonic-gate 		if (has_holes)
4290Sstevel@tonic-gate 			return (0);
4300Sstevel@tonic-gate 	}
4310Sstevel@tonic-gate 
4320Sstevel@tonic-gate 	/*
4330Sstevel@tonic-gate 	 * Each iovec must be short aligned and sector aligned.  If
4340Sstevel@tonic-gate 	 * one is not, then kmem_alloc a new buffer and copy all of
4350Sstevel@tonic-gate 	 * the smaller buffers into the new buffer.  This new
4360Sstevel@tonic-gate 	 * buffer will be short aligned and sector aligned.
4370Sstevel@tonic-gate 	 */
4380Sstevel@tonic-gate 	iov = uio->uio_iov;
4390Sstevel@tonic-gate 	nbytes = uio->uio_iovcnt;
4400Sstevel@tonic-gate 	while (nbytes--) {
4410Sstevel@tonic-gate 		if (((uint_t)iov->iov_len & (DEV_BSIZE - 1)) != 0 ||
4420Sstevel@tonic-gate 		    (intptr_t)(iov->iov_base) & 1) {
4430Sstevel@tonic-gate 			copy_resid = uio->uio_resid;
4440Sstevel@tonic-gate 			copy_base = kmem_alloc(copy_resid, KM_NOSLEEP);
4450Sstevel@tonic-gate 			if (copy_base == NULL)
4460Sstevel@tonic-gate 				return (0);
4470Sstevel@tonic-gate 			copy_iov.iov_base = copy_base;
4480Sstevel@tonic-gate 			copy_iov.iov_len = copy_resid;
4490Sstevel@tonic-gate 			copy_uio.uio_iov = &copy_iov;
4500Sstevel@tonic-gate 			copy_uio.uio_iovcnt = 1;
4510Sstevel@tonic-gate 			copy_uio.uio_segflg = UIO_SYSSPACE;
4520Sstevel@tonic-gate 			copy_uio.uio_extflg = UIO_COPY_DEFAULT;
4530Sstevel@tonic-gate 			copy_uio.uio_loffset = uio->uio_loffset;
4540Sstevel@tonic-gate 			copy_uio.uio_resid = uio->uio_resid;
4550Sstevel@tonic-gate 			copy_uio.uio_llimit = uio->uio_llimit;
4560Sstevel@tonic-gate 			error = uiomove(copy_base, copy_resid, UIO_WRITE, uio);
4570Sstevel@tonic-gate 			if (error) {
4580Sstevel@tonic-gate 				kmem_free(copy_base, copy_resid);
4590Sstevel@tonic-gate 				return (0);
4600Sstevel@tonic-gate 			}
4610Sstevel@tonic-gate 			uio = &copy_uio;
4620Sstevel@tonic-gate 			break;
4630Sstevel@tonic-gate 		}
4640Sstevel@tonic-gate 		iov++;
4650Sstevel@tonic-gate 	}
4660Sstevel@tonic-gate 
4670Sstevel@tonic-gate 	/*
4680Sstevel@tonic-gate 	 * From here on down, all error exits must go to errout and
4690Sstevel@tonic-gate 	 * not simply return a 0.
4700Sstevel@tonic-gate 	 */
4710Sstevel@tonic-gate 
4720Sstevel@tonic-gate 	/*
4730Sstevel@tonic-gate 	 * DIRECTIO
4740Sstevel@tonic-gate 	 */
4750Sstevel@tonic-gate 
4760Sstevel@tonic-gate 	fs = ip->i_fs;
4770Sstevel@tonic-gate 
4780Sstevel@tonic-gate 	/*
4790Sstevel@tonic-gate 	 * POSIX check. If attempting a concurrent re-write, make sure
4800Sstevel@tonic-gate 	 * that this will be a single request to the driver to meet
4810Sstevel@tonic-gate 	 * POSIX synchronous data integrity requirements.
4820Sstevel@tonic-gate 	 */
4830Sstevel@tonic-gate 	bmap_peek = 0;
4840Sstevel@tonic-gate 	if (rewrite && ((ioflag & FDSYNC) || ufs_force_posix_sdi)) {
4850Sstevel@tonic-gate 		int upgrade = 0;
4860Sstevel@tonic-gate 
4870Sstevel@tonic-gate 		/* check easy conditions first */
4880Sstevel@tonic-gate 		if (uio->uio_iovcnt != 1 || resid > ufsvfsp->vfs_ioclustsz) {
4890Sstevel@tonic-gate 			upgrade = 1;
4900Sstevel@tonic-gate 		} else {
4910Sstevel@tonic-gate 			/* now look for contiguous allocation */
4920Sstevel@tonic-gate 			len = (ssize_t)blkroundup(fs, resid);
4930Sstevel@tonic-gate 			error = bmap_read(ip, uoff, &bn, &len);
4940Sstevel@tonic-gate 			if (error || bn == UFS_HOLE || len == 0)
4950Sstevel@tonic-gate 				goto errout;
4960Sstevel@tonic-gate 			/* save a call to bmap_read later */
4970Sstevel@tonic-gate 			bmap_peek = 1;
4980Sstevel@tonic-gate 			if (len < resid)
4990Sstevel@tonic-gate 				upgrade = 1;
5000Sstevel@tonic-gate 		}
5010Sstevel@tonic-gate 		if (upgrade) {
5020Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
5030Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_WRITER);
5040Sstevel@tonic-gate 			ufs_posix_hits++;
5050Sstevel@tonic-gate 		}
5060Sstevel@tonic-gate 	}
5070Sstevel@tonic-gate 
5080Sstevel@tonic-gate 
5090Sstevel@tonic-gate 	/*
5100Sstevel@tonic-gate 	 * allocate space
5110Sstevel@tonic-gate 	 */
5120Sstevel@tonic-gate 
5130Sstevel@tonic-gate 	/*
5140Sstevel@tonic-gate 	 * If attempting a re-write, there is no allocation to do.
5150Sstevel@tonic-gate 	 * bmap_write would trip an ASSERT if i_contents is held shared.
5160Sstevel@tonic-gate 	 */
5170Sstevel@tonic-gate 	if (rewrite)
5180Sstevel@tonic-gate 		goto skip_alloc;
5190Sstevel@tonic-gate 
5200Sstevel@tonic-gate 	do {
5210Sstevel@tonic-gate 		on = (int)blkoff(fs, uoff);
5220Sstevel@tonic-gate 		n = (int)MIN(fs->fs_bsize - on, resid);
5230Sstevel@tonic-gate 		if ((uoff + n) > ip->i_size) {
5240Sstevel@tonic-gate 			error = bmap_write(ip, uoff, (int)(on + n),
525923Ssdebnath 			    (int)(uoff & (offset_t)MAXBOFFSET) == 0,
526923Ssdebnath 			    NULL, cr);
5270Sstevel@tonic-gate 			/* Caller is responsible for updating i_seq if needed */
5280Sstevel@tonic-gate 			if (error)
5290Sstevel@tonic-gate 				break;
5300Sstevel@tonic-gate 			ip->i_size = uoff + n;
5310Sstevel@tonic-gate 			ip->i_flag |= IATTCHG;
5320Sstevel@tonic-gate 		} else if (n == MAXBSIZE) {
533923Ssdebnath 			error = bmap_write(ip, uoff, (int)(on + n),
534923Ssdebnath 			    BI_ALLOC_ONLY, NULL, cr);
5350Sstevel@tonic-gate 			/* Caller is responsible for updating i_seq if needed */
5360Sstevel@tonic-gate 		} else {
5370Sstevel@tonic-gate 			if (has_holes < 0)
5380Sstevel@tonic-gate 				has_holes = bmap_has_holes(ip);
5390Sstevel@tonic-gate 			if (has_holes) {
5400Sstevel@tonic-gate 				uint_t	blk_size;
5410Sstevel@tonic-gate 				u_offset_t offset;
5420Sstevel@tonic-gate 
5430Sstevel@tonic-gate 				offset = uoff & (offset_t)fs->fs_bmask;
5440Sstevel@tonic-gate 				blk_size = (int)blksize(fs, ip,
5450Sstevel@tonic-gate 				    (daddr_t)lblkno(fs, offset));
546923Ssdebnath 				error = bmap_write(ip, uoff, blk_size,
547923Ssdebnath 				    BI_NORMAL, NULL, cr);
5480Sstevel@tonic-gate 				/*
5490Sstevel@tonic-gate 				 * Caller is responsible for updating
5500Sstevel@tonic-gate 				 * i_seq if needed
5510Sstevel@tonic-gate 				 */
5520Sstevel@tonic-gate 			} else
5530Sstevel@tonic-gate 				error = 0;
5540Sstevel@tonic-gate 		}
5550Sstevel@tonic-gate 		if (error)
5560Sstevel@tonic-gate 			break;
5570Sstevel@tonic-gate 		uoff += n;
5580Sstevel@tonic-gate 		resid -= n;
5590Sstevel@tonic-gate 		/*
5600Sstevel@tonic-gate 		 * if file has grown larger than 2GB, set flag
5610Sstevel@tonic-gate 		 * in superblock if not already set
5620Sstevel@tonic-gate 		 */
5630Sstevel@tonic-gate 		if ((ip->i_size > MAXOFF32_T) &&
5640Sstevel@tonic-gate 		    !(fs->fs_flags & FSLARGEFILES)) {
5650Sstevel@tonic-gate 			ASSERT(ufsvfsp->vfs_lfflags & UFS_LARGEFILES);
5660Sstevel@tonic-gate 			mutex_enter(&ufsvfsp->vfs_lock);
5670Sstevel@tonic-gate 			fs->fs_flags |= FSLARGEFILES;
5680Sstevel@tonic-gate 			ufs_sbwrite(ufsvfsp);
5690Sstevel@tonic-gate 			mutex_exit(&ufsvfsp->vfs_lock);
5700Sstevel@tonic-gate 		}
5710Sstevel@tonic-gate 	} while (resid);
5720Sstevel@tonic-gate 
5730Sstevel@tonic-gate 	if (error) {
5740Sstevel@tonic-gate 		/*
5750Sstevel@tonic-gate 		 * restore original state
5760Sstevel@tonic-gate 		 */
5770Sstevel@tonic-gate 		if (resid) {
5780Sstevel@tonic-gate 			if (size == ip->i_size)
5790Sstevel@tonic-gate 				goto errout;
5800Sstevel@tonic-gate 			(void) ufs_itrunc(ip, size, 0, cr);
5810Sstevel@tonic-gate 		}
5820Sstevel@tonic-gate 		/*
5830Sstevel@tonic-gate 		 * try non-directio path
5840Sstevel@tonic-gate 		 */
5850Sstevel@tonic-gate 		goto errout;
5860Sstevel@tonic-gate 	}
5870Sstevel@tonic-gate skip_alloc:
5880Sstevel@tonic-gate 
5890Sstevel@tonic-gate 	/*
5900Sstevel@tonic-gate 	 * get rid of cached pages
5910Sstevel@tonic-gate 	 */
5920Sstevel@tonic-gate 	vp = ITOV(ip);
5930Sstevel@tonic-gate 	exclusive = rw_write_held(&ip->i_contents);
5940Sstevel@tonic-gate 	if (vn_has_cached_data(vp)) {
5950Sstevel@tonic-gate 		if (!exclusive) {
5960Sstevel@tonic-gate 			/*
5970Sstevel@tonic-gate 			 * Still holding i_rwlock, so no allocations
5980Sstevel@tonic-gate 			 * can happen after dropping contents.
5990Sstevel@tonic-gate 			 */
6000Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
6010Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_WRITER);
6020Sstevel@tonic-gate 		}
6030Sstevel@tonic-gate 		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
6040Sstevel@tonic-gate 		if (vn_has_cached_data(vp))
6050Sstevel@tonic-gate 			goto errout;
6060Sstevel@tonic-gate 		if (!exclusive)
6070Sstevel@tonic-gate 			rw_downgrade(&ip->i_contents);
6081108Srshoaib 		ufs_directio_kstats.nflushes.value.ui64++;
6090Sstevel@tonic-gate 	}
6100Sstevel@tonic-gate 
6110Sstevel@tonic-gate 	/*
6120Sstevel@tonic-gate 	 * Direct Writes
6130Sstevel@tonic-gate 	 */
6140Sstevel@tonic-gate 
6150Sstevel@tonic-gate 	if (!exclusive) {
6160Sstevel@tonic-gate 		ufs_shared_writes++;
6170Sstevel@tonic-gate 		ncur = atomic_add_32_nv(&ufs_cur_writes, 1);
6180Sstevel@tonic-gate 		if (ncur > ufs_maxcur_writes)
6190Sstevel@tonic-gate 			ufs_maxcur_writes = ncur;
6200Sstevel@tonic-gate 	}
6210Sstevel@tonic-gate 
6220Sstevel@tonic-gate 	/*
6230Sstevel@tonic-gate 	 * proc and as are for VM operations in directio_start()
6240Sstevel@tonic-gate 	 */
6250Sstevel@tonic-gate 	if (uio->uio_segflg == UIO_USERSPACE) {
6260Sstevel@tonic-gate 		procp = ttoproc(curthread);
6270Sstevel@tonic-gate 		as = procp->p_as;
6280Sstevel@tonic-gate 	} else {
6290Sstevel@tonic-gate 		procp = NULL;
6300Sstevel@tonic-gate 		as = &kas;
6310Sstevel@tonic-gate 	}
6320Sstevel@tonic-gate 	*statusp = DIRECTIO_SUCCESS;
6330Sstevel@tonic-gate 	error = 0;
6340Sstevel@tonic-gate 	newerror = 0;
6350Sstevel@tonic-gate 	resid = uio->uio_resid;
6360Sstevel@tonic-gate 	bytes_written = 0;
6371108Srshoaib 	ufs_directio_kstats.logical_writes.value.ui64++;
6380Sstevel@tonic-gate 	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
6390Sstevel@tonic-gate 		size_t pglck_len, pglck_size;
6400Sstevel@tonic-gate 		caddr_t pglck_base;
6410Sstevel@tonic-gate 		page_t **pplist, **spplist;
6420Sstevel@tonic-gate 
6430Sstevel@tonic-gate 		tail = NULL;
6440Sstevel@tonic-gate 
6450Sstevel@tonic-gate 		/*
6460Sstevel@tonic-gate 		 * Adjust number of bytes
6470Sstevel@tonic-gate 		 */
6480Sstevel@tonic-gate 		iov = uio->uio_iov;
6490Sstevel@tonic-gate 		pglck_len = (size_t)MIN(iov->iov_len, resid);
6500Sstevel@tonic-gate 		pglck_base = iov->iov_base;
6510Sstevel@tonic-gate 		if (pglck_len == 0) {
6520Sstevel@tonic-gate 			uio->uio_iov++;
6530Sstevel@tonic-gate 			uio->uio_iovcnt--;
6540Sstevel@tonic-gate 			continue;
6550Sstevel@tonic-gate 		}
6560Sstevel@tonic-gate 
6570Sstevel@tonic-gate 		/*
6580Sstevel@tonic-gate 		 * Try to Lock down the largest chunck of pages possible.
6590Sstevel@tonic-gate 		 */
6600Sstevel@tonic-gate 		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
6610Sstevel@tonic-gate 		error = as_pagelock(as, &pplist, pglck_base, pglck_len, S_READ);
6620Sstevel@tonic-gate 
6630Sstevel@tonic-gate 		if (error)
6640Sstevel@tonic-gate 			break;
6650Sstevel@tonic-gate 
6660Sstevel@tonic-gate 		pglck_size = pglck_len;
6670Sstevel@tonic-gate 		while (pglck_len) {
6680Sstevel@tonic-gate 
6690Sstevel@tonic-gate 			nbytes = pglck_len;
6700Sstevel@tonic-gate 			uoff = uio->uio_loffset;
6710Sstevel@tonic-gate 
6720Sstevel@tonic-gate 			if (!bmap_peek) {
6730Sstevel@tonic-gate 
6740Sstevel@tonic-gate 				/*
6750Sstevel@tonic-gate 				 * Re-adjust number of bytes to contiguous
6760Sstevel@tonic-gate 				 * range. May have already called bmap_read
6770Sstevel@tonic-gate 				 * in the case of a concurrent rewrite.
6780Sstevel@tonic-gate 				 */
6790Sstevel@tonic-gate 				len = (ssize_t)blkroundup(fs, nbytes);
6800Sstevel@tonic-gate 				error = bmap_read(ip, uoff, &bn, &len);
6810Sstevel@tonic-gate 				if (error)
6820Sstevel@tonic-gate 					break;
6830Sstevel@tonic-gate 				if (bn == UFS_HOLE || len == 0)
6840Sstevel@tonic-gate 					break;
6850Sstevel@tonic-gate 			}
6860Sstevel@tonic-gate 			nbytes = (size_t)MIN(nbytes, len);
6870Sstevel@tonic-gate 			bmap_peek = 0;
6880Sstevel@tonic-gate 
6890Sstevel@tonic-gate 			/*
6900Sstevel@tonic-gate 			 * Get the pagelist pointer for this offset to be
6910Sstevel@tonic-gate 			 * passed to directio_start.
6920Sstevel@tonic-gate 			 */
6930Sstevel@tonic-gate 
6940Sstevel@tonic-gate 			if (pplist != NULL)
6950Sstevel@tonic-gate 				spplist = pplist +
696*4662Sfrankho 				    btop((uintptr_t)iov->iov_base -
697*4662Sfrankho 				    ((uintptr_t)pglck_base & PAGEMASK));
6980Sstevel@tonic-gate 			else
6990Sstevel@tonic-gate 				spplist = NULL;
7000Sstevel@tonic-gate 
7010Sstevel@tonic-gate 			/*
7020Sstevel@tonic-gate 			 * Kick off the direct write requests
7030Sstevel@tonic-gate 			 */
7040Sstevel@tonic-gate 			directio_start(ufsvfsp, ip->i_dev, nbytes, ldbtob(bn),
705*4662Sfrankho 			    iov->iov_base, S_READ, procp, &tail, spplist);
7060Sstevel@tonic-gate 
7070Sstevel@tonic-gate 			/*
7080Sstevel@tonic-gate 			 * Adjust pointers and counters
7090Sstevel@tonic-gate 			 */
7100Sstevel@tonic-gate 			iov->iov_len -= nbytes;
7110Sstevel@tonic-gate 			iov->iov_base += nbytes;
7120Sstevel@tonic-gate 			uio->uio_loffset += nbytes;
7130Sstevel@tonic-gate 			resid -= nbytes;
7140Sstevel@tonic-gate 			pglck_len -= nbytes;
7150Sstevel@tonic-gate 		}
7160Sstevel@tonic-gate 
7170Sstevel@tonic-gate 		/*
7180Sstevel@tonic-gate 		 * Wait for outstanding requests
7190Sstevel@tonic-gate 		 */
7200Sstevel@tonic-gate 		newerror = directio_wait(tail, &bytes_written);
7210Sstevel@tonic-gate 
7220Sstevel@tonic-gate 		/*
7230Sstevel@tonic-gate 		 * Release VM resources
7240Sstevel@tonic-gate 		 */
7250Sstevel@tonic-gate 		as_pageunlock(as, pplist, pglck_base, pglck_size, S_READ);
7260Sstevel@tonic-gate 
7270Sstevel@tonic-gate 	}
7280Sstevel@tonic-gate 
7290Sstevel@tonic-gate 	if (!exclusive) {
7300Sstevel@tonic-gate 		atomic_add_32(&ufs_cur_writes, -1);
7310Sstevel@tonic-gate 		/*
7320Sstevel@tonic-gate 		 * If this write was done shared, readers may
7330Sstevel@tonic-gate 		 * have pulled in unmodified pages. Get rid of
7340Sstevel@tonic-gate 		 * these potentially stale pages.
7350Sstevel@tonic-gate 		 */
7360Sstevel@tonic-gate 		if (vn_has_cached_data(vp)) {
7370Sstevel@tonic-gate 			rw_exit(&ip->i_contents);
7380Sstevel@tonic-gate 			rw_enter(&ip->i_contents, RW_WRITER);
7390Sstevel@tonic-gate 			(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0,
740*4662Sfrankho 			    B_INVAL, cr);
7411108Srshoaib 			ufs_directio_kstats.nflushes.value.ui64++;
7420Sstevel@tonic-gate 			rw_downgrade(&ip->i_contents);
7430Sstevel@tonic-gate 		}
7440Sstevel@tonic-gate 	}
7450Sstevel@tonic-gate 
7460Sstevel@tonic-gate 	/*
7470Sstevel@tonic-gate 	 * If error, adjust resid to begin at the first
7480Sstevel@tonic-gate 	 * un-writable byte.
7490Sstevel@tonic-gate 	 */
7500Sstevel@tonic-gate 	if (error == 0)
7510Sstevel@tonic-gate 		error = newerror;
7520Sstevel@tonic-gate 	if (error)
7530Sstevel@tonic-gate 		resid = uio->uio_resid - bytes_written;
7540Sstevel@tonic-gate 	arg_uio->uio_resid = resid;
7550Sstevel@tonic-gate 
7560Sstevel@tonic-gate 	if (!rewrite) {
7570Sstevel@tonic-gate 		ip->i_flag |= IUPD | ICHG;
7580Sstevel@tonic-gate 		/* Caller will update i_seq */
7590Sstevel@tonic-gate 		TRANS_INODE(ip->i_ufsvfs, ip);
7600Sstevel@tonic-gate 	}
7610Sstevel@tonic-gate 	/*
7620Sstevel@tonic-gate 	 * If there is a residual; adjust the EOF if necessary
7630Sstevel@tonic-gate 	 */
7640Sstevel@tonic-gate 	if (resid) {
7650Sstevel@tonic-gate 		if (size != ip->i_size) {
7660Sstevel@tonic-gate 			if (uio->uio_loffset > size)
7670Sstevel@tonic-gate 				size = uio->uio_loffset;
7680Sstevel@tonic-gate 			(void) ufs_itrunc(ip, size, 0, cr);
7690Sstevel@tonic-gate 		}
7700Sstevel@tonic-gate 	}
7710Sstevel@tonic-gate 
7720Sstevel@tonic-gate 	if (uio == &copy_uio)
7730Sstevel@tonic-gate 		kmem_free(copy_base, copy_resid);
7740Sstevel@tonic-gate 
7750Sstevel@tonic-gate 	return (error);
7760Sstevel@tonic-gate 
7770Sstevel@tonic-gate errout:
7780Sstevel@tonic-gate 	if (uio == &copy_uio)
7790Sstevel@tonic-gate 		kmem_free(copy_base, copy_resid);
7800Sstevel@tonic-gate 
7810Sstevel@tonic-gate 	return (0);
7820Sstevel@tonic-gate }
7830Sstevel@tonic-gate /*
7840Sstevel@tonic-gate  * Direct read of a hole
7850Sstevel@tonic-gate  */
7860Sstevel@tonic-gate static int
7870Sstevel@tonic-gate directio_hole(struct uio *uio, size_t nbytes)
7880Sstevel@tonic-gate {
7890Sstevel@tonic-gate 	int		error = 0, nzero;
7900Sstevel@tonic-gate 	uio_t		phys_uio;
7910Sstevel@tonic-gate 	iovec_t		phys_iov;
7920Sstevel@tonic-gate 
7931108Srshoaib 	ufs_directio_kstats.hole_reads.value.ui64++;
7941108Srshoaib 	ufs_directio_kstats.nread.value.ui64 += nbytes;
7950Sstevel@tonic-gate 
7960Sstevel@tonic-gate 	phys_iov.iov_base = uio->uio_iov->iov_base;
7970Sstevel@tonic-gate 	phys_iov.iov_len = nbytes;
7980Sstevel@tonic-gate 
7990Sstevel@tonic-gate 	phys_uio.uio_iov = &phys_iov;
8000Sstevel@tonic-gate 	phys_uio.uio_iovcnt = 1;
8010Sstevel@tonic-gate 	phys_uio.uio_resid = phys_iov.iov_len;
8020Sstevel@tonic-gate 	phys_uio.uio_segflg = uio->uio_segflg;
8030Sstevel@tonic-gate 	phys_uio.uio_extflg = uio->uio_extflg;
8040Sstevel@tonic-gate 	while (error == 0 && phys_uio.uio_resid) {
8050Sstevel@tonic-gate 		nzero = (int)MIN(phys_iov.iov_len, ufs_directio_zero_len);
8060Sstevel@tonic-gate 		error = uiomove(ufs_directio_zero_buf, nzero, UIO_READ,
807*4662Sfrankho 		    &phys_uio);
8080Sstevel@tonic-gate 	}
8090Sstevel@tonic-gate 	return (error);
8100Sstevel@tonic-gate }
8110Sstevel@tonic-gate 
/*
 * Direct Read
 *
 * Attempt to service the read described by 'uio' on inode 'ip' with
 * direct I/O: DMA from the device straight into the caller's buffers,
 * bypassing the page cache.  On return *statusp is DIRECTIO_SUCCESS if
 * the request was handled here (uio updated for the bytes transferred),
 * or DIRECTIO_FAILURE if the caller must fall back to the normal cached
 * read path.  Returns an errno-style error code.
 *
 * Entered with ip->i_contents held by the caller; the lock is
 * transiently upgraded to RW_WRITER below to invalidate cached pages.
 */
int
ufs_directio_read(struct inode *ip, uio_t *uio, cred_t *cr, int *statusp)
{
	ssize_t		resid, bytes_read;
	u_offset_t	size, uoff;
	int		error, newerror, len;
	size_t		nbytes;
	struct fs	*fs;
	vnode_t		*vp;
	daddr_t		bn;
	iovec_t		*iov;
	struct ufsvfs	*ufsvfsp = ip->i_ufsvfs;
	struct proc	*procp;
	struct as	*as;
	struct directio_buf	*tail;

	/*
	 * assume that directio isn't possible (normal case)
	 */
	*statusp = DIRECTIO_FAILURE;

	/*
	 * Don't go direct: direct I/O globally disabled.
	 */
	if (ufs_directio_enabled == 0)
		return (0);

	/*
	 * mapped file; nevermind -- pages may be shared with a mapping,
	 * so bypassing the page cache would break coherency.
	 */
	if (ip->i_mapcnt)
		return (0);

	/*
	 * CAN WE DO DIRECT IO?
	 */
	/*
	 * must be sector aligned: both file offset and total length.
	 */
	uoff = uio->uio_loffset;
	resid = uio->uio_resid;
	if ((uoff & (u_offset_t)(DEV_BSIZE - 1)) || (resid & (DEV_BSIZE - 1)))
		return (0);
	/*
	 * must be short aligned and sector aligned: every iovec's length
	 * must be a sector multiple and its base 2-byte aligned.
	 */
	iov = uio->uio_iov;
	nbytes = uio->uio_iovcnt;	/* nbytes reused as iovec counter */
	while (nbytes--) {
		if (((size_t)iov->iov_len & (DEV_BSIZE - 1)) != 0)
			return (0);
		if ((intptr_t)(iov++->iov_base) & 1)
			return (0);
	}

	/*
	 * DIRECTIO
	 */
	fs = ip->i_fs;

	/*
	 * don't read past EOF
	 */
	size = ip->i_size;

	/*
	 * The file offset is past EOF so bail out here; we don't want
	 * to update uio_resid and make it look like we read something.
	 * We say that direct I/O was a success to avoid having rdip()
	 * go through the same "read past EOF logic".
	 */
	if (uoff >= size) {
		*statusp = DIRECTIO_SUCCESS;
		return (0);
	}

	/*
	 * The read would extend past EOF so make it smaller.
	 */
	if ((uoff + resid) > size) {
		resid = size - uoff;
		/*
		 * recheck sector alignment of the trimmed length
		 */
		if (resid & (DEV_BSIZE - 1))
			return (0);
	}

	/*
	 * At this point, we know there is some real work to do.
	 */
	ASSERT(resid);

	/*
	 * get rid of cached pages: upgrade i_contents to writer and
	 * invalidate, so the direct read cannot return stale device
	 * data while dirty pages are still cached.  The flush itself is
	 * best-effort -- VOP_PUTPAGE's return value is ignored; the
	 * re-check of vn_has_cached_data() below is what gates us.
	 */
	vp = ITOV(ip);
	if (vn_has_cached_data(vp)) {
		rw_exit(&ip->i_contents);
		rw_enter(&ip->i_contents, RW_WRITER);
		(void) VOP_PUTPAGE(vp, (offset_t)0, (size_t)0, B_INVAL, cr);
		/*
		 * Pages survived the flush: fall back to the cached
		 * read path.
		 * NOTE(review): this return leaves i_contents held as
		 * WRITER although the function was entered with it held
		 * as reader; presumably the caller's eventual rw_exit()
		 * copes with either mode -- confirm against rdip().
		 */
		if (vn_has_cached_data(vp))
			return (0);
		rw_downgrade(&ip->i_contents);
		ufs_directio_kstats.nflushes.value.ui64++;
	}
	/*
	 * Direct Reads
	 */

	/*
	 * proc and as are for VM operations in directio_start()
	 */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		as = procp->p_as;
	} else {
		/* kernel-space buffer: lock pages in kas, no proc */
		procp = NULL;
		as = &kas;
	}

	*statusp = DIRECTIO_SUCCESS;
	error = 0;
	newerror = 0;
	bytes_read = 0;
	ufs_directio_kstats.logical_reads.value.ui64++;
	/*
	 * Outer loop: one iteration per locked-down cluster of user
	 * pages.  Inner loop: one directio_start() (or hole fill) per
	 * contiguous on-disk extent within that cluster.
	 */
	while (error == 0 && newerror == 0 && resid && uio->uio_iovcnt) {
		size_t pglck_len, pglck_size;
		caddr_t pglck_base;
		page_t **pplist, **spplist;

		tail = NULL;	/* head of this cluster's async I/O list */

		/*
		 * Adjust number of bytes to what this iovec can supply.
		 */
		iov = uio->uio_iov;
		pglck_len = (size_t)MIN(iov->iov_len, resid);
		pglck_base = iov->iov_base;
		if (pglck_len == 0) {
			/* iovec exhausted; advance to the next one */
			uio->uio_iov++;
			uio->uio_iovcnt--;
			continue;
		}

		/*
		 * Try to Lock down the largest chunk of pages possible,
		 * capped at the filesystem's I/O cluster size.
		 */
		pglck_len = (size_t)MIN(pglck_len,  ufsvfsp->vfs_ioclustsz);
		/*
		 * S_WRITE: although this is a file read, the device will
		 * be writing INTO these user pages.
		 */
		error = as_pagelock(as, &pplist, pglck_base,
		    pglck_len, S_WRITE);

		if (error)
			break;

		pglck_size = pglck_len;	/* remember for as_pageunlock() */
		while (pglck_len) {

			nbytes = pglck_len;
			uoff = uio->uio_loffset;

			/*
			 * Re-adjust number of bytes to contiguous range
			 */
			len = (ssize_t)blkroundup(fs, nbytes);
			error = bmap_read(ip, uoff, &bn, &len);
			if (error)
				break;

			if (bn == UFS_HOLE) {
				/*
				 * Hole: hand back zeros, but only up to
				 * the end of the current fs block.
				 */
				nbytes = (size_t)MIN(fs->fs_bsize -
				    (long)blkoff(fs, uoff), nbytes);
				error = directio_hole(uio, nbytes);
				/*
				 * Hole reads are not added to the list
				 * processed by directio_wait() below so
				 * account for bytes read here.
				 */
				if (!error)
					bytes_read += nbytes;
			} else {
				nbytes = (size_t)MIN(nbytes, len);

				/*
				 * Get the pagelist pointer for this offset
				 * to be passed to directio_start.
				 */
				if (pplist != NULL)
					spplist = pplist +
					    btop((uintptr_t)iov->iov_base -
					    ((uintptr_t)pglck_base & PAGEMASK));
				else
					spplist = NULL;

				/*
				 * Kick off the direct read requests
				 * (asynchronous; collected via 'tail').
				 */
				directio_start(ufsvfsp, ip->i_dev, nbytes,
				    ldbtob(bn), iov->iov_base,
				    S_WRITE, procp, &tail, spplist);
			}

			if (error)
				break;

			/*
			 * Adjust pointers and counters
			 */
			iov->iov_len -= nbytes;
			iov->iov_base += nbytes;
			uio->uio_loffset += nbytes;
			resid -= nbytes;
			pglck_len -= nbytes;
		}

		/*
		 * Wait for outstanding requests; directio_wait() also
		 * accumulates successfully transferred bytes into
		 * bytes_read and returns the first I/O error, if any.
		 */
		newerror = directio_wait(tail, &bytes_read);
		/*
		 * Release VM resources locked above (must happen even on
		 * error, hence outside the inner loop).
		 */
		as_pageunlock(as, pplist, pglck_base, pglck_size, S_WRITE);

	}

	/*
	 * If error, adjust resid to begin at the first
	 * un-read byte.
	 */
	if (error == 0)
		error = newerror;
	uio->uio_resid -= bytes_read;
	return (error);
}
10490Sstevel@tonic-gate }
1050