10Sstevel@tonic-gate /*
20Sstevel@tonic-gate * CDDL HEADER START
30Sstevel@tonic-gate *
40Sstevel@tonic-gate * The contents of this file are subject to the terms of the
5*12173SMichael.Corcoran@Sun.COM * Common Development and Distribution License (the "License").
6*12173SMichael.Corcoran@Sun.COM * You may not use this file except in compliance with the License.
70Sstevel@tonic-gate *
80Sstevel@tonic-gate * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
90Sstevel@tonic-gate * or http://www.opensolaris.org/os/licensing.
100Sstevel@tonic-gate * See the License for the specific language governing permissions
110Sstevel@tonic-gate * and limitations under the License.
120Sstevel@tonic-gate *
130Sstevel@tonic-gate * When distributing Covered Code, include this CDDL HEADER in each
140Sstevel@tonic-gate * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
150Sstevel@tonic-gate * If applicable, add the following below this CDDL HEADER, with the
160Sstevel@tonic-gate * fields enclosed by brackets "[]" replaced with your own identifying
170Sstevel@tonic-gate * information: Portions Copyright [yyyy] [name of copyright owner]
180Sstevel@tonic-gate *
190Sstevel@tonic-gate * CDDL HEADER END
200Sstevel@tonic-gate */
210Sstevel@tonic-gate /*
22*12173SMichael.Corcoran@Sun.COM * Copyright (c) 1986, 2010, Oracle and/or its affiliates. All rights reserved.
230Sstevel@tonic-gate */
240Sstevel@tonic-gate
250Sstevel@tonic-gate /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
260Sstevel@tonic-gate /* All Rights Reserved */
270Sstevel@tonic-gate
280Sstevel@tonic-gate /*
290Sstevel@tonic-gate * University Copyright- Copyright (c) 1982, 1986, 1988
300Sstevel@tonic-gate * The Regents of the University of California
310Sstevel@tonic-gate * All Rights Reserved
320Sstevel@tonic-gate *
330Sstevel@tonic-gate * University Acknowledgment- Portions of this document are derived from
340Sstevel@tonic-gate * software developed by the University of California, Berkeley, and its
350Sstevel@tonic-gate * contributors.
360Sstevel@tonic-gate */
370Sstevel@tonic-gate
380Sstevel@tonic-gate #include <sys/types.h>
390Sstevel@tonic-gate #include <sys/t_lock.h>
400Sstevel@tonic-gate #include <sys/param.h>
410Sstevel@tonic-gate #include <sys/errno.h>
420Sstevel@tonic-gate #include <sys/debug.h>
430Sstevel@tonic-gate #include <sys/cmn_err.h>
440Sstevel@tonic-gate #include <sys/kmem.h>
450Sstevel@tonic-gate #include <sys/sysmacros.h>
460Sstevel@tonic-gate #include <sys/inline.h>
470Sstevel@tonic-gate #include <sys/buf.h>
480Sstevel@tonic-gate #include <sys/uio.h>
490Sstevel@tonic-gate #include <sys/user.h>
500Sstevel@tonic-gate #include <sys/proc.h>
510Sstevel@tonic-gate #include <sys/systm.h>
520Sstevel@tonic-gate #include <sys/vmsystm.h>
530Sstevel@tonic-gate #include <sys/cpuvar.h>
540Sstevel@tonic-gate #include <sys/mman.h>
550Sstevel@tonic-gate #include <sys/cred.h>
560Sstevel@tonic-gate #include <sys/vnode.h>
570Sstevel@tonic-gate #include <sys/file.h>
580Sstevel@tonic-gate #include <sys/vm.h>
590Sstevel@tonic-gate
600Sstevel@tonic-gate #include <sys/swap.h>
610Sstevel@tonic-gate #include <sys/vtrace.h>
620Sstevel@tonic-gate #include <sys/tnf_probe.h>
630Sstevel@tonic-gate #include <sys/fs/snode.h>
640Sstevel@tonic-gate #include <sys/copyops.h>
650Sstevel@tonic-gate #include <sys/conf.h>
660Sstevel@tonic-gate #include <sys/sdt.h>
670Sstevel@tonic-gate
680Sstevel@tonic-gate #include <vm/anon.h>
690Sstevel@tonic-gate #include <vm/hat.h>
700Sstevel@tonic-gate #include <vm/as.h>
710Sstevel@tonic-gate #include <vm/seg.h>
720Sstevel@tonic-gate #include <vm/page.h>
730Sstevel@tonic-gate #include <vm/seg_vn.h>
740Sstevel@tonic-gate #include <vm/seg_kmem.h>
750Sstevel@tonic-gate
760Sstevel@tonic-gate extern int maxphys;
770Sstevel@tonic-gate
780Sstevel@tonic-gate void
minphys(struct buf * bp)790Sstevel@tonic-gate minphys(struct buf *bp)
800Sstevel@tonic-gate {
810Sstevel@tonic-gate if (bp->b_bcount > maxphys)
820Sstevel@tonic-gate bp->b_bcount = maxphys;
830Sstevel@tonic-gate }
840Sstevel@tonic-gate
/*
 * Use kmem_cache_create for physio buffers.  This has shown
 * a better cache distribution compared to buffers on the
 * stack.  It also avoids semaphore construction/destruction
 * per request.
 */
910Sstevel@tonic-gate
920Sstevel@tonic-gate static struct kmem_cache *physio_buf_cache;
930Sstevel@tonic-gate
/*
 * kmem cache constructor: set up the buf's embedded state (bioinit)
 * once at cache-fill time instead of on every allocation.
 */
/* ARGSUSED */
static int
physio_buf_constructor(void *buf, void *cdrarg, int kmflags)
{
	struct buf *bp = buf;

	bioinit(bp);
	return (0);
}
1010Sstevel@tonic-gate
/*
 * kmem cache destructor: tear down what physio_buf_constructor set up.
 */
/* ARGSUSED */
static void
physio_buf_destructor(void *buf, void *cdrarg)
{
	struct buf *bp = buf;

	biofini(bp);
}
1080Sstevel@tonic-gate
1090Sstevel@tonic-gate void
physio_bufs_init(void)1100Sstevel@tonic-gate physio_bufs_init(void)
1110Sstevel@tonic-gate {
1120Sstevel@tonic-gate physio_buf_cache = kmem_cache_create("physio_buf_cache",
113*12173SMichael.Corcoran@Sun.COM sizeof (struct buf), 0, physio_buf_constructor,
114*12173SMichael.Corcoran@Sun.COM physio_buf_destructor, NULL, NULL, NULL, 0);
1150Sstevel@tonic-gate }
1160Sstevel@tonic-gate
1170Sstevel@tonic-gate
1180Sstevel@tonic-gate
1190Sstevel@tonic-gate /*
1200Sstevel@tonic-gate * initiate raw I/O request
1210Sstevel@tonic-gate *
1220Sstevel@tonic-gate * allocate buf header if necessary
1230Sstevel@tonic-gate * adjust max size of each I/O request
1240Sstevel@tonic-gate * lock down user pages and verify access protections
1250Sstevel@tonic-gate * call driver's strategy routine to submit request
1260Sstevel@tonic-gate * wait for I/O completion
1270Sstevel@tonic-gate * unlock user pages and free allocated buf header
1280Sstevel@tonic-gate */
1290Sstevel@tonic-gate
int
default_physio(int (*strat)(struct buf *), struct buf *bp, dev_t dev,
	int rw, void (*mincnt)(struct buf *), struct uio *uio)
{
	struct iovec *iov;
	struct proc *procp;
	struct as *asp;
	ssize_t c;		/* byte count of the current chunk */
	char *a;		/* user/kernel address of the current chunk */
	int error = 0;
	page_t **pplist;	/* shadow page list from as_pagelock */
	int allocbuf = 0;	/* nonzero iff we allocated bp ourselves */

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_START, "physio_start: bp %p", bp);

	/* Kernel probe */
	TNF_PROBE_4(physio_start, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev,
	    tnf_offset, offset, uio->uio_loffset,
	    tnf_size, size, uio->uio_resid,
	    tnf_bioflags, rw, rw);

	if (rw == B_READ) {
		CPU_STATS_ADD_K(sys, phread, 1);
	} else {
		CPU_STATS_ADD_K(sys, phwrite, 1);
	}

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_START,
	    "getbuf_start: bp %p", bp);

	/* Allocate a buf header from the cache if the caller didn't pass one. */
	if (bp == NULL) {
		bp = kmem_cache_alloc(physio_buf_cache, KM_SLEEP);
		bp->b_iodone = NULL;
		bp->b_resid = 0;
		allocbuf = 1;
	}
	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_GETBUF_END, "getbuf_end: bp %p", bp);

	/* Pick the address space the I/O buffer pages live in. */
	if (uio->uio_segflg == UIO_USERSPACE) {
		procp = ttoproc(curthread);
		asp = procp->p_as;
	} else {
		procp = NULL;
		asp = &kas;
	}
	ASSERT(SEMA_HELD(&bp->b_sem));

	/*
	 * We need to prepare this buffer for the io:::start probe, including
	 * NULL'ing out the file, clearing the offset, and filling in the
	 * b_dip field.
	 */
	bp->b_file = NULL;
	bp->b_offset = -1;

	if (dev != NODEV) {
		(void) devopsp[getmajor(dev)]->devo_getinfo(NULL,
		    DDI_INFO_DEVT2DEVINFO, (void *)dev, (void **)&bp->b_dip);
	} else {
		bp->b_dip = NULL;
	}

	/* Outer loop: one pass per iovec in the uio. */
	while (uio->uio_iovcnt > 0) {
		iov = uio->uio_iov;

		bp->b_error = 0;
		bp->b_proc = procp;

		/* Inner loop: one pass per mincnt-limited chunk of the iovec. */
		while (iov->iov_len > 0) {
			if (uio->uio_resid == 0)
				break;
			if (uio->uio_loffset < 0) {
				error = EINVAL;
				break;
			}
#ifdef	_ILP32
			/*
			 * For 32-bit kernels, check against SPEC_MAXOFFSET_T
			 * which represents the maximum size that can be
			 * supported by the IO subsystem.
			 * XXX this code assumes a D_64BIT driver.
			 */
			if (uio->uio_loffset > SPEC_MAXOFFSET_T) {
				error = EINVAL;
				break;
			}
#endif	/* _ILP32 */
			bp->b_flags = B_BUSY | B_PHYS | rw;
			bp->b_edev = dev;
			bp->b_lblkno = btodt(uio->uio_loffset);

			/*
			 * Don't count on b_addr remaining untouched by the
			 * code below (it may be reset because someone does
			 * a bp_mapin on the buffer) -- reset from the iov
			 * each time through, updating the iov's base address
			 * instead.
			 */
			a = bp->b_un.b_addr = iov->iov_base;
			bp->b_bcount = MIN(iov->iov_len, uio->uio_resid);
			/* Let the driver clamp the transfer size. */
			(*mincnt)(bp);
			c = bp->b_bcount;

			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_START,
			    "as_pagelock_start: bp %p", bp);

			/*
			 * Lock the pages down.  A read from the device writes
			 * into memory, hence S_WRITE for B_READ and vice versa.
			 */
			error = as_pagelock(asp, &pplist, a,
			    c, rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_LOCK_END,
			    "as_pagelock_end:");

			if (error != 0) {
				bp->b_flags |= B_ERROR;
				bp->b_error = error;
				bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS);
				break;
			}
			bp->b_shadow = pplist;
			if (pplist != NULL) {
				bp->b_flags |= B_SHADOW;
			}

			DTRACE_IO1(start, struct buf *, bp);
			bp->b_flags |= B_STARTED;

			/* Submit the request and wait for completion. */
			(void) (*strat)(bp);
			error = biowait(bp);

			/*
			 * unlock the pages
			 */
			TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_START,
			    "as_pageunlock_start: bp %p", bp);

			as_pageunlock(asp, pplist, a, c,
			    rw == B_READ? S_WRITE : S_READ);

			TRACE_0(TR_FAC_PHYSIO, TR_PHYSIO_UNLOCK_END,
			    "as_pageunlock_end:");

			/* Advance the uio by the bytes actually transferred. */
			c -= bp->b_resid;
			iov->iov_base += c;
			iov->iov_len -= c;
			uio->uio_resid -= c;
			uio->uio_loffset += c;
			/* bp->b_resid - temp kludge for tape drives */
			if (bp->b_resid || error)
				break;
		}
		bp->b_flags &= ~(B_BUSY|B_WANTED|B_PHYS|B_SHADOW);
		/* bp->b_resid - temp kludge for tape drives */
		if (bp->b_resid || error)
			break;
		uio->uio_iov++;
		uio->uio_iovcnt--;
	}

	if (allocbuf) {
		kmem_cache_free(physio_buf_cache, bp);
	}

	/* Kernel probe */
	TNF_PROBE_1(physio_end, "io rawio", /* CSTYLED */,
	    tnf_device, device, dev);

	TRACE_1(TR_FAC_PHYSIO, TR_PHYSIO_END, "physio_end: bp %p", bp);

	return (error);
}
3010Sstevel@tonic-gate
3020Sstevel@tonic-gate /*
3030Sstevel@tonic-gate * Returns 0 on success, or an error on failure.
3040Sstevel@tonic-gate *
3050Sstevel@tonic-gate * This function is no longer a part of the DDI/DKI.
3060Sstevel@tonic-gate * However, for compatibility, its interface should not
3070Sstevel@tonic-gate * be changed and it should not be removed from the kernel.
3080Sstevel@tonic-gate */
3090Sstevel@tonic-gate int
useracc(void * addr,size_t count,int access)3100Sstevel@tonic-gate useracc(void *addr, size_t count, int access)
3110Sstevel@tonic-gate {
3120Sstevel@tonic-gate uint_t prot;
3130Sstevel@tonic-gate
3140Sstevel@tonic-gate prot = PROT_USER | ((access == B_READ) ? PROT_READ : PROT_WRITE);
3150Sstevel@tonic-gate return (as_checkprot(ttoproc(curthread)->p_as, addr, count, prot));
3160Sstevel@tonic-gate }
3170Sstevel@tonic-gate
3180Sstevel@tonic-gate #define MAX_MAPIN_PAGES 8
3190Sstevel@tonic-gate
3200Sstevel@tonic-gate /*
3210Sstevel@tonic-gate * This function temporarily "borrows" user pages for kernel use. If
3220Sstevel@tonic-gate * "cow" is on, it also sets up copy-on-write protection (only feasible
3230Sstevel@tonic-gate * on MAP_PRIVATE segment) on the user mappings, to protect the borrowed
3240Sstevel@tonic-gate * pages from any changes by the user. The caller is responsible for
3250Sstevel@tonic-gate * unlocking and tearing down cow settings when it's done with the pages.
3260Sstevel@tonic-gate * For an example, see kcfree().
3270Sstevel@tonic-gate *
3280Sstevel@tonic-gate * Pages behind [uaddr..uaddr+*lenp] under address space "as" are locked
3290Sstevel@tonic-gate * (shared), and mapped into kernel address range [kaddr..kaddr+*lenp] if
3300Sstevel@tonic-gate * kaddr != -1. On entering this function, cached_ppp contains a list
3310Sstevel@tonic-gate * of pages that are mapped into [kaddr..kaddr+*lenp] already (from a
3320Sstevel@tonic-gate * previous call). Thus if same pages remain behind [uaddr..uaddr+*lenp],
3330Sstevel@tonic-gate * the kernel map won't need to be reloaded again.
3340Sstevel@tonic-gate *
3350Sstevel@tonic-gate * For cow == 1, if the pages are anonymous pages, it also bumps the anon
3360Sstevel@tonic-gate * reference count, and change the user-mapping to read-only. This
3370Sstevel@tonic-gate * scheme should work on all types of segment drivers. But to be safe,
3380Sstevel@tonic-gate * we check against segvn here.
3390Sstevel@tonic-gate *
3400Sstevel@tonic-gate * Since this function is used to emulate copyin() semantic, it checks
3410Sstevel@tonic-gate * to make sure the user-mappings allow "user-read".
3420Sstevel@tonic-gate *
3430Sstevel@tonic-gate * On exit "lenp" contains the number of bytes successfully locked and
3440Sstevel@tonic-gate * mapped in. For the unsuccessful ones, the caller can fall back to
3450Sstevel@tonic-gate * copyin().
3460Sstevel@tonic-gate *
3470Sstevel@tonic-gate * Error return:
3480Sstevel@tonic-gate * ENOTSUP - operation like this is not supported either on this segment
3490Sstevel@tonic-gate * type, or on this platform type.
3500Sstevel@tonic-gate */
int
cow_mapin(struct as *as, caddr_t uaddr, caddr_t kaddr, struct page **cached_ppp,
	struct anon **app, size_t *lenp, int cow)
{
	struct hat *hat;
	struct seg *seg;
	caddr_t base;
	page_t *pp, *ppp[MAX_MAPIN_PAGES];
	long i;
	int flags;
	size_t size, total = *lenp;	/* total bytes requested by caller */
	char first = 1;			/* allow exactly one fault-in retry */
	faultcode_t res;

	*lenp = 0;
	if (cow) {
		/* Writer lock held across the whole COW setup; see below. */
		AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		seg = as_findseg(as, uaddr, 0);
		/* The entire range must fall within a single segment. */
		if ((seg == NULL) || ((base = seg->s_base) > uaddr) ||
		    (uaddr + total) > base + seg->s_size) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (EINVAL);
		}
		/*
		 * The COW scheme should work for all segment types.
		 * But to be safe, we check against segvn.
		 */
		if (seg->s_ops != &segvn_ops) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		} else if ((SEGOP_GETTYPE(seg, uaddr) & MAP_PRIVATE) == 0) {
			AS_LOCK_EXIT(as, &as->a_lock);
			return (ENOTSUP);
		}
	}
	hat = as->a_hat;
	size = total;
tryagain:
	/*
	 * If (cow), hat_softlock will also change the usr protection to RO.
	 * This is the first step toward setting up cow.  Before we
	 * bump up an_refcnt, we can't allow any cow-fault on this
	 * address.  Otherwise segvn_fault will change the protection back
	 * to RW upon seeing an_refcnt == 1.
	 * The solution is to hold the writer lock on "as".
	 */
	res = hat_softlock(hat, uaddr, &size, &ppp[0], cow ? HAT_COW : 0);
	/*
	 * NOTE(review): this arithmetic implies hat_softlock leaves the
	 * residual (un-locked) byte count in "size" — confirm against
	 * the hat_softlock implementation.
	 */
	size = total - size;
	*lenp += size;
	size = size >> PAGESHIFT;	/* now a page count */
	i = 0;
	while (i < size) {
		pp = ppp[i];
		if (cow) {
			kmutex_t *ahm;
			/*
			 * Another solution is to hold SE_EXCL on pp, and
			 * disable PROT_WRITE.  This also works for MAP_SHARED
			 * segment.  The disadvantage is that it locks the
			 * page from being used by anybody else.
			 */
			ahm = AH_MUTEX(pp->p_vnode, pp->p_offset);
			mutex_enter(ahm);
			*app = swap_anon(pp->p_vnode, pp->p_offset);
			/*
			 * Since we are holding the as lock, this avoids a
			 * potential race with anon_decref. (segvn_unmap and
			 * segvn_free needs the as writer lock to do anon_free.)
			 */
			if (*app != NULL) {
#if 0
				if ((*app)->an_refcnt == 0)
				/*
				 * Consider the following scenario (unlikely
				 * though):
				 * 1. an_refcnt == 2
				 * 2. we softlock the page.
				 * 3. cow occurs on this addr. So a new ap,
				 * page and mapping is established on addr.
				 * 4. an_refcnt drops to 1 (segvn_faultpage
				 * -> anon_decref(oldap))
				 * 5. the last ref to ap also drops (from
				 * another as). It ends up blocked inside
				 * anon_decref trying to get page's excl lock.
				 * 6. Later kcfree unlocks the page, call
				 * anon_decref -> oops, ap is gone already.
				 *
				 * Holding as writer lock solves all problems.
				 */
					*app = NULL;
				else
#endif
					(*app)->an_refcnt++;
			}
			mutex_exit(ahm);
		} else {
			*app = NULL;
		}
		/* kaddr == (caddr_t)-1 means "lock only, don't map in". */
		if (kaddr != (caddr_t)-1) {
			/* Reload the kernel mapping only if the page changed. */
			if (pp != *cached_ppp) {
				if (*cached_ppp == NULL)
					flags = HAT_LOAD_LOCK | HAT_NOSYNC |
					    HAT_LOAD_NOCONSIST;
				else
					flags = HAT_LOAD_REMAP |
					    HAT_LOAD_NOCONSIST;
				/*
				 * In order to cache the kernel mapping after
				 * the user page is unlocked, we call
				 * hat_devload instead of hat_memload so
				 * that the kernel mapping we set up here is
				 * "invisible" to the rest of the world. This
				 * is not very pretty. But as long as the
				 * caller bears the responsibility of keeping
				 * cache consistency, we should be ok -
				 * HAT_NOCONSIST will get us a uncached
				 * mapping on VAC. hat_softlock will flush
				 * a VAC_WRITEBACK cache. Therefore the kaddr
				 * doesn't have to be of the same vcolor as
				 * uaddr.
				 * The alternative is - change hat_devload
				 * to get a cached mapping. Allocate a kaddr
				 * with the same vcolor as uaddr. Then
				 * hat_softlock won't need to flush the VAC.
				 */
				hat_devload(kas.a_hat, kaddr, PAGESIZE,
				    page_pptonum(pp), PROT_READ, flags);
				*cached_ppp = pp;
			}
			kaddr += PAGESIZE;
		}
		cached_ppp++;
		app++;
		++i;
	}
	if (cow) {
		AS_LOCK_EXIT(as, &as->a_lock);
	}
	if (first && res == FC_NOMAP) {
		/*
		 * If the address is not mapped yet, we call as_fault to
		 * fault the pages in. We could've fallen back to copy and
		 * let it fault in the pages. But for a mapped file, we
		 * normally reference each page only once. For zero-copy to
		 * be of any use, we'd better fault in the page now and try
		 * again.
		 */
		first = 0;
		/* Skip past what we already locked; retry the remainder. */
		size = size << PAGESHIFT;
		uaddr += size;
		total -= size;
		size = total;
		res = as_fault(as->a_hat, as, uaddr, size, F_INVAL, S_READ);
		if (cow)
			AS_LOCK_ENTER(as, &as->a_lock, RW_WRITER);
		goto tryagain;
	}
	switch (res) {
	case FC_NOSUPPORT:
		return (ENOTSUP);
	case FC_PROT:	/* Pretend we don't know about it. This will be */
			/* caught by the caller when uiomove fails. */
	case FC_NOMAP:
	case FC_OBJERR:
	default:
		return (0);
	}
}
519